openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Advanced regression: NLS, Beta, ZIP/ZINB, Hurdle, SUR."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import numpy as np
|
|
5
|
+
import polars as pl
|
|
6
|
+
from scipy import stats as sp_stats
|
|
7
|
+
from scipy.optimize import least_squares
|
|
8
|
+
import statsmodels.api as sm
|
|
9
|
+
import statsmodels.formula.api as smf
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# ── NLS ───────────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
def fit_nls(df: pl.DataFrame, dep: str, indeps: list[str],
            formula_fn, p0: list[float], *, robust: bool = False) -> dict:
    """Nonlinear Least Squares via scipy.optimize.least_squares.

    Parameters
    ----------
    df : frame containing the dependent and independent columns.
    dep : name of the dependent variable.
    indeps : names of the independent variables (columns of X).
    formula_fn : callable(X, *params) -> y_pred where X is ndarray (n, k).
    p0 : initial parameter guesses.
    robust : if True, report heteroskedasticity-robust (HC0 sandwich)
        standard errors; previously this flag was accepted but ignored.

    Returns
    -------
    dict with point estimates, standard errors, R-squared and convergence info.
    """
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X = sub.select(indeps).to_numpy().astype(float)

    def residuals(params):
        return formula_fn(X, *params) - y

    result = least_squares(residuals, p0, method='lm')
    y_pred = formula_fn(X, *result.x)
    resid = y - y_pred
    ss_res = np.sum(resid**2)
    ss_tot = np.sum((y - y.mean())**2)
    r2 = float(1 - ss_res/ss_tot) if ss_tot > 0 else float('nan')

    # Approximate std errors from the Jacobian at the solution.
    try:
        J = result.jac
        bread = np.linalg.inv(J.T @ J)
        if robust:
            # HC0 sandwich: (J'J)^-1 J' diag(e_i^2) J (J'J)^-1
            meat = J.T @ (resid[:, None]**2 * J)
            cov = bread @ meat @ bread
        else:
            # Classical estimator: sigma^2 (J'J)^-1 with dof correction.
            cov = bread * (ss_res / max(len(y) - len(p0), 1))
        se = np.sqrt(np.diag(cov))
    except Exception:
        # Singular Jacobian etc. — report NaN rather than fail the fit.
        se = np.full(len(p0), float('nan'))

    params_dict = {f"p{i}": float(v) for i, v in enumerate(result.x)}
    se_dict = {f"p{i}": float(v) for i, v in enumerate(se)}

    return {
        "method": "NLS",
        "dep": dep,
        "indeps": indeps,
        "params": params_dict,
        "std_errors": se_dict,
        "r_squared": r2,
        "n_obs": len(y),
        "converged": result.success,
        "cost": float(result.cost),
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── Beta regression ───────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
def fit_betareg(df: pl.DataFrame, dep: str, indeps: list[str],
                *, link: str = 'logit') -> dict:
    """Beta regression for (0,1) bounded outcomes via statsmodels GLM.

    Approximated as a GLM with Binomial family on the continuous proportion.

    link : 'logit' (default), 'probit', 'cloglog', or 'log'.
        Previously this argument was accepted but ignored (always logit).

    Raises
    ------
    ValueError for an unsupported link; RuntimeError if the fit fails.
    """
    sub = df.select([dep] + indeps).drop_nulls()

    pdf = sub.to_pandas()
    pdf.columns = ["dep"] + [f"x{i}" for i in range(len(indeps))]
    # Clamp the outcome away from the {0, 1} boundary so the quasi-likelihood
    # stays finite.  (The original computed this clip on a throwaway array and
    # fit on the unclipped data — bug fix.)
    eps = 1e-6
    pdf["dep"] = pdf["dep"].astype(float).clip(eps, 1 - eps)
    formula = "dep ~ " + " + ".join(f"x{i}" for i in range(len(indeps)))

    link_map = {
        'logit': sm.families.links.Logit,
        'probit': sm.families.links.Probit,
        'cloglog': sm.families.links.CLogLog,
        'log': sm.families.links.Log,
    }
    try:
        link_cls = link_map[link.lower()]
    except KeyError:
        raise ValueError(f"Unsupported link: {link!r}") from None

    try:
        model = smf.glm(formula, data=pdf,
                        family=sm.families.Binomial(link=link_cls())).fit()
        params = dict(zip(["_cons"] + indeps, model.params.tolist()))
        se = dict(zip(["_cons"] + indeps, model.bse.tolist()))
        pvals = dict(zip(["_cons"] + indeps, model.pvalues.tolist()))
        return {
            "method": "Beta Regression (GLM-Binomial)",
            "dep": dep, "indeps": indeps,
            "params": params, "std_errors": se, "p_values": pvals,
            "aic": float(model.aic), "bic": float(model.bic),
            "n_obs": int(model.nobs), "pseudo_r2": float(1 - model.llf/model.llnull),
            "_result": model,
        }
    except Exception as exc:
        raise RuntimeError(f"Beta regression failed: {exc}") from exc
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ── Zero-inflated Poisson ─────────────────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
def fit_zip(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Zero-Inflated Poisson regression (scipy L-BFGS-B).

    Maximizes the ZIP log-likelihood directly; standard errors are not
    derived (reported as NaN).

    Raises RuntimeError if optimization fails.
    """
    from scipy.optimize import minimize
    from scipy.special import expit, gammaln

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape
    X = np.column_stack([np.ones(n), X_raw])
    kp = k + 1

    # log(y!) is loop-invariant: precompute it once via gammaln(y+1) instead
    # of rebuilding a Python list on every objective evaluation (the original
    # recomputed sum(log(1..y_i)) per call — hot-loop bug).
    log_fact = gammaln(y + 1)

    def neg_ll(params):
        gamma = params[:kp]  # inflate logit params
        beta = params[kp:]   # Poisson mean params
        pi = expit(X @ gamma)
        lam = np.clip(np.exp(X @ beta), 1e-10, 1e10)
        # Zeros can come from inflation or from the Poisson itself.
        ll_zero = np.log(pi + (1 - pi) * np.exp(-lam) + 1e-300)
        ll_pos = (np.log(1 - pi + 1e-300)
                  + y * np.log(lam + 1e-300) - lam - log_fact)
        ll = np.where(y == 0, ll_zero, ll_pos)
        return -ll.sum()

    try:
        x0 = np.zeros(2 * kp)
        res = minimize(neg_ll, x0, method="L-BFGS-B", options={"maxiter": 500})
        params_hat = res.x
        llf = -res.fun
        aic = 2 * len(params_hat) - 2 * llf
        bic = len(params_hat) * np.log(n) - 2 * llf
        names_inflate = [f"inflate_{p}" for p in ["_cons"] + indeps]
        names_count = [f"count_{p}" for p in ["_cons"] + indeps]
        all_names = names_inflate + names_count
        params_dict = {nm: float(v) for nm, v in zip(all_names, params_hat)}
        return {
            "method": "Zero-Inflated Poisson",
            "dep": dep, "indeps": indeps,
            "params": params_dict, "std_errors": {k: float("nan") for k in params_dict},
            "p_values": {k: float("nan") for k in params_dict},
            "aic": float(aic), "bic": float(bic),
            "log_likelihood": float(llf), "n_obs": n,
        }
    except Exception as exc:
        raise RuntimeError(f"ZIP failed: {exc}") from exc
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ── Zero-inflated Negative Binomial ───────────────────────────────────────
|
|
146
|
+
|
|
147
|
+
def fit_zinb(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Zero-Inflated Negative Binomial regression (scipy L-BFGS-B)."""
    from scipy.optimize import minimize
    from scipy.special import expit, gammaln

    data = df.select([dep] + indeps).drop_nulls()
    y = data[dep].to_numpy().astype(float)
    predictors = data.select(indeps).to_numpy().astype(float)
    n_obs, n_vars = predictors.shape
    design = np.column_stack([np.ones(n_obs), predictors])
    n_coef = n_vars + 1

    def neg_ll(theta):
        # Parameter layout: [inflation coefs | count coefs | log-dispersion].
        infl_coef = theta[:n_coef]
        count_coef = theta[n_coef:2 * n_coef]
        r = np.exp(theta[2 * n_coef])  # dispersion kept positive via log param
        pi = expit(design @ infl_coef)
        mu = np.exp(design @ count_coef)
        mu = np.clip(mu, 1e-10, 1e10)
        p_nb = r / (r + mu)
        # NB probability of observing zero, on the log scale.
        log_p0_nb = r * np.log(p_nb + 1e-300)
        zero_ll = np.log(pi + (1 - pi) * np.exp(log_p0_nb) + 1e-300)
        pos_ll = (np.log(1 - pi + 1e-300)
                  + gammaln(y + r) - gammaln(r) - gammaln(y + 1)
                  + r * np.log(p_nb + 1e-300) + y * np.log(1 - p_nb + 1e-300))
        return -np.where(y == 0, zero_ll, pos_ll).sum()

    try:
        start = np.zeros(2 * n_coef + 1)
        fit = minimize(neg_ll, start, method="L-BFGS-B", options={"maxiter": 500})
        estimates = fit.x
        llf = -fit.fun
        n_params = len(estimates)
        aic = 2 * n_params - 2 * llf
        bic = n_params * np.log(n_obs) - 2 * llf
        labels = ([f"inflate_{p}" for p in ["_cons"] + indeps]
                  + [f"count_{p}" for p in ["_cons"] + indeps]
                  + ["log_dispersion"])
        params_dict = {name: float(val) for name, val in zip(labels, estimates)}
        nan_by_name = {name: float("nan") for name in params_dict}
        return {
            "method": "Zero-Inflated Negative Binomial",
            "dep": dep, "indeps": indeps,
            "params": params_dict, "std_errors": nan_by_name,
            "p_values": dict(nan_by_name),
            "aic": float(aic), "bic": float(bic),
            "log_likelihood": float(llf), "n_obs": n_obs,
        }
    except Exception as exc:
        raise RuntimeError(f"ZINB failed: {exc}") from exc
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# ── Hurdle model ──────────────────────────────────────────────────────────
|
|
200
|
+
|
|
201
|
+
def fit_hurdle(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Two-part hurdle model: a Logit for zero vs. positive, then a
    Poisson fit on the positive observations only."""
    data = df.select([dep] + indeps).drop_nulls()
    outcome = data[dep].to_numpy().astype(float)
    design = sm.add_constant(data.select(indeps).to_numpy().astype(float))

    # Stage 1: does the observation clear the hurdle (outcome > 0)?
    is_positive = outcome > 0
    zero_stage = sm.Logit(is_positive.astype(float), design).fit(disp=0)

    # Stage 2: count process, positives only.
    count_stage = sm.Poisson(outcome[is_positive],
                             design[is_positive]).fit(disp=0)

    labels = ["_cons"] + indeps
    return {
        "method": "Hurdle (Logit + Poisson)",
        "dep": dep, "indeps": indeps,
        "n_obs": len(outcome), "n_zeros": int((outcome == 0).sum()), "n_positive": int(is_positive.sum()),
        "logit_params": dict(zip(labels, zero_stage.params.tolist())),
        "logit_pvalues": dict(zip(labels, zero_stage.pvalues.tolist())),
        "count_params": dict(zip(labels, count_stage.params.tolist())),
        "count_pvalues": dict(zip(labels, count_stage.pvalues.tolist())),
        "aic_logit": float(zero_stage.aic),
        "aic_count": float(count_stage.aic),
        "_logit": zero_stage, "_count": count_stage,
    }
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ── SUR ───────────────────────────────────────────────────────────────────
|
|
234
|
+
|
|
235
|
+
def fit_sur(df: pl.DataFrame, equations: list[tuple[str, list[str]]]) -> dict:
    """Seemingly Unrelated Regression, estimated equation-by-equation OLS.

    No FGLS iteration is performed (the previous docstring claimed one);
    each equation is fit independently and the cross-equation residual
    covariance/correlation is reported so the user can judge whether joint
    estimation would matter.

    equations : list of (dep, indeps) pairs, one per equation.
    """
    results = []
    residuals = []

    for dep, indeps in equations:
        sub = df.select([dep] + indeps).drop_nulls()
        y = sub[dep].to_numpy().astype(float)
        X = sm.add_constant(sub.select(indeps).to_numpy().astype(float))
        model = sm.OLS(y, X).fit()
        results.append(model)
        residuals.append(model.resid)

    # Cross-equation covariance (Sigma).  Equations may drop different null
    # rows, so residual vectors are truncated to a common length; this is an
    # approximation when the samples are misaligned.
    min_n = min(len(r) for r in residuals)
    resid_mat = np.column_stack([r[:min_n] for r in residuals])
    Sigma = (resid_mat.T @ resid_mat) / min_n

    equations_out = []
    for i, ((dep, indeps), res) in enumerate(zip(equations, results)):
        equations_out.append({
            "equation": i + 1,
            "dep": dep, "indeps": indeps,
            "params": dict(zip(["_cons"] + indeps, res.params.tolist())),
            "std_errors": dict(zip(["_cons"] + indeps, res.bse.tolist())),
            "r_squared": float(res.rsquared),
            "n_obs": int(res.nobs),
        })

    return {
        "method": "SUR (OLS-based)",
        "n_equations": len(equations),
        "equations": equations_out,
        # Sigma used to be computed and then discarded — expose it.
        "residual_cov": Sigma.tolist(),
        "cross_equation_corr": np.corrcoef(resid_mat.T).tolist(),
    }
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""ARCH/GARCH volatility models (requires 'arch' package)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _require_arch():
    """Import and return the optional 'arch' package.

    Raises ImportError with installation guidance when it is missing; the
    original import error is chained so the real failure stays visible.
    """
    try:
        import arch  # noqa: F401
        return arch
    except ImportError as exc:
        raise ImportError(
            "'arch' package is required for ARCH/GARCH models.\n"
            "Install: pip install arch"
        ) from exc
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def fit_arch(
    df: pl.DataFrame,
    var: str,
    *,
    p: int = 1,
    mean: str = "Constant",
    dist: str = "normal",
) -> dict:
    """ARCH(p) model for volatility clustering.

    var : return series column; values are multiplied by 100 before fitting,
        the scale the 'arch' optimizer expects for percent returns.
    mean / dist : passed straight through to arch_model.

    Raises ImportError (with install hint) when 'arch' is not available.
    """
    # Fail fast with install guidance; the returned module object is unused
    # (the original bound it to a dead local).
    _require_arch()
    from arch import arch_model  # type: ignore[import]

    y = df[var].drop_nulls().to_numpy().astype(float) * 100  # scale returns

    am = arch_model(y, mean=mean, vol="ARCH", p=p, dist=dist)
    res = am.fit(disp="off")

    params = {k: float(v) for k, v in res.params.items()}
    return {
        "model": f"ARCH({p})",
        "var": var,
        "n_obs": len(y),
        "params": params,
        "aic": float(res.aic),
        "bic": float(res.bic),
        "log_likelihood": float(res.loglikelihood),
        "_result": res,
    }
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fit_garch(
    df: pl.DataFrame,
    var: str,
    *,
    p: int = 1,
    q: int = 1,
    mean: str = "Constant",
    dist: str = "normal",
    model: str = "GARCH",
) -> dict:
    """GARCH(p,q), EGARCH, or GJR-GARCH volatility model.

    model : 'GARCH' (default), 'EGARCH', or 'GJR-GARCH'.
    var : return series column; scaled by 100 before fitting (the scale the
        'arch' optimizer expects for percent returns).
    """
    _require_arch()
    from arch import arch_model  # type: ignore[import]

    y = df[var].drop_nulls().to_numpy().astype(float) * 100

    # arch_model has no 'GJR-GARCH' vol spec: GJR is expressed as plain
    # GARCH with asymmetry order o=1.  The original passed the string
    # through unchanged, which raised inside arch_model (bug fix).
    vol = model.upper()
    o = 0
    if vol in ("GJR", "GJR-GARCH", "GJRGARCH"):
        vol, o = "GARCH", 1
    am = arch_model(y, mean=mean, vol=vol, p=p, o=o, q=q, dist=dist)
    res = am.fit(disp="off")

    params = {k: float(v) for k, v in res.params.items()}
    cond_vol = res.conditional_volatility.tolist()

    return {
        "model": f"{model}({p},{q})",
        "var": var,
        "n_obs": len(y),
        "params": params,
        "aic": float(res.aic),
        "bic": float(res.bic),
        "log_likelihood": float(res.loglikelihood),
        "cond_volatility_last5": cond_vol[-5:],
        "_result": res,
    }
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Bayesian linear regression via scipy (conjugate prior, no PyMC required)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
from scipy import stats as sp_stats
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def bayes_ols(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    prior_scale: float = 10.0,
    n_samples: int = 4000,
    credible_interval: float = 0.95,
    seed: int = 42,
) -> dict:
    """
    Bayesian linear regression using conjugate Normal-Inverse-Gamma prior.

    Analytically exact posterior — no MCMC required; draws are taken from
    the closed-form posterior only to produce the summaries.

    Parameters
    ----------
    prior_scale : scale of the diffuse Normal(0, prior_scale²) prior on coefficients.
    n_samples : number of posterior draws used for the summaries.
    credible_interval : width of the reported equal-tailed credible interval.
    seed : RNG seed, so repeated calls reproduce the same draws.
    """
    rng = np.random.default_rng(seed)

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape

    # Add intercept
    X = np.column_stack([np.ones(n), X_raw])
    param_names = ["_cons"] + indeps
    kp = k + 1

    # ── Conjugate prior: β | σ² ~ N(0, prior_scale² I), σ² ~ IG(a0, b0)
    a0 = 0.001
    b0 = 0.001
    V0_inv = np.eye(kp) / prior_scale**2

    # ── Posterior parameters (Normal-Inverse-Gamma)
    XtX = X.T @ X
    Xty = X.T @ y
    Vn_inv = XtX + V0_inv
    Vn = np.linalg.inv(Vn_inv)
    beta_n = Vn @ Xty  # posterior mean of β (prior mean is zero)

    an = a0 + n / 2
    # Prior mean is zero, so the usual m0' V0^-1 m0 term vanishes.
    bn = b0 + 0.5 * (y @ y - beta_n @ Vn_inv @ beta_n)

    # ── Draw from posterior: σ² first, then β | σ² (bn guarded from
    # degenerate non-positive values)
    sigma2_draws = 1.0 / rng.gamma(an, 1.0 / max(bn, 1e-10), size=n_samples)
    beta_draws = np.array([
        rng.multivariate_normal(beta_n, s2 * Vn)
        for s2 in sigma2_draws
    ])

    # ── Summary
    alpha = 1 - credible_interval
    lo, hi = alpha / 2, 1 - alpha / 2

    post_mean = beta_draws.mean(axis=0)
    post_std = beta_draws.std(axis=0)
    post_lo = np.quantile(beta_draws, lo, axis=0)
    post_hi = np.quantile(beta_draws, hi, axis=0)

    # P(β > 0)
    prob_positive = (beta_draws > 0).mean(axis=0)

    # Posterior predictive R²
    y_pred = X @ post_mean
    ss_res = ((y - y_pred) ** 2).sum()
    ss_tot = ((y - y.mean()) ** 2).sum()
    r2 = float(1 - ss_res / ss_tot) if ss_tot > 0 else float("nan")

    # round(), not int(): int() truncates, so e.g. 0.7 * 100 == 69.999...
    # produced 'ci_69_*' keys (bug fix).
    ci_pct = round(credible_interval * 100)

    coefficients = {}
    for i, name in enumerate(param_names):
        coefficients[name] = {
            "mean": float(post_mean[i]),
            "std": float(post_std[i]),
            f"ci_{ci_pct}_lo": float(post_lo[i]),
            f"ci_{ci_pct}_hi": float(post_hi[i]),
            "prob_positive": float(prob_positive[i]),
        }

    return {
        "model": "Bayesian OLS (conjugate Normal-IG prior)",
        "dep": dep,
        "indeps": indeps,
        "n_obs": n,
        "n_samples": n_samples,
        "prior_scale": prior_scale,
        "credible_interval": credible_interval,
        "r_squared": r2,
        "sigma_mean": float(np.sqrt(sigma2_draws.mean())),
        "sigma_std": float(np.sqrt(sigma2_draws).std()),
        "coefficients": coefficients,
        "_beta_draws": beta_draws,
        "_sigma2_draws": sigma2_draws,
    }
|