openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""Discrete / censored models: Tobit, Multinomial Logit, Ordered Logit/Probit."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
import statsmodels.api as sm
|
|
8
|
+
from scipy import stats as sp_stats
|
|
9
|
+
from scipy.optimize import minimize
|
|
10
|
+
|
|
11
|
+
from openstat.stats.models import FitResult, _prepare_data, _cov_args, _model_type_suffix
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ── Tobit (censored regression) ─────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
def _tobit_loglike(params, y, X, lower, upper):
|
|
17
|
+
"""Tobit log-likelihood for scipy.optimize."""
|
|
18
|
+
beta = params[:-1]
|
|
19
|
+
log_sigma = params[-1]
|
|
20
|
+
sigma = np.exp(log_sigma)
|
|
21
|
+
|
|
22
|
+
Xb = X @ beta
|
|
23
|
+
resid = (y - Xb) / sigma
|
|
24
|
+
|
|
25
|
+
ll = 0.0
|
|
26
|
+
for i in range(len(y)):
|
|
27
|
+
if lower is not None and y[i] <= lower:
|
|
28
|
+
# Left-censored
|
|
29
|
+
cdf_val = sp_stats.norm.cdf((lower - Xb[i]) / sigma)
|
|
30
|
+
ll += np.log(max(cdf_val, 1e-300))
|
|
31
|
+
elif upper is not None and y[i] >= upper:
|
|
32
|
+
# Right-censored
|
|
33
|
+
cdf_val = sp_stats.norm.sf((upper - Xb[i]) / sigma)
|
|
34
|
+
ll += np.log(max(cdf_val, 1e-300))
|
|
35
|
+
else:
|
|
36
|
+
# Uncensored
|
|
37
|
+
ll += sp_stats.norm.logpdf(resid[i]) - log_sigma
|
|
38
|
+
|
|
39
|
+
return -ll # minimize negative log-likelihood
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def fit_tobit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    lower_limit: float | None = None,
    upper_limit: float | None = None,
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Tobit (censored) regression model via MLE.

    Maximizes the Tobit likelihood with BFGS, using OLS estimates as
    starting values.  Standard errors come from the optimizer's inverse
    Hessian (or a numerical Hessian as a fallback).  The estimated scale
    parameter is reported as an extra coefficient named "sigma".

    NOTE(review): `robust` and `cluster_col` only affect the model-type
    label suffix here — the standard errors are always Hessian-based.
    Confirm whether sandwich/clustered SEs were intended.

    Returns a (FitResult, scipy OptimizeResult) tuple.
    Raises ValueError when too few uncensored observations remain.
    """
    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    if lower_limit is None and upper_limit is None:
        warnings_list.append("Note: No censoring limits specified. Results are equivalent to OLS.")

    # Count observations at/beyond each censoring limit.
    n_censored_low = int(np.sum(y <= lower_limit)) if lower_limit is not None else 0
    n_censored_high = int(np.sum(y >= upper_limit)) if upper_limit is not None else 0
    n_uncensored = len(y) - n_censored_low - n_censored_high

    # Need uncensored data to identify the coefficients plus sigma.
    if n_uncensored < len(var_names) + 2:
        raise ValueError(
            f"Too few uncensored observations ({n_uncensored}) for {len(var_names)} parameters."
        )

    # Initial values: OLS estimates + log(sigma)
    ols = sm.OLS(y, X).fit()
    init_beta = ols.params
    init_log_sigma = np.log(np.std(ols.resid))
    init_params = np.append(init_beta, init_log_sigma)

    # Minimize the negative log-likelihood; BFGS also provides an inverse
    # Hessian approximation used for standard errors below.
    result = minimize(
        _tobit_loglike, init_params, args=(y, X, lower_limit, upper_limit),
        method="BFGS",
    )

    if not result.success:
        warnings_list.append(f"Warning: Optimization did not fully converge: {result.message}")

    # Last parameter is log(sigma); transform back to the scale parameter.
    beta = result.x[:-1]
    log_sigma = result.x[-1]
    sigma = np.exp(log_sigma)

    # Standard errors from inverse Hessian
    try:
        hess_inv = result.hess_inv
        # Some optimizers return a sparse/linear-operator inverse Hessian.
        if hasattr(hess_inv, 'todense'):
            hess_inv = hess_inv.todense()
        # abs() guards against tiny negative diagonal entries from the
        # BFGS approximation; NOTE(review): masks a non-PSD Hessian.
        se_all = np.sqrt(np.diag(np.abs(hess_inv)))
    except Exception:
        # Fallback: numerical Hessian
        from scipy.optimize import approx_fprime
        eps = 1e-5
        n_p = len(result.x)
        hess = np.zeros((n_p, n_p))
        # Build the Hessian row by row: differentiate the i-th component
        # of the numerical gradient.  grad_i is called immediately within
        # each iteration, so capturing `i` is safe here.
        for i in range(n_p):
            def grad_i(p):
                g = approx_fprime(p, _tobit_loglike, eps, y, X, lower_limit, upper_limit)
                return g[i]
            hess[i, :] = approx_fprime(result.x, grad_i, eps)
        try:
            se_all = np.sqrt(np.diag(np.linalg.inv(hess)))
        except np.linalg.LinAlgError:
            # Singular Hessian: report NaN standard errors rather than fail.
            se_all = np.full(n_p, np.nan)

    se_beta = se_all[:-1]
    se_sigma = se_all[-1]

    # Build coefficient results
    # z-statistics with normal (large-sample) p-values and 95% CIs.
    t_vals = beta / se_beta
    p_vals = 2 * (1 - sp_stats.norm.cdf(np.abs(t_vals)))
    ci_low = beta - 1.96 * se_beta
    ci_high = beta + 1.96 * se_beta

    # Add sigma as extra parameter
    all_var_names = var_names + ["sigma"]
    all_params = np.append(beta, sigma)
    all_se = np.append(se_beta, se_sigma)
    sigma_t = sigma / se_sigma if se_sigma > 0 else np.nan
    sigma_p = 2 * (1 - sp_stats.norm.cdf(np.abs(sigma_t)))
    all_t = np.append(t_vals, sigma_t)
    all_p = np.append(p_vals, sigma_p)
    all_ci_low = np.append(ci_low, sigma - 1.96 * se_sigma)
    all_ci_high = np.append(ci_high, sigma + 1.96 * se_sigma)

    suffix = _model_type_suffix(robust, groups is not None)
    ll_val = -result.fun

    # Surface censoring counts through the warnings channel so they appear
    # in the printed output.
    censor_info = []
    if lower_limit is not None:
        censor_info.append(f"Left-censored at {lower_limit}: {n_censored_low} obs")
    if upper_limit is not None:
        censor_info.append(f"Right-censored at {upper_limit}: {n_censored_high} obs")
    censor_info.append(f"Uncensored: {n_uncensored} obs")
    warnings_list.extend(censor_info)

    fit = FitResult(
        model_type="Tobit" + suffix,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=len(y),
        params=dict(zip(all_var_names, all_params)),
        std_errors=dict(zip(all_var_names, all_se)),
        t_values=dict(zip(all_var_names, all_t)),
        p_values=dict(zip(all_var_names, all_p)),
        conf_int_low=dict(zip(all_var_names, all_ci_low)),
        conf_int_high=dict(zip(all_var_names, all_ci_high)),
        log_likelihood=ll_val,
        # AIC/BIC count all estimated parameters including log(sigma).
        aic=-2 * ll_val + 2 * len(result.x),
        bic=-2 * ll_val + np.log(len(y)) * len(result.x),
        warnings=warnings_list,
    )
    return fit, result
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ── Multinomial Logit ────────────────────────────────────────────────
|
|
161
|
+
|
|
162
|
+
def fit_mlogit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Multinomial Logit model."""
    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    cov_type, cov_kwds = _cov_args(robust, groups)
    model = sm.MNLogit(y, X).fit(disp=0, cov_type=cov_type, cov_kwds=cov_kwds)

    # The smallest outcome value serves as the reference (base) category;
    # coefficients are reported for every other category relative to it.
    categories = sorted(np.unique(y))
    base_cat = categories[0]

    params_dict = {}
    se_dict = {}
    t_dict = {}
    p_dict = {}
    ci_low_dict = {}
    ci_high_dict = {}

    ci = model.conf_int()  # shape: (n_cats-1, n_vars, 2)

    # Flatten the (variable, category) grid of statistics into flat dicts,
    # keyed "var (y=cat)".  The dict/array pairs below all index as [i, j].
    stat_pairs = (
        (params_dict, model.params),
        (se_dict, model.bse),
        (t_dict, model.tvalues),
        (p_dict, model.pvalues),
    )
    for j, cat in enumerate(categories[1:]):
        cat_label = f"y={int(cat)}" if cat == int(cat) else f"y={cat}"
        for i, var in enumerate(var_names):
            key = f"{var} ({cat_label})"
            for target, source in stat_pairs:
                target[key] = float(source[i, j])
            ci_low_dict[key] = float(ci[j, i, 0])
            ci_high_dict[key] = float(ci[j, i, 1])

    suffix = _model_type_suffix(robust, groups is not None)
    warnings_list.append(f"Base category: {int(base_cat) if base_cat == int(base_cat) else base_cat}")

    fit = FitResult(
        model_type="MNLogit" + suffix,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(model.nobs),
        params=params_dict,
        std_errors=se_dict,
        t_values=t_dict,
        p_values=p_dict,
        conf_int_low=ci_low_dict,
        conf_int_high=ci_high_dict,
        pseudo_r2=float(model.prsquared),
        log_likelihood=float(model.llf),
        aic=float(model.aic),
        bic=float(model.bic),
        warnings=warnings_list,
    )
    return fit, model
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ── Ordered Logit / Probit ───────────────────────────────────────────
|
|
229
|
+
|
|
230
|
+
def fit_ordered(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    link: str = "logit",
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit an Ordered Logit or Ordered Probit model."""
    from statsmodels.miscmodels.ordinal_model import OrderedModel

    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    # OrderedModel estimates threshold (cut-point) parameters instead of an
    # intercept, so drop the constant column that _prepare_data prepended.
    X_no_const = X[:, 1:]
    slope_names = var_names[1:]  # remove "_cons"

    distr = "logit" if link == "logit" else "probit"
    result = OrderedModel(y, X_no_const, distr=distr).fit(disp=0)

    # result.params holds the slope coefficients followed by the thresholds.
    n_coefs = len(slope_names)
    n_total = len(result.params)
    ci = result.conf_int()  # shape: (n_total, 2)

    # One label per estimated parameter: slopes first, then cut1, cut2, ...
    labels = list(slope_names) + [f"cut{m + 1}" for m in range(n_total - n_coefs)]

    params_dict = {}
    se_dict = {}
    t_dict = {}
    p_dict = {}
    ci_low_dict = {}
    ci_high_dict = {}

    for idx, label in enumerate(labels):
        params_dict[label] = float(result.params[idx])
        se_dict[label] = float(result.bse[idx])
        t_dict[label] = float(result.tvalues[idx])
        p_dict[label] = float(result.pvalues[idx])
        ci_low_dict[label] = float(ci[idx, 0])
        ci_high_dict[label] = float(ci[idx, 1])

    # NOTE(review): robust/cluster_col only change the label suffix here;
    # the fit itself always uses default standard errors.
    suffix = _model_type_suffix(robust, groups is not None)
    model_name = f"O{link.capitalize()}" + suffix  # OLogit or OProbit

    categories = sorted(np.unique(y))
    warnings_list.append(f"Ordered categories: {[int(c) if c == int(c) else c for c in categories]}")

    fit = FitResult(
        model_type=model_name,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(result.nobs),
        params=params_dict,
        std_errors=se_dict,
        t_values=t_dict,
        p_values=p_dict,
        conf_int_low=ci_low_dict,
        conf_int_high=ci_high_dict,
        pseudo_r2=float(result.prsquared) if hasattr(result, "prsquared") else None,
        log_likelihood=float(result.llf),
        aic=float(result.aic),
        bic=float(result.bic),
        warnings=warnings_list,
    )
    return fit, result
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Epidemiology functions: risk ratios, odds ratios, incidence rates, NNT."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _ci_log(est: float, n: int, p: float, alpha: float = 0.05) -> tuple[float, float]:
|
|
12
|
+
"""Approximate log-based CI for ratio estimates."""
|
|
13
|
+
from scipy.stats import norm
|
|
14
|
+
z = float(norm.ppf(1 - alpha / 2))
|
|
15
|
+
if p <= 0 or p >= 1 or n == 0:
|
|
16
|
+
return float("nan"), float("nan")
|
|
17
|
+
se_log = math.sqrt((1 - p) / (n * p))
|
|
18
|
+
return math.exp(math.log(est) - z * se_log), math.exp(math.log(est) + z * se_log)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def cohort_study(df: pl.DataFrame, outcome: str, exposure: str) -> dict:
    """
    Cohort study: compute RR, ARR, NNT from a 2×2 table.
    outcome and exposure must be binary (0/1).
    """
    from scipy.stats import chi2_contingency, fisher_exact
    sub = df.select([outcome, exposure]).drop_nulls()
    exp = sub[exposure].to_numpy().astype(int)
    out = sub[outcome].to_numpy().astype(int)

    # Classify each subject once, then count the four 2x2 cells.
    exp_mask = exp == 1
    unexp_mask = exp == 0
    case_mask = out == 1
    noncase_mask = out == 0

    a = int((exp_mask & case_mask).sum())      # exposed, outcome
    b = int((exp_mask & noncase_mask).sum())   # exposed, no outcome
    c = int((unexp_mask & case_mask).sum())    # unexposed, outcome
    d = int((unexp_mask & noncase_mask).sum())  # unexposed, no outcome

    n_exp = a + b
    n_unexp = c + d
    r_exp = a / n_exp if n_exp > 0 else float("nan")
    r_unexp = c / n_unexp if n_unexp > 0 else float("nan")

    # Risk ratio, absolute risk reduction, number needed to treat.
    rr = r_exp / r_unexp if r_unexp > 0 else float("nan")
    arr = r_exp - r_unexp
    nnt = 1 / abs(arr) if arr != 0 else float("nan")

    table = [[a, b], [c, d]]
    chi2, p_chi2, _, _ = chi2_contingency(table)
    _, p_fisher = fisher_exact(table)

    # CI only when the risk ratio is defined (not NaN).
    if math.isnan(rr):
        rr_lo = rr_hi = float("nan")
    else:
        rr_lo, rr_hi = _ci_log(rr, n_exp, r_exp)

    return {
        "test": "Cohort Study (RR)",
        "exposure": exposure, "outcome": outcome,
        "table_2x2": {"a": a, "b": b, "c": c, "d": d},
        "n_exposed": n_exp, "n_unexposed": n_unexp,
        "risk_exposed": r_exp, "risk_unexposed": r_unexp,
        "risk_ratio": rr, "rr_ci_95_lo": rr_lo, "rr_ci_95_hi": rr_hi,
        "arr": arr, "nnt": nnt,
        "chi2": float(chi2), "p_chi2": float(p_chi2),
        "p_fisher": float(p_fisher),
    }
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def case_control(df: pl.DataFrame, outcome: str, exposure: str) -> dict:
    """
    Case-control study: compute OR with 95% CI (Woolf method).
    """
    from scipy.stats import chi2_contingency, fisher_exact
    sub = df.select([outcome, exposure]).drop_nulls()
    exp = sub[exposure].to_numpy().astype(int)
    out = sub[outcome].to_numpy().astype(int)

    # Case-control 2x2 layout: rows = outcome status, columns = exposure.
    case_mask = out == 1
    control_mask = out == 0
    a = int(((exp == 1) & case_mask).sum())      # exposed cases
    b = int(((exp == 0) & case_mask).sum())      # unexposed cases
    c = int(((exp == 1) & control_mask).sum())   # exposed controls
    d = int(((exp == 0) & control_mask).sum())   # unexposed controls

    or_ = (a * d) / (b * c) if b * c > 0 else float("nan")

    # Woolf 95% CI
    # (or_ > 0 is False for NaN, so this also screens out undefined ORs).
    if or_ > 0:
        from scipy.stats import norm
        z = float(norm.ppf(0.975))
        se_log_or = math.sqrt(1/max(a, 1) + 1/max(b, 1) + 1/max(c, 1) + 1/max(d, 1))
        log_or = math.log(or_)
        half_width = z * se_log_or
        or_lo = math.exp(log_or - half_width)
        or_hi = math.exp(log_or + half_width)
    else:
        or_lo = or_hi = float("nan")

    table = [[a, b], [c, d]]
    chi2, p_chi2, _, _ = chi2_contingency(table)
    _, p_fisher = fisher_exact(table)

    return {
        "test": "Case-Control (OR)",
        "exposure": exposure, "outcome": outcome,
        "table_2x2": {"a": a, "b": b, "c": c, "d": d},
        "odds_ratio": or_, "or_ci_95_lo": or_lo, "or_ci_95_hi": or_hi,
        "chi2": float(chi2), "p_chi2": float(p_chi2),
        "p_fisher": float(p_fisher),
    }
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def incidence_rate(df: pl.DataFrame, outcome: str, person_time: str) -> dict:
    """Compute incidence rate = cases / total person-time.

    Parameters
    ----------
    df : pl.DataFrame
        Data with one row per subject.
    outcome : str
        Column of event counts (summed to total cases).
    person_time : str
        Column of follow-up time (summed to total person-time).

    Returns a dict with the rate and an exact Poisson 95% CI.
    """
    sub = df.select([outcome, person_time]).drop_nulls()
    # `or 0` guards the empty-frame case where sum() may return None.
    cases = int(sub[outcome].sum() or 0)
    pt = float(sub[person_time].sum() or 0.0)

    if pt > 0:
        ir = cases / pt
        # Exact Poisson CI (Byar's approximation).  chi2.ppf(0.025, 0) is NaN,
        # so the lower bound is pinned to 0 when there are no cases.
        from scipy.stats import chi2
        lo = 0.5 * float(chi2.ppf(0.025, 2 * cases)) / pt if cases > 0 else 0.0
        hi = 0.5 * float(chi2.ppf(0.975, 2 * (cases + 1))) / pt
    else:
        # No person-time observed: the rate and its CI are undefined.
        # (Previously the upper-bound line divided by pt unguarded and raised
        # ZeroDivisionError.)
        ir = float("nan")
        lo = float("nan")
        hi = float("nan")

    return {
        "test": "Incidence Rate",
        "outcome": outcome, "person_time_col": person_time,
        "cases": cases, "person_time": pt,
        "incidence_rate": ir,
        "ir_ci_95_lo": lo, "ir_ci_95_hi": hi,
    }
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Equivalence tests (TOST) and Tobit/Heckman regression."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
from scipy import stats as sp_stats
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def tost_onemean(
    df: pl.DataFrame,
    col: str,
    mu: float = 0.0,
    delta: float = 0.5,
    alpha: float = 0.05,
) -> dict:
    """
    Two One-Sided Tests (TOST) for equivalence: one-sample.
    H0: |mean - mu| >= delta vs H1: |mean - mu| < delta
    """
    data = df[col].drop_nulls().to_numpy().astype(float)
    n = len(data)
    x_bar = data.mean()
    se = data.std(ddof=1) / np.sqrt(n)

    # Two one-sided t tests against the equivalence bounds mu +/- delta.
    t_lo = (x_bar - mu + delta) / se  # H0: mean <= mu - delta
    t_hi = (x_bar - mu - delta) / se  # H0: mean >= mu + delta
    p_lo = float(sp_stats.t.sf(t_lo, df=n - 1))   # one-sided upper
    p_hi = float(sp_stats.t.cdf(t_hi, df=n - 1))  # one-sided lower

    # Equivalence requires rejecting BOTH one-sided nulls.
    p_tost = max(p_lo, p_hi)

    return {
        "test": "TOST Equivalence (one-sample)",
        "col": col,
        "n_obs": n,
        "mean": float(x_bar),
        "mu": mu,
        "delta": delta,
        "alpha": alpha,
        "t_lower": float(t_lo),
        "t_upper": float(t_hi),
        "p_lower": p_lo,
        "p_upper": p_hi,
        "p_tost": p_tost,
        "equivalent_at_alpha": p_tost < alpha,
    }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def tost_twomeans(
    df: pl.DataFrame,
    col: str,
    by: str,
    delta: float = 0.5,
    alpha: float = 0.05,
) -> dict:
    """TOST for equivalence of two independent group means."""
    groups = df[by].drop_nulls().unique().sort().to_list()
    if len(groups) != 2:
        raise ValueError(f"tost_twomeans requires exactly 2 groups, got {len(groups)}")
    g1 = df.filter(pl.col(by) == groups[0])[col].drop_nulls().to_numpy().astype(float)
    g2 = df.filter(pl.col(by) == groups[1])[col].drop_nulls().to_numpy().astype(float)

    diff = float(g1.mean() - g2.mean())

    # Welch pieces: each group's variance of the mean.
    v1 = g1.var(ddof=1) / len(g1)
    v2 = g2.var(ddof=1) / len(g2)
    se = float(np.sqrt(v1 + v2))
    # Welch-Satterthwaite degrees of freedom.
    # NOTE(review): truncated to int here; fractional df would be slightly
    # more accurate — confirm intended.
    df_welch = int((v1 + v2) ** 2 /
                   (v1 ** 2 / (len(g1) - 1) + v2 ** 2 / (len(g2) - 1)))

    # One-sided tests against the equivalence bounds -delta and +delta.
    t_lo = (diff + delta) / se
    t_hi = (diff - delta) / se
    p_lo = float(sp_stats.t.sf(t_lo, df=df_welch))
    p_hi = float(sp_stats.t.cdf(t_hi, df=df_welch))
    p_tost = max(p_lo, p_hi)

    return {
        "test": "TOST Equivalence (two-sample)",
        "col": col, "by": by,
        "groups": [str(g) for g in groups],
        "mean_diff": diff,
        "delta": delta,
        "alpha": alpha,
        "p_tost": p_tost,
        "equivalent_at_alpha": p_tost < alpha,
        "p_lower": p_lo,
        "p_upper": p_hi,
    }
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def fit_tobit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    left: float | None = 0.0,
    right: float | None = None,
) -> dict:
    """
    Tobit regression for censored outcomes via MLE (scipy optimize).
    Handles left-censoring (default at 0), right-censoring, or both.

    Parameters
    ----------
    df : pl.DataFrame
        Input data; rows with nulls in `dep`/`indeps` are dropped.
    dep : str
        Censored dependent variable.
    indeps : list[str]
        Regressors (an intercept "_cons" is added automatically).
    left, right : float or None
        Censoring limits; None disables that side.

    Returns a result dict; raises RuntimeError if the fit fails.
    """
    from scipy.optimize import minimize
    from scipy.stats import norm

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape
    X = np.column_stack([np.ones(n), X_raw])  # prepend intercept column
    kp = k + 1

    def neg_ll(params):
        """Negative Tobit log-likelihood, vectorized over observations."""
        beta = params[:kp]
        log_sigma = params[kp]
        sigma = np.exp(log_sigma)  # optimize log(sigma) so sigma > 0
        xb = X @ beta
        # Uncensored density everywhere, then overwrite censored cells.
        ll = norm.logpdf(y, loc=xb, scale=sigma)
        if right is not None:
            ll = np.where(y >= right, norm.logsf((right - xb) / sigma), ll)
        if left is not None:
            # Applied last so left-censoring wins when both limits match,
            # mirroring the original if/elif ordering.
            ll = np.where(y <= left, norm.logcdf((left - xb) / sigma), ll)
        return -float(ll.sum())

    # OLS start
    beta0 = np.linalg.lstsq(X, y, rcond=None)[0]
    resid0 = y - X @ beta0
    # Floor the scale so log() is finite even for a perfect OLS fit.
    log_sigma0 = np.log(max(resid0.std(), 1e-4))
    x0 = np.concatenate([beta0, [log_sigma0]])

    try:
        res = minimize(neg_ll, x0, method="L-BFGS-B", options={"maxiter": 500})
        beta_hat = res.x[:kp]
        sigma_hat = float(np.exp(res.x[kp]))
        llf = -res.fun
        # kp slopes + 1 scale parameter enter the information criteria.
        aic = 2 * (kp + 1) - 2 * llf
        bic = (kp + 1) * np.log(n) - 2 * llf

        param_names = ["_cons"] + indeps
        params = {nm: float(v) for nm, v in zip(param_names, beta_hat)}

        return {
            "method": "Tobit",
            "dep": dep, "indeps": indeps,
            "left_censoring": left,
            "right_censoring": right,
            "params": params,
            "sigma": sigma_hat,
            "log_likelihood": float(llf),
            "aic": float(aic),
            "bic": float(bic),
            "n_obs": n,
            "n_censored_left": int((y <= left).sum()) if left is not None else 0,
            "n_censored_right": int((y >= right).sum()) if right is not None else 0,
        }
    except Exception as exc:
        raise RuntimeError(f"Tobit failed: {exc}") from exc
|