openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,311 @@
1
+ """Discrete / censored models: Tobit, Multinomial Logit, Ordered Logit/Probit."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import statsmodels.api as sm
8
+ from scipy import stats as sp_stats
9
+ from scipy.optimize import minimize
10
+
11
+ from openstat.stats.models import FitResult, _prepare_data, _cov_args, _model_type_suffix
12
+
13
+
14
+ # ── Tobit (censored regression) ─────────────────────────────────────
15
+
16
+ def _tobit_loglike(params, y, X, lower, upper):
17
+ """Tobit log-likelihood for scipy.optimize."""
18
+ beta = params[:-1]
19
+ log_sigma = params[-1]
20
+ sigma = np.exp(log_sigma)
21
+
22
+ Xb = X @ beta
23
+ resid = (y - Xb) / sigma
24
+
25
+ ll = 0.0
26
+ for i in range(len(y)):
27
+ if lower is not None and y[i] <= lower:
28
+ # Left-censored
29
+ cdf_val = sp_stats.norm.cdf((lower - Xb[i]) / sigma)
30
+ ll += np.log(max(cdf_val, 1e-300))
31
+ elif upper is not None and y[i] >= upper:
32
+ # Right-censored
33
+ cdf_val = sp_stats.norm.sf((upper - Xb[i]) / sigma)
34
+ ll += np.log(max(cdf_val, 1e-300))
35
+ else:
36
+ # Uncensored
37
+ ll += sp_stats.norm.logpdf(resid[i]) - log_sigma
38
+
39
+ return -ll # minimize negative log-likelihood
40
+
41
+
42
def fit_tobit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    lower_limit: float | None = None,
    upper_limit: float | None = None,
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Tobit (censored) regression model via MLE.

    Maximises the Tobit likelihood with BFGS, starting from OLS estimates.
    Sigma is estimated on the log scale and reported as an extra "sigma"
    coefficient row; censoring counts are appended to the result warnings.

    Parameters
    ----------
    df : source data frame.
    dep, indeps : dependent variable and regressor column names.
    lower_limit, upper_limit : censoring bounds; observations at or beyond a
        bound are treated as censored. With both None the fit reduces to OLS
        (a note is appended).
    robust, cluster_col : NOTE(review) — these only influence the reported
        model-type suffix; the standard errors below come from the BFGS
        inverse Hessian, not a robust/cluster covariance. Confirm intended.

    Returns
    -------
    (FitResult, scipy OptimizeResult).

    Raises
    ------
    ValueError
        If fewer uncensored observations remain than parameters + 2.
    """
    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    if lower_limit is None and upper_limit is None:
        warnings_list.append("Note: No censoring limits specified. Results are equivalent to OLS.")

    # Censoring tallies drive both the identifiability check and the report.
    n_censored_low = int(np.sum(y <= lower_limit)) if lower_limit is not None else 0
    n_censored_high = int(np.sum(y >= upper_limit)) if upper_limit is not None else 0
    n_uncensored = len(y) - n_censored_low - n_censored_high

    if n_uncensored < len(var_names) + 2:
        raise ValueError(
            f"Too few uncensored observations ({n_uncensored}) for {len(var_names)} parameters."
        )

    # Initial values: OLS estimates + log(sigma)
    ols = sm.OLS(y, X).fit()
    init_beta = ols.params
    init_log_sigma = np.log(np.std(ols.resid))
    init_params = np.append(init_beta, init_log_sigma)

    result = minimize(
        _tobit_loglike, init_params, args=(y, X, lower_limit, upper_limit),
        method="BFGS",
    )

    if not result.success:
        warnings_list.append(f"Warning: Optimization did not fully converge: {result.message}")

    # Last element of x is log(sigma); the rest are regression coefficients.
    beta = result.x[:-1]
    log_sigma = result.x[-1]
    sigma = np.exp(log_sigma)

    # Standard errors from inverse Hessian
    try:
        hess_inv = result.hess_inv
        if hasattr(hess_inv, 'todense'):
            hess_inv = hess_inv.todense()
        # NOTE(review): np.abs() hides negative diagonal entries that signal a
        # non-positive-definite Hessian approximation — SEs may be unreliable
        # when that occurs; confirm whether a warning should be emitted.
        se_all = np.sqrt(np.diag(np.abs(hess_inv)))
    except Exception:
        # Fallback: numerical Hessian (finite differences of the gradient)
        from scipy.optimize import approx_fprime
        eps = 1e-5
        n_p = len(result.x)
        hess = np.zeros((n_p, n_p))
        for i in range(n_p):
            # grad_i is defined and consumed inside the same iteration, so the
            # late-binding closure over i is safe here.
            def grad_i(p):
                g = approx_fprime(p, _tobit_loglike, eps, y, X, lower_limit, upper_limit)
                return g[i]
            hess[i, :] = approx_fprime(result.x, grad_i, eps)
        try:
            se_all = np.sqrt(np.diag(np.linalg.inv(hess)))
        except np.linalg.LinAlgError:
            se_all = np.full(n_p, np.nan)

    se_beta = se_all[:-1]
    se_sigma = se_all[-1]

    # Build coefficient results (normal approximation: z-tests, 1.96 CIs)
    t_vals = beta / se_beta
    p_vals = 2 * (1 - sp_stats.norm.cdf(np.abs(t_vals)))
    ci_low = beta - 1.96 * se_beta
    ci_high = beta + 1.96 * se_beta

    # Add sigma as extra parameter; delta-method SE would differ slightly
    # since sigma was optimised on the log scale.
    all_var_names = var_names + ["sigma"]
    all_params = np.append(beta, sigma)
    all_se = np.append(se_beta, se_sigma)
    sigma_t = sigma / se_sigma if se_sigma > 0 else np.nan
    sigma_p = 2 * (1 - sp_stats.norm.cdf(np.abs(sigma_t)))
    all_t = np.append(t_vals, sigma_t)
    all_p = np.append(p_vals, sigma_p)
    all_ci_low = np.append(ci_low, sigma - 1.96 * se_sigma)
    all_ci_high = np.append(ci_high, sigma + 1.96 * se_sigma)

    suffix = _model_type_suffix(robust, groups is not None)
    ll_val = -result.fun

    # Report censoring structure alongside any fit warnings.
    censor_info = []
    if lower_limit is not None:
        censor_info.append(f"Left-censored at {lower_limit}: {n_censored_low} obs")
    if upper_limit is not None:
        censor_info.append(f"Right-censored at {upper_limit}: {n_censored_high} obs")
    censor_info.append(f"Uncensored: {n_uncensored} obs")
    warnings_list.extend(censor_info)

    fit = FitResult(
        model_type="Tobit" + suffix,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=len(y),
        params=dict(zip(all_var_names, all_params)),
        std_errors=dict(zip(all_var_names, all_se)),
        t_values=dict(zip(all_var_names, all_t)),
        p_values=dict(zip(all_var_names, all_p)),
        conf_int_low=dict(zip(all_var_names, all_ci_low)),
        conf_int_high=dict(zip(all_var_names, all_ci_high)),
        log_likelihood=ll_val,
        aic=-2 * ll_val + 2 * len(result.x),
        bic=-2 * ll_val + np.log(len(y)) * len(result.x),
        warnings=warnings_list,
    )
    return fit, result
158
+
159
+
160
+ # ── Multinomial Logit ────────────────────────────────────────────────
161
+
162
def fit_mlogit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Multinomial Logit model.

    Parameters
    ----------
    df : source data frame.
    dep : categorical dependent variable; the lowest category becomes the
        base (reference) outcome, noted in the result warnings.
    indeps : regressor column names.
    robust : heteroskedasticity-robust standard errors.
    cluster_col : column defining clusters for cluster-robust SEs.

    Returns
    -------
    (FitResult, statsmodels MNLogit results). Per-category coefficients are
    flattened into flat dicts keyed "var (y=cat)" for each non-base category.
    """
    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    # Translate robust/cluster options into statsmodels cov_type/cov_kwds.
    cov_type, cov_kwds = _cov_args(robust, groups)
    model = sm.MNLogit(y, X).fit(disp=0, cov_type=cov_type, cov_kwds=cov_kwds)

    # Get unique categories; MNLogit normalises against the first (lowest).
    categories = sorted(np.unique(y))
    base_cat = categories[0]
    other_cats = categories[1:]

    # Flatten per-category coefficients into one dict per statistic.
    params_dict = {}
    se_dict = {}
    t_dict = {}
    p_dict = {}
    ci_low_dict = {}
    ci_high_dict = {}

    ci = model.conf_int()  # shape: (n_cats-1, n_vars, 2)

    for j, cat in enumerate(other_cats):
        # Render integral category codes without a trailing ".0".
        cat_label = f"y={int(cat)}" if cat == int(cat) else f"y={cat}"
        for i, var in enumerate(var_names):
            key = f"{var} ({cat_label})"
            # params/bse/etc. are indexed (variable, category); conf_int is
            # indexed (category, variable, bound).
            params_dict[key] = float(model.params[i, j])
            se_dict[key] = float(model.bse[i, j])
            t_dict[key] = float(model.tvalues[i, j])
            p_dict[key] = float(model.pvalues[i, j])
            ci_low_dict[key] = float(ci[j, i, 0])
            ci_high_dict[key] = float(ci[j, i, 1])

    suffix = _model_type_suffix(robust, groups is not None)
    warnings_list.append(f"Base category: {int(base_cat) if base_cat == int(base_cat) else base_cat}")

    fit = FitResult(
        model_type="MNLogit" + suffix,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(model.nobs),
        params=params_dict,
        std_errors=se_dict,
        t_values=t_dict,
        p_values=p_dict,
        conf_int_low=ci_low_dict,
        conf_int_high=ci_high_dict,
        pseudo_r2=float(model.prsquared),
        log_likelihood=float(model.llf),
        aic=float(model.aic),
        bic=float(model.bic),
        warnings=warnings_list,
    )
    return fit, model
226
+
227
+
228
+ # ── Ordered Logit / Probit ───────────────────────────────────────────
229
+
230
def fit_ordered(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    link: str = "logit",
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit an Ordered Logit or Ordered Probit model.

    Parameters
    ----------
    df : source data frame.
    dep : ordinal dependent variable (numeric category codes).
    indeps : regressor column names.
    link : "logit" selects Ordered Logit; any other value selects probit.
    robust, cluster_col : NOTE(review) — accepted for interface symmetry but
        only reflected in the reported model-type suffix; ``model.fit()`` is
        called without covariance options, so SEs are conventional MLE SEs.
        Confirm whether cov_type should be forwarded to OrderedModel.fit.

    Returns
    -------
    (FitResult, statsmodels results object). Threshold (cut-point)
    parameters are reported as "cut1", "cut2", ... after the slopes.
    """
    from statsmodels.miscmodels.ordinal_model import OrderedModel

    y, X, warnings_list, var_names, groups = _prepare_data(
        df, dep, indeps, cluster_col=cluster_col,
    )

    # OrderedModel does not want a constant — it estimates thresholds instead
    # Remove the constant column (first column from sm.add_constant)
    X_no_const = X[:, 1:]
    var_names_no_const = var_names[1:]  # remove "_cons"

    distr = "logit" if link == "logit" else "probit"

    model = OrderedModel(y, X_no_const, distr=distr)
    result = model.fit(disp=0)

    # params is a numpy array; slopes come first, thresholds follow.
    n_coefs = len(var_names_no_const)
    n_total = len(result.params)
    ci = result.conf_int()  # shape: (n_total, 2)

    # Coefficients
    params_dict = {}
    se_dict = {}
    t_dict = {}
    p_dict = {}
    ci_low_dict = {}
    ci_high_dict = {}

    for i, var in enumerate(var_names_no_const):
        params_dict[var] = float(result.params[i])
        se_dict[var] = float(result.bse[i])
        t_dict[var] = float(result.tvalues[i])
        p_dict[var] = float(result.pvalues[i])
        ci_low_dict[var] = float(ci[i, 0])
        ci_high_dict[var] = float(ci[i, 1])

    # Threshold (cut-point) parameters occupy the tail of result.params.
    for i in range(n_coefs, n_total):
        cut_label = f"cut{i - n_coefs + 1}"
        params_dict[cut_label] = float(result.params[i])
        se_dict[cut_label] = float(result.bse[i])
        t_dict[cut_label] = float(result.tvalues[i])
        p_dict[cut_label] = float(result.pvalues[i])
        ci_low_dict[cut_label] = float(ci[i, 0])
        ci_high_dict[cut_label] = float(ci[i, 1])

    suffix = _model_type_suffix(robust, groups is not None)
    model_name = f"O{link.capitalize()}" + suffix  # OLogit or OProbit

    # Record the observed category ordering for the report.
    categories = sorted(np.unique(y))
    warnings_list.append(f"Ordered categories: {[int(c) if c == int(c) else c for c in categories]}")

    fit = FitResult(
        model_type=model_name,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(result.nobs),
        params=params_dict,
        std_errors=se_dict,
        t_values=t_dict,
        p_values=p_dict,
        conf_int_low=ci_low_dict,
        conf_int_high=ci_high_dict,
        pseudo_r2=float(result.prsquared) if hasattr(result, "prsquared") else None,
        log_likelihood=float(result.llf),
        aic=float(result.aic),
        bic=float(result.bic),
        warnings=warnings_list,
    )
    return fit, result
@@ -0,0 +1,119 @@
1
+ """Epidemiology functions: risk ratios, odds ratios, incidence rates, NNT."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+
10
+
11
+ def _ci_log(est: float, n: int, p: float, alpha: float = 0.05) -> tuple[float, float]:
12
+ """Approximate log-based CI for ratio estimates."""
13
+ from scipy.stats import norm
14
+ z = float(norm.ppf(1 - alpha / 2))
15
+ if p <= 0 or p >= 1 or n == 0:
16
+ return float("nan"), float("nan")
17
+ se_log = math.sqrt((1 - p) / (n * p))
18
+ return math.exp(math.log(est) - z * se_log), math.exp(math.log(est) + z * se_log)
19
+
20
+
21
def cohort_study(df: pl.DataFrame, outcome: str, exposure: str) -> dict:
    """
    Cohort study: compute RR, ARR, NNT from a 2×2 table.
    outcome and exposure must be binary (0/1).

    The 95% CI for the risk ratio uses the Katz log method, whose standard
    error combines the sampling variance of BOTH arms.
    """
    from scipy.stats import chi2_contingency, fisher_exact, norm
    sub = df.select([outcome, exposure]).drop_nulls()
    exp = sub[exposure].to_numpy().astype(int)
    out = sub[outcome].to_numpy().astype(int)

    a = int(((exp == 1) & (out == 1)).sum())  # exposed, outcome
    b = int(((exp == 1) & (out == 0)).sum())  # exposed, no outcome
    c = int(((exp == 0) & (out == 1)).sum())  # unexposed, outcome
    d = int(((exp == 0) & (out == 0)).sum())  # unexposed, no outcome

    n_exp = a + b
    n_unexp = c + d
    r_exp = a / n_exp if n_exp > 0 else float("nan")
    r_unexp = c / n_unexp if n_unexp > 0 else float("nan")

    rr = r_exp / r_unexp if r_unexp > 0 else float("nan")
    arr = r_exp - r_unexp
    nnt = 1 / abs(arr) if arr != 0 else float("nan")

    table = [[a, b], [c, d]]
    chi2, p_chi2, _, _ = chi2_contingency(table)
    _, p_fisher = fisher_exact(table)

    # BUGFIX: the previous CI (via _ci_log) used only the exposed arm's
    # variance, (1-p)/(n*p) with p = risk_exposed. The Katz log-RR standard
    # error is sqrt(1/a - 1/n_exp + 1/c - 1/n_unexp), combining both arms.
    if rr == rr and rr > 0 and a > 0 and c > 0:
        z = float(norm.ppf(0.975))
        se_log_rr = math.sqrt(1 / a - 1 / n_exp + 1 / c - 1 / n_unexp)
        rr_lo = math.exp(math.log(rr) - z * se_log_rr)
        rr_hi = math.exp(math.log(rr) + z * se_log_rr)
    else:
        rr_lo, rr_hi = float("nan"), float("nan")

    return {
        "test": "Cohort Study (RR)",
        "exposure": exposure, "outcome": outcome,
        "table_2x2": {"a": a, "b": b, "c": c, "d": d},
        "n_exposed": n_exp, "n_unexposed": n_unexp,
        "risk_exposed": r_exp, "risk_unexposed": r_unexp,
        "risk_ratio": rr, "rr_ci_95_lo": rr_lo, "rr_ci_95_hi": rr_hi,
        "arr": arr, "nnt": nnt,
        "chi2": float(chi2), "p_chi2": float(p_chi2),
        "p_fisher": float(p_fisher),
    }
62
+
63
+
64
def case_control(df: pl.DataFrame, outcome: str, exposure: str) -> dict:
    """
    Case-control study: estimate the odds ratio from a 2x2 table.

    Both columns are expected to be binary (0/1). The 95% CI uses the
    Woolf (log-odds) method; cells of zero are floored at 1 in the SE only.
    """
    from scipy.stats import chi2_contingency, fisher_exact, norm

    sub = df.select([outcome, exposure]).drop_nulls()
    exposed = sub[exposure].to_numpy().astype(int)
    diseased = sub[outcome].to_numpy().astype(int)

    def cell(e_val: int, o_val: int) -> int:
        # Count of rows with exposure == e_val and outcome == o_val.
        return int(((exposed == e_val) & (diseased == o_val)).sum())

    a, b = cell(1, 1), cell(0, 1)   # cases: exposed / unexposed
    c, d = cell(1, 0), cell(0, 0)   # controls: exposed / unexposed

    or_ = (a * d) / (b * c) if b * c > 0 else float("nan")

    # Woolf 95% CI on the log-odds scale (undefined when OR is 0 or NaN;
    # the `or_ == or_` comparison filters NaN).
    if or_ > 0 and or_ == or_:
        z = float(norm.ppf(0.975))
        se_log_or = math.sqrt(
            1 / max(a, 1) + 1 / max(b, 1) + 1 / max(c, 1) + 1 / max(d, 1)
        )
        half_width = z * se_log_or
        or_lo = math.exp(math.log(or_) - half_width)
        or_hi = math.exp(math.log(or_) + half_width)
    else:
        or_lo = or_hi = float("nan")

    table = [[a, b], [c, d]]
    chi2, p_chi2, _, _ = chi2_contingency(table)
    _, p_fisher = fisher_exact(table)

    return {
        "test": "Case-Control (OR)",
        "exposure": exposure, "outcome": outcome,
        "table_2x2": {"a": a, "b": b, "c": c, "d": d},
        "odds_ratio": or_, "or_ci_95_lo": or_lo, "or_ci_95_hi": or_hi,
        "chi2": float(chi2), "p_chi2": float(p_chi2),
        "p_fisher": float(p_fisher),
    }
101
+
102
+
103
def incidence_rate(df: pl.DataFrame, outcome: str, person_time: str) -> dict:
    """Compute incidence rate = cases / total person-time.

    Parameters
    ----------
    df : source data frame.
    outcome : event-count (or 0/1 event) column; its sum is the case count.
    person_time : per-row follow-up time column; its sum is the denominator.

    Returns a dict with the rate and an exact Poisson 95% CI (Byar's
    chi-square approximation). All rate fields are NaN when total
    person-time is not positive.
    """
    from scipy.stats import chi2
    sub = df.select([outcome, person_time]).drop_nulls()
    cases = int(sub[outcome].sum())
    pt = float(sub[person_time].sum())
    if pt > 0:
        ir = cases / pt
        # Exact Poisson CI via chi-square quantiles; lower bound is 0 when
        # there are no cases.
        lo = 0.5 * float(chi2.ppf(0.025, 2 * cases)) / pt if cases > 0 else 0.0
        hi = 0.5 * float(chi2.ppf(0.975, 2 * (cases + 1))) / pt
    else:
        # BUGFIX: previously the upper bound divided by pt unconditionally,
        # raising ZeroDivisionError when total person-time was 0.
        ir = float("nan")
        lo = hi = float("nan")
    return {
        "test": "Incidence Rate",
        "outcome": outcome, "person_time_col": person_time,
        "cases": cases, "person_time": pt,
        "incidence_rate": ir,
        "ir_ci_95_lo": lo, "ir_ci_95_hi": hi,
    }
@@ -0,0 +1,163 @@
1
+ """Equivalence tests (TOST) and Tobit/Heckman regression."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ from scipy import stats as sp_stats
8
+
9
+
10
def tost_onemean(
    df: pl.DataFrame,
    col: str,
    mu: float = 0.0,
    delta: float = 0.5,
    alpha: float = 0.05,
) -> dict:
    """
    One-sample equivalence test via Two One-Sided Tests (TOST).

    Tests H0: |mean - mu| >= delta against H1: |mean - mu| < delta;
    equivalence is declared when both one-sided p-values fall below alpha.
    """
    values = df[col].drop_nulls().to_numpy().astype(float)
    n_obs = len(values)
    sample_mean = values.mean()
    std_err = values.std(ddof=1) / np.sqrt(n_obs)
    dof = n_obs - 1

    # One-sided test against the lower bound (H0: mean <= mu - delta).
    t_lower = (sample_mean - (mu - delta)) / std_err
    p_lower = float(sp_stats.t.sf(t_lower, df=dof))

    # One-sided test against the upper bound (H0: mean >= mu + delta).
    t_upper = (sample_mean - (mu + delta)) / std_err
    p_upper = float(sp_stats.t.cdf(t_upper, df=dof))

    # TOST decision rule: both one-sided tests must reject, so the overall
    # p-value is the larger of the two.
    p_overall = max(p_lower, p_upper)

    return {
        "test": "TOST Equivalence (one-sample)",
        "col": col,
        "n_obs": n_obs,
        "mean": float(sample_mean),
        "mu": mu,
        "delta": delta,
        "alpha": alpha,
        "t_lower": float(t_lower),
        "t_upper": float(t_upper),
        "p_lower": p_lower,
        "p_upper": p_upper,
        "p_tost": p_overall,
        "equivalent_at_alpha": p_overall < alpha,
    }
52
+
53
+
54
def tost_twomeans(
    df: pl.DataFrame,
    col: str,
    by: str,
    delta: float = 0.5,
    alpha: float = 0.05,
) -> dict:
    """TOST for equivalence of two independent group means.

    Uses Welch's (unequal-variance) t statistics. H0: |mean1 - mean2| >=
    delta is rejected in favour of equivalence when both one-sided
    p-values fall below alpha.

    Raises
    ------
    ValueError
        If `by` does not contain exactly two distinct non-null groups.
    """
    groups = df[by].drop_nulls().unique().sort().to_list()
    if len(groups) != 2:
        raise ValueError(f"tost_twomeans requires exactly 2 groups, got {len(groups)}")
    g1 = df.filter(pl.col(by) == groups[0])[col].drop_nulls().to_numpy().astype(float)
    g2 = df.filter(pl.col(by) == groups[1])[col].drop_nulls().to_numpy().astype(float)

    diff = float(g1.mean() - g2.mean())
    # Hoist the per-group variance-of-the-mean terms: they appear in both the
    # standard error and the Welch-Satterthwaite degrees of freedom.
    v1 = g1.var(ddof=1) / len(g1)
    v2 = g2.var(ddof=1) / len(g2)
    se = float(np.sqrt(v1 + v2))
    # BUGFIX: keep the Welch-Satterthwaite df as a float. The previous int()
    # truncation discarded the fractional part, shifting both p-values.
    df_welch = float((v1 + v2) ** 2 /
                     (v1 ** 2 / (len(g1) - 1) + v2 ** 2 / (len(g2) - 1)))

    # Two one-sided tests against the equivalence bounds -delta and +delta.
    t_lo = (diff - (-delta)) / se
    t_hi = (diff - delta) / se
    p_lo = float(sp_stats.t.sf(t_lo, df=df_welch))
    p_hi = float(sp_stats.t.cdf(t_hi, df=df_welch))
    p_tost = max(p_lo, p_hi)

    return {
        "test": "TOST Equivalence (two-sample)",
        "col": col, "by": by,
        "groups": [str(g) for g in groups],
        "mean_diff": diff,
        "delta": delta,
        "alpha": alpha,
        "p_tost": p_tost,
        "equivalent_at_alpha": p_tost < alpha,
        "p_lower": p_lo,
        "p_upper": p_hi,
    }
92
+
93
+
94
def fit_tobit(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    left: float | None = 0.0,
    right: float | None = None,
) -> dict:
    """
    Censored (Tobit) regression estimated by maximum likelihood.

    Observations at or below `left` count as left-censored and those at or
    above `right` as right-censored; either limit may be None. Optimised
    with L-BFGS-B from an OLS starting point; sigma is estimated on the
    log scale so it stays positive.
    """
    from scipy.optimize import minimize
    from scipy.stats import norm

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    design = sub.select(indeps).to_numpy().astype(float)
    n, k = design.shape
    X = np.column_stack([np.ones(n), design])  # prepend intercept column
    kp = k + 1

    def neg_ll(theta):
        # theta = (beta..., log sigma).
        coefs = theta[:kp]
        sigma = np.exp(theta[kp])
        xb = X @ coefs
        contrib = np.zeros(n)
        for i in range(n):
            if left is not None and y[i] <= left:
                # Left-censored: log P(y* <= left)
                contrib[i] = norm.logcdf((left - xb[i]) / sigma)
            elif right is not None and y[i] >= right:
                # Right-censored: log P(y* >= right)
                contrib[i] = norm.logsf((right - xb[i]) / sigma)
            else:
                contrib[i] = norm.logpdf(y[i], loc=xb[i], scale=sigma)
        return -contrib.sum()

    # Start from OLS coefficients and the (floored) log residual spread.
    start_beta = np.linalg.lstsq(X, y, rcond=None)[0]
    start_resid = y - X @ start_beta
    start = np.concatenate([start_beta, [np.log(max(start_resid.std(), 1e-4))]])

    try:
        res = minimize(neg_ll, start, method="L-BFGS-B", options={"maxiter": 500})
        beta_hat = res.x[:kp]
        sigma_hat = float(np.exp(res.x[kp]))
        llf = -res.fun
        n_free = kp + 1  # slopes + intercept + sigma
        aic = 2 * n_free - 2 * llf
        bic = n_free * np.log(n) - 2 * llf

        labels = ["_cons"] + indeps
        params = {nm: float(v) for nm, v in zip(labels, beta_hat)}

        return {
            "method": "Tobit",
            "dep": dep, "indeps": indeps,
            "left_censoring": left,
            "right_censoring": right,
            "params": params,
            "sigma": sigma_hat,
            "log_likelihood": float(llf),
            "aic": float(aic),
            "bic": float(bic),
            "n_obs": n,
            "n_censored_left": int((y <= left).sum()) if left is not None else 0,
            "n_censored_right": int((y >= right).sum()) if right is not None else 0,
        }
    except Exception as exc:
        raise RuntimeError(f"Tobit failed: {exc}") from exc