openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Nonparametric hypothesis tests and rank-based statistics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
from scipy import stats as sp_stats
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# ── Spearman rank correlation ──────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
def spearman_corr(df: pl.DataFrame, cols: list[str]) -> dict:
    """Spearman rank correlation matrix with pairwise-complete observations.

    Parameters
    ----------
    df : pl.DataFrame
        Source data.
    cols : list[str]
        Numeric columns to correlate.

    Returns
    -------
    dict
        ``rho`` (nested list correlation matrix), ``pvalues`` (matching
        p-value matrix, 0.0 on the diagonal) and ``cols`` (the input order).

    NaN entries are dropped pairwise (per column pair) before correlating.
    """
    X = df.select(cols).to_numpy().astype(float)
    n = X.shape[1]
    rho = np.eye(n)
    pvals = np.zeros((n, n))
    # Spearman correlation is symmetric, so compute each unordered pair
    # once (upper triangle) and mirror it, instead of running spearmanr
    # on every ordered (i, j) pair twice as before.
    for i in range(n):
        for j in range(i + 1, n):
            mask = ~(np.isnan(X[:, i]) | np.isnan(X[:, j]))
            r, p = sp_stats.spearmanr(X[mask, i], X[mask, j])
            rho[i, j] = rho[j, i] = r
            pvals[i, j] = pvals[j, i] = p
    return {"rho": rho.tolist(), "pvalues": pvals.tolist(), "cols": cols}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ── Mann-Whitney / Wilcoxon rank-sum ──────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
def ranksum_test(
    df: pl.DataFrame,
    var: str,
    group: str,
    *,
    alternative: str = "two-sided",
) -> dict:
    """
    Wilcoxon rank-sum test (Mann-Whitney U) for two independent groups.

    alternative: 'two-sided', 'less', 'greater'
    """
    levels = df[group].drop_nulls().unique().to_list()
    if len(levels) != 2:
        raise ValueError(f"'{group}' must have exactly 2 groups, found {len(levels)}")

    a, b = (
        df.filter(pl.col(group) == lvl)[var].drop_nulls().to_numpy().astype(float)
        for lvl in levels
    )
    n1, n2 = len(a), len(b)

    u_stat, p_val = sp_stats.mannwhitneyu(a, b, alternative=alternative)
    # Normal approximation: centre U at its null mean n1*n2/2 and scale by
    # the null standard deviation (no tie correction applied).
    mean_u = n1 * n2 / 2
    sd_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
    z_val = (u_stat - mean_u) / sd_u

    return {
        "test": "Wilcoxon rank-sum (Mann-Whitney U)",
        "var": var,
        "group": group,
        "groups": levels,
        "n1": n1,
        "n2": n2,
        "U_statistic": float(u_stat),
        "z_statistic": float(z_val),
        "p_value": float(p_val),
        "alternative": alternative,
    }
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ── Wilcoxon signed-rank ───────────────────────────────────────────────────
|
|
70
|
+
|
|
71
|
+
def signrank_test(
    df: pl.DataFrame,
    var1: str,
    var2: str | None = None,
    *,
    mu: float = 0.0,
    alternative: str = "two-sided",
) -> dict:
    """
    Wilcoxon signed-rank test.

    One-sample: var2=None, tests median of var1 == mu.
    Paired: tests median of (var1 - var2) == 0.

    For the paired test, rows where *either* variable is null are dropped
    jointly so the remaining observations stay correctly paired.  (The old
    code dropped nulls per column and then truncated to the shorter length,
    which silently misaligned pairs whenever one column had nulls.)
    """
    if var2 is not None:
        # Drop incomplete rows before splitting the columns; per-column
        # drop_nulls would shift values and pair x[i] with the wrong y.
        sub = df.select([var1, var2]).drop_nulls()
        x = sub[var1].to_numpy().astype(float)
        y = sub[var2].to_numpy().astype(float)
        diff = x - y
    else:
        x = df[var1].drop_nulls().to_numpy().astype(float)
        diff = x - mu

    stat, p = sp_stats.wilcoxon(diff, alternative=alternative)
    return {
        "test": "Wilcoxon signed-rank",
        "var1": var1,
        "var2": var2,
        "mu": mu,
        "n": len(diff),
        "W_statistic": float(stat),
        "p_value": float(p),
        "alternative": alternative,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ── Kruskal-Wallis ─────────────────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
def kruskal_wallis_test(
    df: pl.DataFrame,
    var: str,
    group: str,
) -> dict:
    """Kruskal-Wallis H test for k independent groups."""
    levels = df[group].drop_nulls().unique().to_list()
    samples = []
    for lvl in levels:
        vals = df.filter(pl.col(group) == lvl)[var].drop_nulls()
        samples.append(vals.to_numpy().astype(float))
    h_stat, p_val = sp_stats.kruskal(*samples)
    return {
        "test": "Kruskal-Wallis H",
        "var": var,
        "group": group,
        "k_groups": len(levels),
        "H_statistic": float(h_stat),
        "df": len(levels) - 1,  # degrees of freedom = k - 1
        "p_value": float(p_val),
        "groups": levels,
        "n_per_group": [len(s) for s in samples],
    }
|
openstat/stats/panel.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Panel data models: Fixed Effects, Random Effects, Between, Hausman test."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import polars as pl
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
|
|
13
|
+
from openstat.stats.models import FitResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _try_import_linearmodels():
|
|
17
|
+
try:
|
|
18
|
+
import linearmodels # noqa: F401
|
|
19
|
+
except ImportError:
|
|
20
|
+
raise ImportError(
|
|
21
|
+
"Panel data models require linearmodels. "
|
|
22
|
+
"Install it with: pip install openstat[panel]"
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _panel_to_fit_result(result, model_type: str, dep: str, indeps: list[str]) -> FitResult:
    """Convert a linearmodels PanelResults to FitResult."""

    def _as_float_dict(series) -> dict:
        # Coerce a pandas Series of statistics into a plain {name: float} dict.
        return {name: float(val) for name, val in series.items()}

    params = _as_float_dict(result.params)
    ci = result.conf_int()

    # f_statistic may be absent or None depending on the estimator.
    f_stat = getattr(result, "f_statistic", None)

    return FitResult(
        model_type=model_type,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(result.nobs),
        params=params,
        std_errors=_as_float_dict(result.std_errors),
        t_values=_as_float_dict(result.tstats),
        p_values=_as_float_dict(result.pvalues),
        conf_int_low={name: float(ci.loc[name, "lower"]) for name in params},
        conf_int_high={name: float(ci.loc[name, "upper"]) for name in params},
        r_squared=float(result.rsquared) if hasattr(result, "rsquared") else None,
        f_statistic=float(f_stat.stat) if f_stat is not None else None,
        f_pvalue=float(f_stat.pval) if f_stat is not None else None,
        warnings=[],
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def fit_panel_fe(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
    robust: bool = False,
    cluster: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Fixed Effects (entity-effects) panel model.

    Parameters
    ----------
    df : pl.DataFrame
        Long-format panel data.
    dep : str
        Dependent variable column.
    indeps : list[str]
        Independent variable columns (a constant is added automatically).
    entity_col, time_col : str
        Columns identifying the panel entity and time dimensions.
    robust : bool
        Use heteroskedasticity-robust standard errors.
    cluster : str | None
        Column to cluster standard errors on.  Previously this name was
        accepted but ignored — errors were always clustered by entity;
        now the named column is actually used (the entity/time columns map
        to linearmodels' dedicated cluster flags).

    Returns
    -------
    (FitResult, linearmodels result object)
    """
    _try_import_linearmodels()
    from linearmodels.panel import PanelOLS

    # Keep the cluster column in the frame so its rows stay aligned with
    # the estimation sample after dropna().
    cols = [entity_col, time_col, dep] + indeps
    if cluster and cluster not in cols:
        cols.append(cluster)
    pdf = df.select(cols).to_pandas().dropna()
    pdf = pdf.set_index([entity_col, time_col])

    import statsmodels.api as sm
    y = pdf[dep]
    X = sm.add_constant(pdf[indeps])

    model = PanelOLS(y, X, entity_effects=True)

    cov_type = "unadjusted"
    cov_kwds: dict = {}
    if cluster:
        cov_type = "clustered"
        if cluster == entity_col:
            cov_kwds["cluster_entity"] = True
        elif cluster == time_col:
            cov_kwds["cluster_time"] = True
        else:
            # Fix: cluster on the requested column (the old code ignored
            # it and always clustered by entity).
            cov_kwds["clusters"] = pdf[cluster]
    elif robust:
        cov_type = "robust"

    result = model.fit(cov_type=cov_type, **cov_kwds)
    fit = _panel_to_fit_result(result, "Panel FE", dep, ["const"] + indeps)
    return fit, result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def fit_panel_re(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
    robust: bool = False,
) -> tuple[FitResult, object]:
    """Fit a Random Effects panel model."""
    _try_import_linearmodels()
    from linearmodels.panel import RandomEffects
    import statsmodels.api as sm

    # Complete cases only, indexed by (entity, time) as linearmodels expects.
    pdf = (
        df.select([entity_col, time_col, dep] + indeps)
        .to_pandas()
        .dropna()
        .set_index([entity_col, time_col])
    )

    model = RandomEffects(pdf[dep], sm.add_constant(pdf[indeps]))
    result = model.fit(cov_type="robust" if robust else "unadjusted")
    return _panel_to_fit_result(result, "Panel RE", dep, ["const"] + indeps), result
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def fit_panel_be(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
) -> tuple[FitResult, object]:
    """Fit a Between Effects panel model."""
    _try_import_linearmodels()
    from linearmodels.panel import BetweenOLS
    import statsmodels.api as sm

    # Complete cases only, indexed by (entity, time) for linearmodels.
    pdf = (
        df.select([entity_col, time_col, dep] + indeps)
        .to_pandas()
        .dropna()
        .set_index([entity_col, time_col])
    )

    result = BetweenOLS(pdf[dep], sm.add_constant(pdf[indeps])).fit()
    return _panel_to_fit_result(result, "Panel BE", dep, ["const"] + indeps), result
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def hausman_test(fe_result, re_result) -> str:
    """Perform the Hausman test for FE vs RE.

    H0: RE is consistent and efficient (prefer RE).
    H1: RE is inconsistent (prefer FE).
    """
    from scipy import stats as sp_stats

    b_fe, b_re = fe_result.params, re_result.params

    # Compare only coefficients present in both fits, excluding the constant.
    common = [k for k in b_fe.index if k != "const" and k in b_re.index]
    if not common:
        return "No common coefficients for Hausman test."

    delta = np.array([b_fe[k] - b_re[k] for k in common])
    v_diff = (
        fe_result.cov.loc[common, common].values
        - re_result.cov.loc[common, common].values
    )

    try:
        chi2_stat = float(delta @ np.linalg.inv(v_diff) @ delta)
    except np.linalg.LinAlgError:
        # Singular covariance difference: fall back to the pseudo-inverse.
        chi2_stat = float(delta @ np.linalg.pinv(v_diff) @ delta)

    dof = len(common)
    p_value = float(1 - sp_stats.chi2.cdf(chi2_stat, dof))
    recommendation = "Use Fixed Effects (FE)" if p_value < 0.05 else "Use Random Effects (RE)"

    return "\n".join(
        [
            "Hausman Test (FE vs RE)",
            "  H0: Random Effects model is consistent",
            f"  chi2({dof}) = {chi2_stat:.4f}",
            f"  p-value = {p_value:.4f}",
            f"  Recommendation: {recommendation}",
        ]
    )
|
openstat/stats/power.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Power analysis — one/two-sample means, proportions, OLS."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
|
|
7
|
+
from scipy import stats as sp_stats
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# ── Helpers ────────────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
def _solve(fn, lo=1, hi=1_000_000, tol=1e-6):
|
|
13
|
+
"""Bisection solver: find x in [lo, hi] where fn(x) ≈ 0."""
|
|
14
|
+
for _ in range(60):
|
|
15
|
+
mid = (lo + hi) / 2
|
|
16
|
+
if fn(mid) < 0:
|
|
17
|
+
lo = mid
|
|
18
|
+
else:
|
|
19
|
+
hi = mid
|
|
20
|
+
if hi - lo < tol:
|
|
21
|
+
break
|
|
22
|
+
return (lo + hi) / 2
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ── One-sample mean ────────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
def power_onemean(
    effect_size: float | None = None,
    alpha: float = 0.05,
    n: int | None = None,
    power: float | None = None,
    sd: float = 1.0,
    delta: float | None = None,
    two_sided: bool = True,
) -> dict:
    """
    Power analysis for one-sample t-test (normal approximation).

    Provide exactly two of: effect_size (or delta/sd), n, power.

    Parameters
    ----------
    effect_size : float | None
        Standardized effect size d; may instead be given as delta/sd.
    alpha : float
        Significance level.
    n : int | None
        Sample size.
    power : float | None
        Target power when solving for n or for the detectable effect size.
    sd, delta : float
        Raw mean difference and standard deviation used to derive
        effect_size when it is not given directly.
    two_sided : bool
        Two-sided vs one-sided test.

    Returns
    -------
    dict
        Resolved effect_size, n, and the *achieved* power at the ceiled
        sample size.  (Previously the solve-for-n branch computed the
        achieved power but returned the requested target instead,
        inconsistent with power_twomeans/power_oneproportion.)

    Raises
    ------
    ValueError
        If the provided combination of arguments is not exactly two of
        effect_size / n / power.
    """
    if delta is not None and effect_size is None:
        effect_size = delta / sd

    sides = 2 if two_sided else 1
    za2 = sp_stats.norm.ppf(1 - alpha / sides)

    def _power_from_n(n_val):
        # Normal-approximation power: Phi(|d| * sqrt(n) - z_crit).
        zb = abs(effect_size) * math.sqrt(n_val) - za2
        return sp_stats.norm.cdf(zb)

    if n is None and power is not None and effect_size is not None:
        # Solve for n, then report the power actually achieved at the
        # integer (ceiled) sample size — not the requested target.
        n = math.ceil(_solve(lambda x: _power_from_n(x) - power))
        power = _power_from_n(n)
    elif power is None and n is not None and effect_size is not None:
        power = _power_from_n(n)
    elif effect_size is None and n is not None and power is not None:
        # Solve for the minimum detectable effect size in closed form.
        zb = sp_stats.norm.ppf(power)
        effect_size = (za2 + zb) / math.sqrt(n)
    else:
        raise ValueError("Provide exactly two of: effect_size, n, power")

    return {
        "test": "One-sample t-test",
        "effect_size": round(effect_size, 6),
        "alpha": alpha,
        "n": n,
        "power": round(power, 6),
        "two_sided": two_sided,
    }
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Two-sample means ───────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
def power_twomeans(
    effect_size: float | None = None,
    alpha: float = 0.05,
    n: int | None = None,
    power: float | None = None,
    ratio: float = 1.0,
    sd: float = 1.0,
    delta: float | None = None,
    two_sided: bool = True,
) -> dict:
    """Power analysis for two-sample independent t-test (normal approximation).

    Provide exactly two of: effect_size (or delta/sd), n, power.
    ``ratio`` is n2 / n1.
    """
    if delta is not None and effect_size is None:
        effect_size = delta / sd

    crit = sp_stats.norm.ppf(1 - alpha / (2 if two_sided else 1))

    def _achieved_power(n1):
        # SE of the standardized difference for group sizes n1 and n1*ratio.
        se = math.sqrt(1 / n1 + 1 / (n1 * ratio))
        return sp_stats.norm.cdf(abs(effect_size) / se - crit)

    have_es = effect_size is not None
    have_n = n is not None
    have_power = power is not None

    if have_es and have_power and not have_n:
        n = math.ceil(_solve(lambda x: _achieved_power(x) - power))
        power = _achieved_power(n)
    elif have_es and have_n and not have_power:
        power = _achieved_power(n)
    elif have_n and have_power and not have_es:
        se = math.sqrt(1 / n + 1 / (n * ratio))
        effect_size = (crit + sp_stats.norm.ppf(power)) * se
    else:
        raise ValueError("Provide exactly two of: effect_size, n, power")

    return {
        "test": "Two-sample t-test",
        "effect_size": round(effect_size, 6),
        "alpha": alpha,
        "n1": n,
        "n2": math.ceil(n * ratio),
        "power": round(power, 6),
        "ratio": ratio,
        "two_sided": two_sided,
    }
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# ── One proportion ─────────────────────────────────────────────────────────
|
|
131
|
+
|
|
132
|
+
def power_oneproportion(
    p0: float,
    pa: float,
    alpha: float = 0.05,
    n: int | None = None,
    power: float | None = None,
    two_sided: bool = True,
) -> dict:
    """Power analysis for one-sample proportion z-test.

    p0 is the null proportion, pa the alternative.  Provide exactly one
    of n (to compute power) or power (to compute the required n).
    """
    crit = sp_stats.norm.ppf(1 - alpha / (2 if two_sided else 1))
    h = abs(pa - p0) / math.sqrt(p0 * (1 - p0))  # standardized effect

    def _achieved_power(n_val):
        # Null-hypothesis SE sets the critical value; alternative SE
        # gives the sampling distribution under pa.
        se0 = math.sqrt(p0 * (1 - p0) / n_val)
        se1 = math.sqrt(pa * (1 - pa) / n_val)
        return sp_stats.norm.cdf((abs(pa - p0) - crit * se0) / se1)

    if n is None and power is not None:
        n = math.ceil(_solve(lambda x: _achieved_power(x) - power))
        power = _achieved_power(n)
    elif power is None and n is not None:
        power = _achieved_power(n)
    else:
        raise ValueError("Provide exactly one of: n, power")

    return {
        "test": "One-sample proportion z-test",
        "p0": p0,
        "pa": pa,
        "effect_size": round(h, 6),
        "alpha": alpha,
        "n": n,
        "power": round(power, 6),
        "two_sided": two_sided,
    }
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ── Two proportions ────────────────────────────────────────────────────────
|
|
173
|
+
|
|
174
|
+
def power_twoproportions(
    p1: float,
    p2: float,
    alpha: float = 0.05,
    n: int | None = None,
    power: float | None = None,
    two_sided: bool = True,
) -> dict:
    """Power analysis for two-sample proportion z-test (equal group sizes).

    Provide exactly one of n (per-group size, to compute power) or power
    (to compute the required per-group n).
    """
    crit = sp_stats.norm.ppf(1 - alpha / (2 if two_sided else 1))
    pbar = (p1 + p2) / 2
    h = abs(p2 - p1) / math.sqrt(pbar * (1 - pbar))  # standardized effect

    def _achieved_power(n_val):
        # Pooled SE under H0 sets the critical value; unpooled SE under H1.
        se0 = math.sqrt(2 * pbar * (1 - pbar) / n_val)
        se1 = math.sqrt((p1 * (1 - p1) + p2 * (1 - p2)) / n_val)
        return sp_stats.norm.cdf((abs(p2 - p1) - crit * se0) / se1)

    if n is None and power is not None:
        n = math.ceil(_solve(lambda x: _achieved_power(x) - power))
        power = _achieved_power(n)
    elif power is None and n is not None:
        power = _achieved_power(n)
    else:
        raise ValueError("Provide exactly one of: n, power")

    return {
        "test": "Two-sample proportion z-test",
        "p1": p1,
        "p2": p2,
        "effect_size": round(h, 6),
        "alpha": alpha,
        "n": n,
        "power": round(power, 6),
        "two_sided": two_sided,
    }
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ── OLS / multiple regression ──────────────────────────────────────────────
|
|
216
|
+
|
|
217
|
+
def power_ols(
|
|
218
|
+
f2: float | None = None,
|
|
219
|
+
alpha: float = 0.05,
|
|
220
|
+
n: int | None = None,
|
|
221
|
+
power: float | None = None,
|
|
222
|
+
k: int = 1,
|
|
223
|
+
) -> dict:
|
|
224
|
+
"""
|
|
225
|
+
Power analysis for OLS / multiple regression (Cohen's f²).
|
|
226
|
+
|
|
227
|
+
f2 = R² / (1 - R²)
|
|
228
|
+
k = number of predictors
|
|
229
|
+
"""
|
|
230
|
+
|
|
231
|
+
def _power_from_n(n_val):
|
|
232
|
+
df1 = k
|
|
233
|
+
df2 = n_val - k - 1
|
|
234
|
+
if df2 <= 0:
|
|
235
|
+
return 0.0
|
|
236
|
+
nc = f2 * n_val
|
|
237
|
+
return 1 - sp_stats.f.cdf(
|
|
238
|
+
sp_stats.f.ppf(1 - alpha, df1, df2), df1, df2, nc
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
if n is None and power is not None and f2 is not None:
|
|
242
|
+
n_val = _solve(lambda x: _power_from_n(x) - power, lo=k + 2)
|
|
243
|
+
n = math.ceil(n_val)
|
|
244
|
+
power = _power_from_n(n)
|
|
245
|
+
elif power is None and n is not None and f2 is not None:
|
|
246
|
+
power = _power_from_n(n)
|
|
247
|
+
elif f2 is None and n is not None and power is not None:
|
|
248
|
+
f2 = _solve(
|
|
249
|
+
lambda f: _power_from_n_f2(n, f, alpha, k) - power, # type: ignore[arg-type]
|
|
250
|
+
lo=0.0001,
|
|
251
|
+
hi=10,
|
|
252
|
+
)
|
|
253
|
+
power = _power_from_n(n)
|
|
254
|
+
else:
|
|
255
|
+
raise ValueError("Provide exactly two of: f2, n, power")
|
|
256
|
+
|
|
257
|
+
return {
|
|
258
|
+
"test": "OLS / Multiple Regression",
|
|
259
|
+
"f2": round(f2, 6),
|
|
260
|
+
"alpha": alpha,
|
|
261
|
+
"n": n,
|
|
262
|
+
"k": k,
|
|
263
|
+
"power": round(power, 6),
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _power_from_n_f2(n, f2, alpha, k):
|
|
268
|
+
df1 = k
|
|
269
|
+
df2 = n - k - 1
|
|
270
|
+
if df2 <= 0:
|
|
271
|
+
return 0.0
|
|
272
|
+
nc = f2 * n
|
|
273
|
+
return 1 - sp_stats.f.cdf(
|
|
274
|
+
sp_stats.f.ppf(1 - alpha, df1, df2), df1, df2, nc
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# ── sampsi (Stata-style) ───────────────────────────────────────────────────
|
|
279
|
+
|
|
280
|
+
def sampsi(
    mu1: float,
    mu2: float,
    sd: float = 1.0,
    alpha: float = 0.05,
    power: float = 0.80,
    two_sided: bool = True,
) -> dict:
    """Compute required sample size for two-sample t-test (Stata sampsi style)."""
    # Standardize the mean difference, then delegate to the two-means solver.
    d = abs(mu2 - mu1) / sd
    return power_twomeans(
        effect_size=d,
        alpha=alpha,
        power=power,
        two_sided=two_sided,
    )
|