openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,130 @@
1
+ """Nonparametric hypothesis tests and rank-based statistics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ from scipy import stats as sp_stats
8
+
9
+
10
+ # ── Spearman rank correlation ──────────────────────────────────────────────
11
+
12
def spearman_corr(df: pl.DataFrame, cols: list[str]) -> dict:
    """Pairwise Spearman rank correlation matrix with p-values.

    Uses pairwise-complete observations: for each pair of columns, rows
    where either value is NaN are dropped before computing the statistic.

    Returns a dict with "rho" (k x k nested list), "pvalues" (same shape,
    zeros on the diagonal), and "cols" (the column order of the matrix).
    """
    X = df.select(cols).to_numpy().astype(float)
    k = X.shape[1]
    rho = np.eye(k)
    pvals = np.zeros((k, k))
    # spearmanr is symmetric in its arguments, so compute each unordered
    # pair once and mirror it (the original computed every pair twice).
    for i in range(k):
        for j in range(i + 1, k):
            mask = ~(np.isnan(X[:, i]) | np.isnan(X[:, j]))
            r, p = sp_stats.spearmanr(X[mask, i], X[mask, j])
            rho[i, j] = rho[j, i] = r
            pvals[i, j] = pvals[j, i] = p
    return {"rho": rho.tolist(), "pvalues": pvals.tolist(), "cols": cols}
28
+
29
+
30
+ # ── Mann-Whitney / Wilcoxon rank-sum ──────────────────────────────────────
31
+
32
def ranksum_test(
    df: pl.DataFrame,
    var: str,
    group: str,
    *,
    alternative: str = "two-sided",
) -> dict:
    """
    Wilcoxon rank-sum test (Mann-Whitney U) for two independent groups.

    alternative: 'two-sided', 'less', 'greater'

    Raises ValueError if `group` does not have exactly two distinct
    non-null values.
    """
    # BUG FIX: polars Series.unique() does not guarantee order, which made
    # the group assignment — and hence the direction of one-sided tests —
    # nondeterministic. Sort the labels so results are reproducible.
    grp_vals = df[group].drop_nulls().unique().sort().to_list()
    if len(grp_vals) != 2:
        raise ValueError(f"'{group}' must have exactly 2 groups, found {len(grp_vals)}")

    g1 = df.filter(pl.col(group) == grp_vals[0])[var].drop_nulls().to_numpy().astype(float)
    g2 = df.filter(pl.col(group) == grp_vals[1])[var].drop_nulls().to_numpy().astype(float)

    stat, p = sp_stats.mannwhitneyu(g1, g2, alternative=alternative)
    n1, n2 = len(g1), len(g2)
    # Normal approximation for the z statistic (no tie or continuity
    # correction); the p-value itself comes from scipy.
    z = (stat - n1 * n2 / 2) / np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
    return {
        "test": "Wilcoxon rank-sum (Mann-Whitney U)",
        "var": var,
        "group": group,
        "groups": grp_vals,
        "n1": n1,
        "n2": n2,
        "U_statistic": float(stat),
        "z_statistic": float(z),
        "p_value": float(p),
        "alternative": alternative,
    }
67
+
68
+
69
+ # ── Wilcoxon signed-rank ───────────────────────────────────────────────────
70
+
71
def signrank_test(
    df: pl.DataFrame,
    var1: str,
    var2: str | None = None,
    *,
    mu: float = 0.0,
    alternative: str = "two-sided",
) -> dict:
    """
    Wilcoxon signed-rank test.

    One-sample: var2=None, tests median of var1 == mu.
    Paired: tests median of (var1 - var2) == 0.
    """
    if var2 is not None:
        # BUG FIX: nulls were previously dropped per-column and the arrays
        # truncated to the shorter length, which silently paired values
        # from different rows. Drop rows where EITHER variable is null so
        # the pairs stay row-aligned.
        pair = df.select([var1, var2]).drop_nulls()
        x = pair[var1].to_numpy().astype(float)
        y = pair[var2].to_numpy().astype(float)
        diff = x - y
    else:
        x = df[var1].drop_nulls().to_numpy().astype(float)
        diff = x - mu

    stat, p = sp_stats.wilcoxon(diff, alternative=alternative)
    return {
        "test": "Wilcoxon signed-rank",
        "var1": var1,
        "var2": var2,
        "mu": mu,
        "n": len(diff),
        "W_statistic": float(stat),
        "p_value": float(p),
        "alternative": alternative,
    }
103
+
104
+
105
+ # ── Kruskal-Wallis ─────────────────────────────────────────────────────────
106
+
107
def kruskal_wallis_test(
    df: pl.DataFrame,
    var: str,
    group: str,
) -> dict:
    """Kruskal-Wallis H test for k independent groups.

    Returns the H statistic, df = k - 1, the p-value, and per-group
    sample sizes (in the sorted order of the group labels).
    """
    # Sort group labels: polars Series.unique() order is not guaranteed,
    # so without this the reported group/n ordering was nondeterministic.
    groups = df[group].drop_nulls().unique().sort().to_list()
    samples = [
        df.filter(pl.col(group) == g)[var].drop_nulls().to_numpy().astype(float)
        for g in groups
    ]
    stat, p = sp_stats.kruskal(*samples)
    df_stat = len(groups) - 1
    return {
        "test": "Kruskal-Wallis H",
        "var": var,
        "group": group,
        "k_groups": len(groups),
        "H_statistic": float(stat),
        "df": df_stat,
        "p_value": float(p),
        "groups": groups,
        "n_per_group": [len(s) for s in samples],
    }
@@ -0,0 +1,179 @@
1
+ """Panel data models: Fixed Effects, Random Effects, Between, Hausman test."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ from dataclasses import dataclass, field
7
+
8
+ import numpy as np
9
+ import polars as pl
10
+ from rich.table import Table
11
+ from rich.console import Console
12
+
13
+ from openstat.stats.models import FitResult
14
+
15
+
16
+ def _try_import_linearmodels():
17
+ try:
18
+ import linearmodels # noqa: F401
19
+ except ImportError:
20
+ raise ImportError(
21
+ "Panel data models require linearmodels. "
22
+ "Install it with: pip install openstat[panel]"
23
+ )
24
+
25
+
26
def _panel_to_fit_result(result, model_type: str, dep: str, indeps: list[str]) -> FitResult:
    """Convert a linearmodels PanelResults object into openstat's FitResult."""

    def _as_float_map(series) -> dict:
        # linearmodels exposes pandas Series; coerce every entry to a
        # plain Python float for the FitResult container.
        return {name: float(val) for name, val in series.items()}

    params = _as_float_map(result.params)
    ci = result.conf_int()
    f_stat = getattr(result, "f_statistic", None)

    return FitResult(
        model_type=model_type,
        formula=f"{dep} ~ {' + '.join(indeps)}",
        dep_var=dep,
        indep_vars=indeps,
        n_obs=int(result.nobs),
        params=params,
        std_errors=_as_float_map(result.std_errors),
        t_values=_as_float_map(result.tstats),
        p_values=_as_float_map(result.pvalues),
        conf_int_low={name: float(ci.loc[name, "lower"]) for name in params},
        conf_int_high={name: float(ci.loc[name, "upper"]) for name in params},
        r_squared=float(result.rsquared) if hasattr(result, "rsquared") else None,
        f_statistic=float(f_stat.stat) if f_stat is not None else None,
        f_pvalue=float(f_stat.pval) if f_stat is not None else None,
        warnings=[],
    )
55
+
56
+
57
def fit_panel_fe(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
    robust: bool = False,
    cluster: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Fixed Effects (within) panel model via linearmodels.PanelOLS.

    robust: heteroskedasticity-robust SEs (ignored when cluster is given).
    cluster: column to cluster standard errors on. The entity/time index
        columns map to linearmodels' cluster_entity / cluster_time flags;
        any other column is passed as an explicit cluster variable.

    Returns (FitResult summary, raw linearmodels results object).
    """
    _try_import_linearmodels()
    from linearmodels.panel import PanelOLS

    cols = [entity_col, time_col, dep] + indeps
    # Keep the clustering column so it survives selection/dropna and can
    # be passed to the covariance estimator below.
    if cluster and cluster not in cols:
        cols.append(cluster)
    pdf = df.select(cols).to_pandas().dropna()
    pdf = pdf.set_index([entity_col, time_col])

    import statsmodels.api as sm

    y = pdf[dep]
    X = sm.add_constant(pdf[indeps])

    model = PanelOLS(y, X, entity_effects=True)

    cov_type = "unadjusted"
    cov_kwds: dict = {}
    if cluster:
        # BUG FIX: the cluster column used to be ignored — any non-None
        # value silently clustered by entity. Honor the requested column.
        cov_type = "clustered"
        if cluster == entity_col:
            cov_kwds["cluster_entity"] = True
        elif cluster == time_col:
            cov_kwds["cluster_time"] = True
        else:
            cov_kwds["clusters"] = pdf[cluster]
    elif robust:
        cov_type = "robust"

    result = model.fit(cov_type=cov_type, **cov_kwds)
    fit = _panel_to_fit_result(result, "Panel FE", dep, ["const"] + indeps)
    return fit, result
90
+
91
+
92
def fit_panel_re(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
    robust: bool = False,
) -> tuple[FitResult, object]:
    """Fit a Random Effects panel model via linearmodels.RandomEffects.

    Rows with any missing value in the used columns are dropped; the data
    is indexed by (entity, time) as linearmodels requires.
    """
    _try_import_linearmodels()
    from linearmodels.panel import RandomEffects

    import statsmodels.api as sm

    panel = (
        df.select([entity_col, time_col, dep] + indeps)
        .to_pandas()
        .dropna()
        .set_index([entity_col, time_col])
    )
    outcome = panel[dep]
    design = sm.add_constant(panel[indeps])

    result = RandomEffects(outcome, design).fit(
        cov_type="robust" if robust else "unadjusted"
    )
    summary = _panel_to_fit_result(result, "Panel RE", dep, ["const"] + indeps)
    return summary, result
116
+
117
+
118
def fit_panel_be(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    entity_col: str,
    time_col: str,
) -> tuple[FitResult, object]:
    """Fit a Between Effects panel model via linearmodels.BetweenOLS.

    Rows with any missing value in the used columns are dropped; the data
    is indexed by (entity, time) as linearmodels requires.
    """
    _try_import_linearmodels()
    from linearmodels.panel import BetweenOLS

    import statsmodels.api as sm

    panel = (
        df.select([entity_col, time_col, dep] + indeps)
        .to_pandas()
        .dropna()
        .set_index([entity_col, time_col])
    )
    outcome = panel[dep]
    design = sm.add_constant(panel[indeps])

    result = BetweenOLS(outcome, design).fit()
    summary = _panel_to_fit_result(result, "Panel BE", dep, ["const"] + indeps)
    return summary, result
140
+
141
+
142
def hausman_test(fe_result, re_result) -> str:
    """Hausman specification test comparing FE and RE estimates.

    H0: RE is consistent and efficient (prefer RE).
    H1: RE is inconsistent (prefer FE).

    Both arguments must expose pandas-style `.params` and `.cov`.
    Returns a formatted multi-line report string.
    """
    b_fe, b_re = fe_result.params, re_result.params

    # Compare only the slope coefficients both models share (exclude const).
    common = [name for name in b_fe.index if name in b_re.index and name != "const"]
    if not common:
        return "No common coefficients for Hausman test."

    delta = np.array([b_fe[name] - b_re[name] for name in common])
    v_diff = (
        fe_result.cov.loc[common, common].values
        - re_result.cov.loc[common, common].values
    )

    # V_fe - V_re can be singular (or indefinite) in finite samples; fall
    # back to the Moore-Penrose pseudo-inverse when inversion fails.
    try:
        chi2_stat = float(delta @ np.linalg.inv(v_diff) @ delta)
    except np.linalg.LinAlgError:
        chi2_stat = float(delta @ np.linalg.pinv(v_diff) @ delta)

    from scipy import stats as sp_stats

    dof = len(common)
    p_value = float(1 - sp_stats.chi2.cdf(chi2_stat, dof))

    recommendation = (
        "Use Fixed Effects (FE)" if p_value < 0.05 else "Use Random Effects (RE)"
    )

    return "\n".join(
        [
            "Hausman Test (FE vs RE)",
            " H0: Random Effects model is consistent",
            f" chi2({dof}) = {chi2_stat:.4f}",
            f" p-value = {p_value:.4f}",
            f" Recommendation: {recommendation}",
        ]
    )
@@ -0,0 +1,295 @@
1
+ """Power analysis — one/two-sample means, proportions, OLS."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+
7
+ from scipy import stats as sp_stats
8
+
9
+
10
+ # ── Helpers ────────────────────────────────────────────────────────────────
11
+
12
+ def _solve(fn, lo=1, hi=1_000_000, tol=1e-6):
13
+ """Bisection solver: find x in [lo, hi] where fn(x) ≈ 0."""
14
+ for _ in range(60):
15
+ mid = (lo + hi) / 2
16
+ if fn(mid) < 0:
17
+ lo = mid
18
+ else:
19
+ hi = mid
20
+ if hi - lo < tol:
21
+ break
22
+ return (lo + hi) / 2
23
+
24
+
25
+ # ── One-sample mean ────────────────────────────────────────────────────────
26
+
27
+ def power_onemean(
28
+ effect_size: float | None = None,
29
+ alpha: float = 0.05,
30
+ n: int | None = None,
31
+ power: float | None = None,
32
+ sd: float = 1.0,
33
+ delta: float | None = None,
34
+ two_sided: bool = True,
35
+ ) -> dict:
36
+ """
37
+ Power analysis for one-sample t-test.
38
+
39
+ Provide exactly two of: effect_size (or delta/sd), n, power.
40
+ """
41
+ if delta is not None and effect_size is None:
42
+ effect_size = delta / sd
43
+
44
+ sides = 2 if two_sided else 1
45
+ za2 = sp_stats.norm.ppf(1 - alpha / sides)
46
+
47
+ def _power_from_n(n_val):
48
+ zb = abs(effect_size) * math.sqrt(n_val) - za2
49
+ return sp_stats.norm.cdf(zb)
50
+
51
+ if n is None and power is not None and effect_size is not None:
52
+ # Solve for n
53
+ n_val = _solve(lambda x: _power_from_n(x) - power)
54
+ n = math.ceil(n_val)
55
+ achieved = _power_from_n(n)
56
+ elif power is None and n is not None and effect_size is not None:
57
+ achieved = _power_from_n(n)
58
+ power = achieved
59
+ elif effect_size is None and n is not None and power is not None:
60
+ # Solve for detectable effect size
61
+ zb = sp_stats.norm.ppf(power)
62
+ effect_size = (za2 + zb) / math.sqrt(n)
63
+ achieved = power
64
+ else:
65
+ raise ValueError("Provide exactly two of: effect_size, n, power")
66
+
67
+ return {
68
+ "test": "One-sample t-test",
69
+ "effect_size": round(effect_size, 6),
70
+ "alpha": alpha,
71
+ "n": n,
72
+ "power": round(power, 6),
73
+ "two_sided": two_sided,
74
+ }
75
+
76
+
77
+ # ── Two-sample means ───────────────────────────────────────────────────────
78
+
79
+ def power_twomeans(
80
+ effect_size: float | None = None,
81
+ alpha: float = 0.05,
82
+ n: int | None = None,
83
+ power: float | None = None,
84
+ ratio: float = 1.0,
85
+ sd: float = 1.0,
86
+ delta: float | None = None,
87
+ two_sided: bool = True,
88
+ ) -> dict:
89
+ """Power analysis for two-sample independent t-test."""
90
+ if delta is not None and effect_size is None:
91
+ effect_size = delta / sd
92
+
93
+ sides = 2 if two_sided else 1
94
+ za2 = sp_stats.norm.ppf(1 - alpha / sides)
95
+
96
+ def _power_from_n(n1):
97
+ n2 = n1 * ratio
98
+ se = math.sqrt(1 / n1 + 1 / n2)
99
+ zb = abs(effect_size) / se - za2
100
+ return sp_stats.norm.cdf(zb)
101
+
102
+ if n is None and power is not None and effect_size is not None:
103
+ n_val = _solve(lambda x: _power_from_n(x) - power)
104
+ n = math.ceil(n_val)
105
+ achieved = _power_from_n(n)
106
+ power = achieved
107
+ elif power is None and n is not None and effect_size is not None:
108
+ power = _power_from_n(n)
109
+ elif effect_size is None and n is not None and power is not None:
110
+ zb = sp_stats.norm.ppf(power)
111
+ n2 = n * ratio
112
+ se = math.sqrt(1 / n + 1 / n2)
113
+ effect_size = (za2 + zb) * se
114
+ achieved = power
115
+ else:
116
+ raise ValueError("Provide exactly two of: effect_size, n, power")
117
+
118
+ return {
119
+ "test": "Two-sample t-test",
120
+ "effect_size": round(effect_size, 6),
121
+ "alpha": alpha,
122
+ "n1": n,
123
+ "n2": math.ceil(n * ratio),
124
+ "power": round(power, 6),
125
+ "ratio": ratio,
126
+ "two_sided": two_sided,
127
+ }
128
+
129
+
130
+ # ── One proportion ─────────────────────────────────────────────────────────
131
+
132
+ def power_oneproportion(
133
+ p0: float,
134
+ pa: float,
135
+ alpha: float = 0.05,
136
+ n: int | None = None,
137
+ power: float | None = None,
138
+ two_sided: bool = True,
139
+ ) -> dict:
140
+ """Power analysis for one-sample proportion z-test."""
141
+ sides = 2 if two_sided else 1
142
+ za2 = sp_stats.norm.ppf(1 - alpha / sides)
143
+ effect_size = abs(pa - p0) / math.sqrt(p0 * (1 - p0))
144
+
145
+ def _power_from_n(n_val):
146
+ se_null = math.sqrt(p0 * (1 - p0) / n_val)
147
+ se_alt = math.sqrt(pa * (1 - pa) / n_val)
148
+ z = (abs(pa - p0) - za2 * se_null) / se_alt
149
+ return sp_stats.norm.cdf(z)
150
+
151
+ if n is None and power is not None:
152
+ n_val = _solve(lambda x: _power_from_n(x) - power)
153
+ n = math.ceil(n_val)
154
+ power = _power_from_n(n)
155
+ elif power is None and n is not None:
156
+ power = _power_from_n(n)
157
+ else:
158
+ raise ValueError("Provide exactly one of: n, power")
159
+
160
+ return {
161
+ "test": "One-sample proportion z-test",
162
+ "p0": p0,
163
+ "pa": pa,
164
+ "effect_size": round(effect_size, 6),
165
+ "alpha": alpha,
166
+ "n": n,
167
+ "power": round(power, 6),
168
+ "two_sided": two_sided,
169
+ }
170
+
171
+
172
+ # ── Two proportions ────────────────────────────────────────────────────────
173
+
174
+ def power_twoproportions(
175
+ p1: float,
176
+ p2: float,
177
+ alpha: float = 0.05,
178
+ n: int | None = None,
179
+ power: float | None = None,
180
+ two_sided: bool = True,
181
+ ) -> dict:
182
+ """Power analysis for two-sample proportion z-test."""
183
+ sides = 2 if two_sided else 1
184
+ za2 = sp_stats.norm.ppf(1 - alpha / sides)
185
+ p_avg = (p1 + p2) / 2
186
+ effect_size = abs(p2 - p1) / math.sqrt(p_avg * (1 - p_avg))
187
+
188
+ def _power_from_n(n_val):
189
+ se_null = math.sqrt(2 * p_avg * (1 - p_avg) / n_val)
190
+ se_alt = math.sqrt((p1 * (1 - p1) + p2 * (1 - p2)) / n_val)
191
+ z = (abs(p2 - p1) - za2 * se_null) / se_alt
192
+ return sp_stats.norm.cdf(z)
193
+
194
+ if n is None and power is not None:
195
+ n_val = _solve(lambda x: _power_from_n(x) - power)
196
+ n = math.ceil(n_val)
197
+ power = _power_from_n(n)
198
+ elif power is None and n is not None:
199
+ power = _power_from_n(n)
200
+ else:
201
+ raise ValueError("Provide exactly one of: n, power")
202
+
203
+ return {
204
+ "test": "Two-sample proportion z-test",
205
+ "p1": p1,
206
+ "p2": p2,
207
+ "effect_size": round(effect_size, 6),
208
+ "alpha": alpha,
209
+ "n": n,
210
+ "power": round(power, 6),
211
+ "two_sided": two_sided,
212
+ }
213
+
214
+
215
+ # ── OLS / multiple regression ──────────────────────────────────────────────
216
+
217
+ def power_ols(
218
+ f2: float | None = None,
219
+ alpha: float = 0.05,
220
+ n: int | None = None,
221
+ power: float | None = None,
222
+ k: int = 1,
223
+ ) -> dict:
224
+ """
225
+ Power analysis for OLS / multiple regression (Cohen's f²).
226
+
227
+ f2 = R² / (1 - R²)
228
+ k = number of predictors
229
+ """
230
+
231
+ def _power_from_n(n_val):
232
+ df1 = k
233
+ df2 = n_val - k - 1
234
+ if df2 <= 0:
235
+ return 0.0
236
+ nc = f2 * n_val
237
+ return 1 - sp_stats.f.cdf(
238
+ sp_stats.f.ppf(1 - alpha, df1, df2), df1, df2, nc
239
+ )
240
+
241
+ if n is None and power is not None and f2 is not None:
242
+ n_val = _solve(lambda x: _power_from_n(x) - power, lo=k + 2)
243
+ n = math.ceil(n_val)
244
+ power = _power_from_n(n)
245
+ elif power is None and n is not None and f2 is not None:
246
+ power = _power_from_n(n)
247
+ elif f2 is None and n is not None and power is not None:
248
+ f2 = _solve(
249
+ lambda f: _power_from_n_f2(n, f, alpha, k) - power, # type: ignore[arg-type]
250
+ lo=0.0001,
251
+ hi=10,
252
+ )
253
+ power = _power_from_n(n)
254
+ else:
255
+ raise ValueError("Provide exactly two of: f2, n, power")
256
+
257
+ return {
258
+ "test": "OLS / Multiple Regression",
259
+ "f2": round(f2, 6),
260
+ "alpha": alpha,
261
+ "n": n,
262
+ "k": k,
263
+ "power": round(power, 6),
264
+ }
265
+
266
+
267
+ def _power_from_n_f2(n, f2, alpha, k):
268
+ df1 = k
269
+ df2 = n - k - 1
270
+ if df2 <= 0:
271
+ return 0.0
272
+ nc = f2 * n
273
+ return 1 - sp_stats.f.cdf(
274
+ sp_stats.f.ppf(1 - alpha, df1, df2), df1, df2, nc
275
+ )
276
+
277
+
278
+ # ── sampsi (Stata-style) ───────────────────────────────────────────────────
279
+
280
def sampsi(
    mu1: float,
    mu2: float,
    sd: float = 1.0,
    alpha: float = 0.05,
    power: float = 0.80,
    two_sided: bool = True,
) -> dict:
    """Stata-style sampsi: required per-group n for a two-sample t-test.

    Standardizes the mean difference by sd and delegates the sample-size
    solve to power_twomeans.
    """
    standardized = abs(mu2 - mu1) / sd
    return power_twomeans(
        effect_size=standardized,
        alpha=alpha,
        power=power,
        two_sided=two_sided,
    )