openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Post-hoc comparison commands: posthoc (Tukey HSD, Bonferroni, Scheffé)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import stats
|
|
9
|
+
from statsmodels.stats.multicomp import MultiComparison
|
|
10
|
+
|
|
11
|
+
from openstat.commands.base import command
|
|
12
|
+
from openstat.session import Session
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _fmt_table(title: str, headers: list[str], rows: list[tuple]) -> str:
|
|
16
|
+
all_rows = [tuple(str(v) for v in r) for r in rows]
|
|
17
|
+
widths = [
|
|
18
|
+
max(len(headers[i]), max((len(r[i]) for r in all_rows), default=0))
|
|
19
|
+
for i in range(len(headers))
|
|
20
|
+
]
|
|
21
|
+
sep = "-" * (sum(widths) + 3 * len(widths) + 1)
|
|
22
|
+
header_line = " | ".join(f"{h:<{w}}" for h, w in zip(headers, widths))
|
|
23
|
+
lines = [f"\n{title}", "=" * len(sep), header_line, sep]
|
|
24
|
+
for row in all_rows:
|
|
25
|
+
lines.append(" | ".join(f"{v:<{w}}" for v, w in zip(row, widths)))
|
|
26
|
+
lines.append("=" * len(sep))
|
|
27
|
+
return "\n".join(lines)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@command("posthoc", usage="posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]")
def cmd_posthoc(session: Session, args: str) -> str:
    """Post-hoc pairwise comparisons after ANOVA (Tukey HSD, Bonferroni, Scheffé).

    Parses a Stata-style command line of the form
    ``posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]`` and returns
    a text report: an overall one-way ANOVA line followed by a pairwise
    comparison table for the chosen method (Tukey HSD is the default).

    Examples:
        posthoc score by(region)
        posthoc income by(education) --bonferroni
        posthoc age by(group) --scheffe
    """
    df = session.require_data()

    # Grouping column comes from the Stata-style by(...) clause.
    m = re.search(r"by\((\w+)\)", args)
    if not m:
        return "Usage: posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]"
    group_col = m.group(1)

    # Drop the by(...) clause and any --flags; the first remaining token is
    # the outcome variable.
    rest = re.sub(r"by\([^)]*\)", "", args)
    tokens = [t for t in rest.split() if not t.startswith("--")]
    if not tokens:
        return "Usage: posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]"
    var = tokens[0]

    if var not in df.columns:
        return f"Column not found: {var}"
    if group_col not in df.columns:
        return f"Column not found: {group_col}"

    # Method selection by flag substring; Tukey HSD when no flag is given.
    method = "tukey"
    if "--bonferroni" in args:
        method = "bonferroni"
    elif "--scheffe" in args:
        method = "scheffe"

    # Rows with a null in either column are excluded pairwise-complete.
    sub = df.select([var, group_col]).drop_nulls()
    values = sub[var].to_numpy(allow_copy=True).astype(float)
    groups = sub[group_col].to_list()

    # Group labels are compared as strings so numeric and string group
    # columns are handled uniformly.
    group_labels = sorted(set(str(g) for g in groups))
    groups_str = [str(g) for g in groups]

    if len(group_labels) < 2:
        return "Need at least 2 groups for post-hoc comparison."

    # Run overall ANOVA first
    group_arrays = [values[np.array([g == lbl for g in groups_str])] for lbl in group_labels]
    f_stat, p_overall = stats.f_oneway(*group_arrays)
    anova_line = f"Overall ANOVA: F = {f_stat:.4f}, p = {p_overall:.4f}"

    try:
        if method == "tukey":
            mc = MultiComparison(values, groups_str)
            result = mc.tukeyhsd()
            # summary().data is a list of rows; row 0 is the header row.
            # Unpacking order below assumes statsmodels' column order
            # (group1, group2, meandiff, p-adj, lower, upper, reject) —
            # NOTE(review): verify against the installed statsmodels version.
            summary_data = result.summary().data
            headers = ["Group1", "Group2", "MeanDiff", "Lower", "Upper", "p-adj", "Reject H0"]
            rows = []
            for row in summary_data[1:]:
                g1, g2, meandiff, p_adj, lower, upper, reject = row
                rows.append((
                    str(g1), str(g2),
                    f"{float(meandiff):.4f}",
                    f"{float(lower):.4f}",
                    f"{float(upper):.4f}",
                    f"{float(p_adj):.4f}",
                    "Yes" if reject else "No",
                ))
            return anova_line + _fmt_table("Tukey HSD Post-hoc Comparison", headers, rows)

        elif method == "bonferroni":
            # All pairwise Welch-free t-tests with p-values multiplied by the
            # number of pairs (classic Bonferroni), capped at 1.0.
            n_pairs = len(group_labels) * (len(group_labels) - 1) // 2
            alpha_adj = 0.05 / n_pairs
            rows = []
            for i in range(len(group_labels)):
                for j in range(i + 1, len(group_labels)):
                    a = group_arrays[i]
                    b = group_arrays[j]
                    t_stat, p_raw = stats.ttest_ind(a, b)
                    p_adj = min(p_raw * n_pairs, 1.0)
                    diff = np.mean(a) - np.mean(b)
                    rows.append((
                        group_labels[i], group_labels[j],
                        f"{diff:.4f}",
                        f"{t_stat:.4f}",
                        f"{p_raw:.4f}",
                        f"{p_adj:.4f}",
                        "Yes" if p_adj < 0.05 else "No",
                    ))
            headers = ["Group1", "Group2", "MeanDiff", "t-stat", "p-raw", "p-adj(Bonf)", "Reject H0"]
            note = f"\n (Bonferroni correction: α_adj = 0.05/{n_pairs} = {alpha_adj:.5f})"
            return anova_line + _fmt_table("Bonferroni Post-hoc Comparison", headers, rows) + note

        elif method == "scheffe":
            # Scheffé test: each pairwise F* statistic is compared against
            # (k-1) × F_crit(k-1, N-k) at alpha = 0.05.
            k = len(group_labels)
            n_total = sum(len(d) for d in group_arrays)
            n_per = [len(d) for d in group_arrays]
            means = [np.mean(d) for d in group_arrays]

            # Pooled within-group mean square (MSE) from the one-way ANOVA.
            ss_within = sum(np.sum((d - np.mean(d)) ** 2) for d in group_arrays)
            df_within = n_total - k
            mse = ss_within / df_within
            f_crit = stats.f.ppf(0.95, k - 1, df_within)
            critical = (k - 1) * f_crit

            rows = []
            for i in range(k):
                for j in range(i + 1, k):
                    diff = means[i] - means[j]
                    f_s = diff ** 2 / (mse * (1.0 / n_per[i] + 1.0 / n_per[j]))
                    # p-value of F*/(k-1) under F(k-1, df_within).
                    p_val = 1.0 - stats.f.cdf(f_s / (k - 1), k - 1, df_within)
                    rows.append((
                        group_labels[i], group_labels[j],
                        f"{diff:.4f}",
                        f"{f_s:.4f}",
                        f"{critical:.4f}",
                        f"{p_val:.4f}",
                        "Yes" if f_s > critical else "No",
                    ))
            headers = ["Group1", "Group2", "MeanDiff", "F*", "F-critical", "p-value", "Reject H0"]
            note = f"\n (Scheffé critical value = (k-1)×F_crit = {k-1}×{f_crit:.4f} = {critical:.4f})"
            return anova_line + _fmt_table("Scheffé Post-hoc Comparison", headers, rows) + note

    except Exception as exc:
        # Command-layer convention: report failures as a message string.
        return f"posthoc error: {exc}"

    return "Unknown method."
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Power analysis commands: power, sampsi."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
from openstat.stats.power import (
|
|
10
|
+
power_onemean,
|
|
11
|
+
power_twomeans,
|
|
12
|
+
power_oneproportion,
|
|
13
|
+
power_twoproportions,
|
|
14
|
+
power_ols,
|
|
15
|
+
sampsi as _sampsi,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── Stata-style argument parser ────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
def _stata_parse(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
22
|
+
"""Parse Stata-style args: positional tokens and key(value) options.
|
|
23
|
+
|
|
24
|
+
Handles:
|
|
25
|
+
- positional tokens (bare words, numbers)
|
|
26
|
+
- key(value) options e.g. n(50), alpha(0.05)
|
|
27
|
+
- key=value options e.g. n=50
|
|
28
|
+
- --flag / bare flag e.g. --onesided, onesided
|
|
29
|
+
- commas are ignored (Stata separator)
|
|
30
|
+
"""
|
|
31
|
+
opts: dict[str, str] = {}
|
|
32
|
+
positional: list[str] = []
|
|
33
|
+
flags: set[str] = set()
|
|
34
|
+
|
|
35
|
+
# key(value)
|
|
36
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
37
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
38
|
+
|
|
39
|
+
# Remove key(value) tokens from raw for further parsing
|
|
40
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
41
|
+
|
|
42
|
+
for tok in rest.split():
|
|
43
|
+
tok = tok.strip(',')
|
|
44
|
+
if not tok:
|
|
45
|
+
continue
|
|
46
|
+
if '=' in tok:
|
|
47
|
+
k, v = tok.split('=', 1)
|
|
48
|
+
opts[k.lower().lstrip('-')] = v
|
|
49
|
+
elif tok.startswith('--'):
|
|
50
|
+
flags.add(tok.lstrip('-').lower())
|
|
51
|
+
elif re.match(r'^-?(\d+\.?\d*|\.\d+)$', tok):
|
|
52
|
+
positional.append(tok)
|
|
53
|
+
elif re.match(r'^\w+$', tok):
|
|
54
|
+
# Could be a sub-command, flag, or positional
|
|
55
|
+
positional.append(tok)
|
|
56
|
+
|
|
57
|
+
return positional, opts, flags
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _fmt_row(label: str, value) -> str:
|
|
61
|
+
return f" {label:<30} {value}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _power_table(result: dict) -> str:
    """Render a power-analysis result dict as an aligned text table.

    The 'test' entry supplies the title; float values are formatted to
    four decimals, everything else is shown as-is.
    """
    rule = "-" * 50
    body = [
        _fmt_row(key, f"{val:.4f}" if isinstance(val, float) else val)
        for key, val in result.items()
        if key != "test"
    ]
    return "\n".join([f"\n{result['test']}", rule, *body, rule])
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ── Command ────────────────────────────────────────────────────────────────
|
|
79
|
+
|
|
80
|
+
@command("power", usage="power onemean|twomeans|oneprop|twoprop|ols [options]")
def cmd_power(session: Session, args: str) -> str:
    """Power analysis for common statistical tests.

    Dispatches to the openstat.stats.power solvers based on the first
    positional token; remaining key(value) options are forwarded with
    sensible defaults. Returns a formatted table or an error message.
    """
    positional, opts, flags = _stata_parse(args)

    if not positional:
        return (
            "Usage: power <subcommand> [options]\n"
            "Subcommands: onemean, twomeans, oneprop, twoprop, ols\n\n"
            "Examples:\n"
            " power onemean, n(50) delta(0.5) sd(1)\n"
            " power twomeans, n(80) delta(0.5) sd(1)\n"
            " power oneprop, n(100) p0(0.5) pa(0.65)\n"
            " power twoprop, p1(0.3) p2(0.5) power(0.80)\n"
            " power ols, n(100) f2(0.15) k(3)"
        )

    sub = positional[0].lower()

    # Shared options; typically one of n/power is omitted and solved for.
    alpha = float(opts.get("alpha", 0.05))
    n = int(opts["n"]) if "n" in opts else None
    pwr = float(opts["power"]) if "power" in opts else None
    two_sided = "onesided" not in flags

    def _opt_float(key: str):
        # Optional float option: None when the user did not supply it.
        return float(opts[key]) if key in opts else None

    try:
        if sub in ("onemean", "one_mean"):
            delta = float(opts.get("delta", 0.5))
            sd = float(opts.get("sd", 1.0))
            es = _opt_float("es")
            result = power_onemean(
                effect_size=es, alpha=alpha, n=n, power=pwr,
                sd=sd, delta=delta, two_sided=two_sided,
            )

        elif sub in ("twomeans", "two_means"):
            delta = float(opts.get("delta", 0.5))
            sd = float(opts.get("sd", 1.0))
            ratio = float(opts.get("ratio", 1.0))
            # n1(...) overrides the shared n(...) for the first group.
            n1 = int(opts["n1"]) if "n1" in opts else n
            es = _opt_float("es")
            result = power_twomeans(
                effect_size=es, alpha=alpha, n=n1, power=pwr,
                ratio=ratio, sd=sd, delta=delta, two_sided=two_sided,
            )

        elif sub in ("oneprop", "one_prop", "onepropo"):
            result = power_oneproportion(
                p0=float(opts.get("p0", 0.5)),
                pa=float(opts.get("pa", 0.6)),
                alpha=alpha, n=n, power=pwr, two_sided=two_sided,
            )

        elif sub in ("twoprop", "two_prop", "twopropo"):
            result = power_twoproportions(
                p1=float(opts.get("p1", 0.3)),
                p2=float(opts.get("p2", 0.5)),
                alpha=alpha, n=n, power=pwr, two_sided=two_sided,
            )

        elif sub == "ols":
            result = power_ols(
                f2=_opt_float("f2"), alpha=alpha, n=n, power=pwr,
                k=int(opts.get("k", 1)),
            )

        else:
            return f"Unknown power subcommand: {sub}"

    except (ValueError, TypeError) as exc:
        return f"Power analysis error: {exc}"

    return _power_table(result)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@command("sampsi", usage="sampsi mu1 mu2 [, sd(1) alpha(0.05) power(0.80)]")
def cmd_sampsi(session: Session, args: str) -> str:
    """Compute required sample size (Stata-style sampsi).

    The first two numeric positional tokens are the group means mu1, mu2;
    sd/alpha/power come from key(value) options with Stata-like defaults.
    """
    positional, opts, _flags = _stata_parse(args)

    # Keep only numeric positional tokens (the two group means).
    numeric = re.compile(r'^-?(\d+\.?\d*|\.\d+)$')
    means = [tok for tok in positional if numeric.match(tok)]
    if len(means) < 2:
        return "Usage: sampsi mu1 mu2 [, sd(1) alpha(0.05) power(0.80)]"

    try:
        result = _sampsi(
            mu1=float(means[0]),
            mu2=float(means[1]),
            sd=float(opts.get("sd", 1.0)),
            alpha=float(opts.get("alpha", 0.05)),
            power=float(opts.get("power", 0.80)),
        )
    except (ValueError, TypeError) as exc:
        return f"sampsi error: {exc}"

    return _power_table(result)
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""Data profile and data dictionary commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@command("profile", usage="profile [col1 col2 ...] [--out=report.html]")
def cmd_profile(session: Session, args: str) -> str:
    """Generate a comprehensive data profile report.

    Shows for each column: type, missing count/%, unique values,
    min/max/mean/std/median/mode, top values, distribution shape.

    Options:
        --out=<path>   save as HTML report (default: outputs/profile.html)
        --cols=<list>  comma-separated column subset

    Examples:
        profile
        profile income age education
        profile --out=data_profile.html
    """
    import polars as pl

    ca = CommandArgs(args)
    try:
        df = session.require_data()
    except RuntimeError as e:
        return str(e)

    # Column subset: --cols takes priority, then bare positional names,
    # otherwise profile every column.
    cols_opt = ca.options.get("cols")
    if cols_opt:
        cols = [c.strip() for c in cols_opt.split(",")]
    elif ca.positional:
        cols = ca.positional
    else:
        cols = df.columns

    missing_cols = [c for c in cols if c not in df.columns]
    if missing_cols:
        return f"Columns not found: {', '.join(missing_cols)}"

    # Polars dtypes treated as numeric for the stats branch below.
    NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
               pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)

    lines = [
        f"Data Profile: {session.dataset_name or 'dataset'}",
        f"Shape: {df.height:,} rows × {df.width} columns | Showing {len(cols)} columns",
        "=" * 72,
    ]

    for col in cols:
        series = df[col]
        n_miss = series.null_count()
        # Guard against division by zero on an empty dataframe.
        miss_pct = 100 * n_miss / df.height if df.height else 0
        n_uniq = series.drop_nulls().n_unique()
        dtype = str(series.dtype)

        lines.append(f"\n {col} [{dtype}]")
        lines.append(f" Missing: {n_miss:,} ({miss_pct:.1f}%)")
        lines.append(f" Unique: {n_uniq:,}")

        if series.dtype in NUMERIC:
            s = series.drop_nulls()
            if s.len() > 0:
                lines.append(f" Mean: {s.mean():.4f}")
                # Std is undefined for a single observation.
                lines.append(f" Std: {s.std():.4f}" if s.len() > 1 else " Std: —")
                lines.append(f" Min: {s.min():.4f}")
                lines.append(f" Median: {s.median():.4f}")
                lines.append(f" Max: {s.max():.4f}")
                # Skewness / kurtosis
                try:
                    import numpy as np
                    arr = s.to_numpy()
                    from scipy.stats import skew, kurtosis
                    lines.append(f" Skewness: {skew(arr):.3f}")
                    lines.append(f" Kurtosis: {kurtosis(arr):.3f}")
                except Exception:
                    # scipy is optional here; skip shape stats if unavailable.
                    pass
                # Zeros / negatives
                n_zero = int((s == 0).sum())
                n_neg = int((s < 0).sum())
                if n_zero or n_neg:
                    lines.append(f" Zeros: {n_zero:,} Negative: {n_neg:,}")
        else:
            # Categorical / string
            s = series.drop_nulls().cast(pl.Utf8)
            top = s.value_counts().sort("count", descending=True).head(5)
            if top.height > 0:
                top_vals = ", ".join(
                    f"{row[0]}({row[1]})" for row in top.iter_rows()
                )
                lines.append(f" Top 5: {top_vals}")

    lines.append("\n" + "=" * 72)

    # HTML output
    out_path = ca.options.get("out")
    if out_path:
        try:
            _save_profile_html(lines, out_path, session)
            lines.append(f"\nHTML report saved: {out_path}")
        except Exception as exc:
            # Best-effort: report the failure but still return the text profile.
            lines.append(f"\nHTML save failed: {exc}")

    return "\n".join(lines)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _save_profile_html(lines: list[str], path: str, session: Session) -> None:
    """Save a simple HTML version of the profile.

    Writes a per-column summary table (type, missing, unique, basic stats)
    followed by the plain-text profile in a <pre> block. Parent directories
    are created as needed.

    Args:
        lines: The plain-text profile lines produced by cmd_profile.
        path: Output file path for the HTML report.
        session: Current session; session.df (may be None) supplies the table.

    Raises:
        OSError: If the file cannot be written.
    """
    import polars as pl
    from pathlib import Path

    df = session.df
    # Polars dtypes treated as numeric for the stats column.
    NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
               pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)

    rows_html = ""
    if df is not None:
        for col in df.columns:
            series = df[col]
            n_miss = series.null_count()
            miss_pct = f"{100*n_miss/df.height:.1f}%" if df.height else "—"
            n_uniq = series.drop_nulls().n_unique()
            dtype = str(series.dtype)
            if series.dtype in NUMERIC:
                s = series.drop_nulls()
                stats = f"mean={s.mean():.3f}, std={s.std():.3f}" if s.len() > 0 else "—"
            else:
                stats = f"{n_uniq} unique values"
            rows_html += (
                f"<tr><td>{col}</td><td>{dtype}</td>"
                f"<td>{n_miss} ({miss_pct})</td>"
                f"<td>{n_uniq}</td><td>{stats}</td></tr>\n"
            )

    # BUG FIX: the original embedded {"chr(10)".join(lines)} inside the
    # f-string, which joins the profile lines with the literal text
    # "chr(10)" instead of newlines. Join with "\n" outside the f-string.
    profile_text = "\n".join(lines)

    html = f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<title>OpenStat Data Profile</title>
<style>
body {{font-family: sans-serif; margin: 2em; background: #f9f9f9;}}
h1 {{color: #333;}} table {{border-collapse: collapse; width: 100%;}}
th {{background: #4C72B0; color: white; padding: 8px;}}
td {{border: 1px solid #ddd; padding: 6px;}}
tr:nth-child(even) {{background: #f0f4ff;}}
</style></head><body>
<h1>Data Profile: {session.dataset_name or "dataset"}</h1>
<p>Shape: {session.shape_str}</p>
<table>
<tr><th>Column</th><th>Type</th><th>Missing</th><th>Unique</th><th>Stats</th></tr>
{rows_html}
</table>
<pre>{profile_text}</pre>
</body></html>"""

    Path(path).parent.mkdir(parents=True, exist_ok=True)
    Path(path).write_text(html, encoding="utf-8")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@command("datadict", usage="datadict [--out=dict.xlsx|dict.md]")
def cmd_datadict(session: Session, args: str) -> str:
    """Generate a data dictionary for the current dataset.

    Creates a table with: variable name, type, missing%, unique count,
    min/max/mean for numeric, top values for categorical.

    Options:
        --out=<path>  save to Excel (.xlsx) or Markdown (.md)
                      Default: outputs/data_dictionary.md

    Examples:
        datadict
        datadict --out=dictionary.xlsx
        datadict --out=docs/variables.md
    """
    import polars as pl
    from pathlib import Path

    ca = CommandArgs(args)
    out_path = ca.options.get("out", "outputs/data_dictionary.md")

    try:
        df = session.require_data()
    except RuntimeError as e:
        return str(e)

    # Polars dtypes treated as numeric for the stats branch below.
    NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
               pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)

    records = []
    for col in df.columns:
        series = df[col]
        n_miss = series.null_count()
        # NOTE(review): divides by df.height with no guard — an empty
        # dataframe would raise ZeroDivisionError here; confirm require_data
        # rejects empty datasets.
        miss_pct = f"{100*n_miss/df.height:.1f}%"
        n_uniq = series.drop_nulls().n_unique()
        dtype = str(series.dtype)

        if series.dtype in NUMERIC:
            s = series.drop_nulls()
            if s.len() > 0:
                extra = f"mean={s.mean():.3f}; range=[{s.min():.3f},{s.max():.3f}]"
            else:
                extra = "all missing"
        else:
            # Three most frequent values for categorical/string columns.
            top = series.drop_nulls().cast(pl.Utf8).value_counts().sort("count", descending=True).head(3)
            top_vals = "; ".join(str(r[0]) for r in top.iter_rows())
            extra = f"top: {top_vals}"

        records.append({
            "Variable": col,
            "Type": dtype,
            "Missing": f"{n_miss} ({miss_pct})",
            "Unique": str(n_uniq),
            "Notes": extra,
            "Description": "",  # user fills in
        })

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    if out_path.endswith(".xlsx"):
        try:
            dict_df = pl.DataFrame(records)
            # write_excel needs the optional xlsxwriter dependency.
            dict_df.write_excel(out_path)
            return f"Data dictionary saved: {out_path} ({len(records)} variables)"
        except ImportError:
            return "xlsxwriter required for Excel output. Try --out=dict.md"

    else:  # Markdown
        lines = [
            f"# Data Dictionary: {session.dataset_name or 'dataset'}",
            f"",
            f"Shape: {df.height:,} rows × {df.width} columns",
            f"",
            "| Variable | Type | Missing | Unique | Notes | Description |",
            "|---|---|---|---|---|---|",
        ]
        for r in records:
            lines.append(
                f"| {r['Variable']} | {r['Type']} | {r['Missing']} | "
                f"{r['Unique']} | {r['Notes']} | {r['Description']} |"
            )
        Path(out_path).write_text("\n".join(lines), encoding="utf-8")
        return f"Data dictionary saved: {out_path} ({len(records)} variables)"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""R bridge command: run R code from OpenStat (requires rpy2)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@command("r", usage='r "<R code>"')
def cmd_r(session: Session, args: str) -> str:
    """Execute R code in the current session (requires rpy2).

    The current dataset is available in R as 'data'.
    Results printed in R are captured and returned.
    Modified 'data' is pulled back into the OpenStat session.

    Examples:
        r "summary(data)"
        r "cor(data[, sapply(data, is.numeric)])"
        r "data$log_income <- log(data$income + 1)"
        r "lm_result <- lm(y ~ x1 + x2, data=data); summary(lm_result)"
    """
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.conversion import localconverter
        import rpy2.rinterface_lib.callbacks as rcb
    except ImportError:
        return (
            "rpy2 is required for the R bridge.\n"
            "Install: pip install rpy2\n"
            "Also requires a working R installation."
        )

    # Strip surrounding quotes the shell/REPL may have left on the code.
    code = args.strip().strip('"\'')
    if not code:
        return 'Usage: r "<R code>"'

    # NOTE(review): _io is never used below — candidate for removal.
    import io as _io
    output_lines: list[str] = []

    # Capture R output
    def _capture(x):
        output_lines.append(x)

    # Temporarily redirect R's console printing into our buffer; the
    # original callback is restored in the finally block below.
    old_write = rcb.consolewrite_print
    rcb.consolewrite_print = _capture

    try:
        with localconverter(ro.default_converter + pandas2ri.converter):
            # Push current dataframe into R as 'data'
            if session.df is not None:
                try:
                    r_df = ro.conversion.py2rpy(session.df.to_pandas())
                    ro.globalenv["data"] = r_df
                except Exception:
                    pass  # non-critical

            # Execute R code
            ro.r(code)

            # Pull 'data' back if it was modified
            try:
                # NOTE(review): assumes the rpy2 Environment exposes a
                # dict-like .get — verify against the installed rpy2 version;
                # any failure is deliberately swallowed below.
                r_data = ro.globalenv.get("data")
                if r_data is not None:
                    import polars as pl
                    pd_df = ro.conversion.rpy2py(r_data)
                    new_df = pl.from_pandas(pd_df)
                    # Only snapshot/replace when R actually changed the data.
                    if session.df is None or not new_df.equals(session.df):
                        session.snapshot()
                        session.df = new_df
            except Exception:
                pass

    except Exception as e:
        return friendly_error(e, "R bridge")
    finally:
        # Always restore R's original console writer.
        rcb.consolewrite_print = old_write

    result = "".join(output_lines).strip()
    return result or "[R code executed — no output]"
|