openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Meta-analysis commands: meta, forest plot, funnel plot."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from scipy import stats
|
|
10
|
+
|
|
11
|
+
from openstat.commands.base import command
|
|
12
|
+
from openstat.session import Session
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _meta_fe(effects: np.ndarray, variances: np.ndarray) -> dict:
|
|
16
|
+
"""Fixed-effects meta-analysis (inverse-variance weighting)."""
|
|
17
|
+
weights = 1.0 / variances
|
|
18
|
+
pooled = np.sum(weights * effects) / np.sum(weights)
|
|
19
|
+
se = math.sqrt(1.0 / np.sum(weights))
|
|
20
|
+
z = pooled / se
|
|
21
|
+
p = 2 * (1 - stats.norm.cdf(abs(z)))
|
|
22
|
+
ci_low = pooled - 1.96 * se
|
|
23
|
+
ci_high = pooled + 1.96 * se
|
|
24
|
+
|
|
25
|
+
# Cochran's Q
|
|
26
|
+
q = np.sum(weights * (effects - pooled) ** 2)
|
|
27
|
+
df = len(effects) - 1
|
|
28
|
+
q_pval = 1 - stats.chi2.cdf(q, df) if df > 0 else float("nan")
|
|
29
|
+
i2 = max(0.0, (q - df) / q * 100) if q > df else 0.0
|
|
30
|
+
|
|
31
|
+
return {
|
|
32
|
+
"pooled": pooled, "se": se, "z": z, "p": p,
|
|
33
|
+
"ci_low": ci_low, "ci_high": ci_high,
|
|
34
|
+
"Q": q, "Q_df": df, "Q_p": q_pval, "I2": i2,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _meta_re(effects: np.ndarray, variances: np.ndarray) -> dict:
|
|
39
|
+
"""Random-effects meta-analysis (DerSimonian-Laird)."""
|
|
40
|
+
weights_fe = 1.0 / variances
|
|
41
|
+
pooled_fe = np.sum(weights_fe * effects) / np.sum(weights_fe)
|
|
42
|
+
q = np.sum(weights_fe * (effects - pooled_fe) ** 2)
|
|
43
|
+
df = len(effects) - 1
|
|
44
|
+
|
|
45
|
+
# Between-study variance (tau^2) via DerSimonian-Laird
|
|
46
|
+
c = np.sum(weights_fe) - np.sum(weights_fe ** 2) / np.sum(weights_fe)
|
|
47
|
+
tau2 = max(0.0, (q - df) / c)
|
|
48
|
+
|
|
49
|
+
# RE weights
|
|
50
|
+
weights_re = 1.0 / (variances + tau2)
|
|
51
|
+
pooled_re = np.sum(weights_re * effects) / np.sum(weights_re)
|
|
52
|
+
se_re = math.sqrt(1.0 / np.sum(weights_re))
|
|
53
|
+
z = pooled_re / se_re
|
|
54
|
+
p = 2 * (1 - stats.norm.cdf(abs(z)))
|
|
55
|
+
ci_low = pooled_re - 1.96 * se_re
|
|
56
|
+
ci_high = pooled_re + 1.96 * se_re
|
|
57
|
+
|
|
58
|
+
q_pval = 1 - stats.chi2.cdf(q, df) if df > 0 else float("nan")
|
|
59
|
+
i2 = max(0.0, (q - df) / q * 100) if q > df else 0.0
|
|
60
|
+
|
|
61
|
+
return {
|
|
62
|
+
"pooled": pooled_re, "se": se_re, "z": z, "p": p,
|
|
63
|
+
"ci_low": ci_low, "ci_high": ci_high,
|
|
64
|
+
"tau2": tau2, "tau": math.sqrt(tau2),
|
|
65
|
+
"Q": q, "Q_df": df, "Q_p": q_pval, "I2": i2,
|
|
66
|
+
"model": "Random-effects (DerSimonian-Laird)",
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _sig(p: float) -> str:
|
|
71
|
+
if p < 0.001: return "***"
|
|
72
|
+
if p < 0.01: return "**"
|
|
73
|
+
if p < 0.05: return "*"
|
|
74
|
+
return ""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _plot_forest(
    study_labels: list[str],
    effects: np.ndarray,
    ci_low: np.ndarray,
    ci_high: np.ndarray,
    pooled: float,
    pooled_ci_low: float,
    pooled_ci_high: float,
    output_dir: Path,
    title: str = "Forest Plot",
) -> Path:
    """Render a forest plot (per-study CIs plus a pooled diamond) to a file.

    Returns the path of the saved image.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    n_studies = len(study_labels)
    # Scale figure height with the number of studies so labels stay legible.
    fig_height = max(4, n_studies * 0.45 + 2.5)
    fig, ax = plt.subplots(figsize=(10, fig_height))

    # First study at the top: y runs from n_studies down to 1.
    ys = np.arange(n_studies, 0, -1, dtype=float)

    # Per-study point estimates with asymmetric CI whiskers.
    ax.errorbar(
        effects, ys,
        xerr=[effects - ci_low, ci_high - effects],
        fmt="s", color="#4C72B0", ecolor="#4C72B0",
        capsize=3, markersize=5, linewidth=1.2, label="Studies",
    )

    # Pooled estimate drawn as a diamond below the studies (around y=0).
    diamond_xs = [pooled_ci_low, pooled, pooled_ci_high, pooled, pooled_ci_low]
    diamond_ys = [0.0, 0.25, 0.0, -0.25, 0.0]
    ax.fill(diamond_xs, diamond_ys, color="#E66100", zorder=5, label="Pooled")
    ax.plot([pooled, pooled], [-0.3, n_studies + 0.3], color="#E66100",
            linestyle="--", linewidth=1, alpha=0.6)

    # Reference line at zero effect.
    ax.axvline(0, color="gray", linestyle="--", linewidth=1, alpha=0.7)

    ax.set_yticks(ys)
    ax.set_yticklabels(study_labels)
    ax.set_xlabel("Effect Size")
    ax.set_title(title)
    ax.legend(loc="upper right", fontsize=9)
    fig.tight_layout()

    output_dir.mkdir(parents=True, exist_ok=True)
    from openstat.plots.plotter import _unique_path
    out_path = _unique_path(output_dir, "forest_plot")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    return out_path
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _plot_funnel(
    effects: np.ndarray,
    se: np.ndarray,
    pooled: float,
    output_dir: Path,
) -> Path:
    """Render a funnel plot (effect vs. SE with pseudo 95% CI limits) to a file.

    Returns the path of the saved image.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    # Pseudo-confidence funnel around the pooled estimate, widening with SE.
    top_se = se.max()
    se_grid = np.linspace(0, top_se * 1.1, 100)
    lower_bound = pooled - 1.96 * se_grid
    upper_bound = pooled + 1.96 * se_grid

    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(effects, se, alpha=0.7, s=40, color="#4C72B0")
    ax.plot(lower_bound, se_grid, "r--", linewidth=1, label="95% CI")
    ax.plot(upper_bound, se_grid, "r--", linewidth=1)
    ax.axvline(pooled, color="gray", linestyle="--", linewidth=1, label=f"Pooled={pooled:.3f}")
    # Convention: most precise studies (smallest SE) appear at the top.
    ax.invert_yaxis()
    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Standard Error")
    ax.set_title("Funnel Plot")
    ax.legend(fontsize=9)
    fig.tight_layout()

    output_dir.mkdir(parents=True, exist_ok=True)
    from openstat.plots.plotter import _unique_path
    out_path = _unique_path(output_dir, "funnel_plot")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    return out_path
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@command("meta", usage="meta <effect_col> <se_col> [study=<label_col>] [--re|--fe] [--forest] [--funnel]")
|
|
166
|
+
def cmd_meta(session: Session, args: str) -> str:
|
|
167
|
+
"""Meta-analysis: fixed-effects or random-effects pooling with forest/funnel plots.
|
|
168
|
+
|
|
169
|
+
Requires columns for effect sizes and their standard errors.
|
|
170
|
+
Uses DerSimonian-Laird for random-effects (default).
|
|
171
|
+
|
|
172
|
+
Examples:
|
|
173
|
+
meta es se
|
|
174
|
+
meta es se study=author --re
|
|
175
|
+
meta es se study=author --forest --funnel
|
|
176
|
+
meta logOR se_logOR study=trial --fe
|
|
177
|
+
"""
|
|
178
|
+
import re
|
|
179
|
+
import polars as pl
|
|
180
|
+
|
|
181
|
+
df = session.require_data()
|
|
182
|
+
args = args.strip()
|
|
183
|
+
|
|
184
|
+
# Parse options
|
|
185
|
+
fe = "--fe" in args
|
|
186
|
+
re_flag = "--re" in args or not fe # default: random-effects
|
|
187
|
+
forest = "--forest" in args
|
|
188
|
+
funnel = "--funnel" in args
|
|
189
|
+
args_clean = re.sub(r"--\w+", "", args).strip()
|
|
190
|
+
|
|
191
|
+
# study=col
|
|
192
|
+
study_col = None
|
|
193
|
+
m = re.search(r"study[= ](\w+)", args_clean)
|
|
194
|
+
if m:
|
|
195
|
+
study_col = m.group(1)
|
|
196
|
+
args_clean = args_clean[:m.start()] + args_clean[m.end():]
|
|
197
|
+
|
|
198
|
+
tokens = args_clean.split()
|
|
199
|
+
if len(tokens) < 2:
|
|
200
|
+
return (
|
|
201
|
+
"Usage: meta <effect_col> <se_col> [study=<label_col>] [--re|--fe] [--forest] [--funnel]\n"
|
|
202
|
+
"Effect sizes must be pre-computed (e.g., Cohen's d, log OR, standardized mean diff)."
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
eff_col, se_col = tokens[0], tokens[1]
|
|
206
|
+
for c in [eff_col, se_col]:
|
|
207
|
+
if c not in df.columns:
|
|
208
|
+
return f"Column not found: {c}"
|
|
209
|
+
|
|
210
|
+
sub = df.select([c for c in [eff_col, se_col, study_col] if c]).drop_nulls()
|
|
211
|
+
effects = sub[eff_col].to_numpy().astype(float)
|
|
212
|
+
ses = sub[se_col].to_numpy().astype(float)
|
|
213
|
+
variances = ses ** 2
|
|
214
|
+
k = len(effects)
|
|
215
|
+
|
|
216
|
+
if k < 2:
|
|
217
|
+
return "Need at least 2 studies for meta-analysis."
|
|
218
|
+
|
|
219
|
+
labels = (
|
|
220
|
+
[str(v) for v in sub[study_col].to_list()]
|
|
221
|
+
if study_col
|
|
222
|
+
else [f"Study {i+1}" for i in range(k)]
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
# Run analysis
|
|
226
|
+
method = "FE" if fe else "RE"
|
|
227
|
+
res = _meta_fe(effects, variances) if fe else _meta_re(effects, variances)
|
|
228
|
+
|
|
229
|
+
lines = [
|
|
230
|
+
f"Studies (k): {k}",
|
|
231
|
+
f"Method: {'Fixed-effects (IV)' if fe else 'Random-effects (DerSimonian-Laird)'}",
|
|
232
|
+
"",
|
|
233
|
+
"Individual Studies:",
|
|
234
|
+
f" {'Study':<20} {'Effect':>8} {'SE':>7} {'95% CI':>18}",
|
|
235
|
+
" " + "-" * 55,
|
|
236
|
+
]
|
|
237
|
+
ci_low_arr = effects - 1.96 * ses
|
|
238
|
+
ci_high_arr = effects + 1.96 * ses
|
|
239
|
+
for lbl, eff, se_i, lo, hi in zip(labels, effects, ses, ci_low_arr, ci_high_arr):
|
|
240
|
+
lines.append(f" {lbl:<20} {eff:>8.4f} {se_i:>7.4f} [{lo:>7.4f}, {hi:>7.4f}]")
|
|
241
|
+
|
|
242
|
+
lines += [
|
|
243
|
+
"",
|
|
244
|
+
"Pooled Estimate:",
|
|
245
|
+
f" Effect = {res['pooled']:.4f} (95% CI: [{res['ci_low']:.4f}, {res['ci_high']:.4f}])",
|
|
246
|
+
f" SE = {res['se']:.4f} z = {res['z']:.3f} p = {res['p']:.4f} {_sig(res['p'])}",
|
|
247
|
+
"",
|
|
248
|
+
"Heterogeneity:",
|
|
249
|
+
f" Q({res['Q_df']}) = {res['Q']:.3f} p = {res['Q_p']:.4f}",
|
|
250
|
+
f" I² = {res['I2']:.1f}%",
|
|
251
|
+
]
|
|
252
|
+
if not fe:
|
|
253
|
+
lines.append(f" τ² = {res['tau2']:.4f} τ = {res['tau']:.4f}")
|
|
254
|
+
|
|
255
|
+
out = "\n" + "=" * 60 + "\nMeta-Analysis Results\n" + "=" * 60 + "\n"
|
|
256
|
+
out += "\n".join(lines) + "\n" + "=" * 60
|
|
257
|
+
|
|
258
|
+
# Plots
|
|
259
|
+
plot_msgs = []
|
|
260
|
+
if forest:
|
|
261
|
+
try:
|
|
262
|
+
p = _plot_forest(
|
|
263
|
+
labels, effects, ci_low_arr, ci_high_arr,
|
|
264
|
+
res["pooled"], res["ci_low"], res["ci_high"],
|
|
265
|
+
session.output_dir,
|
|
266
|
+
title=f"Forest Plot (pooled={res['pooled']:.3f})",
|
|
267
|
+
)
|
|
268
|
+
session.plot_paths.append(str(p))
|
|
269
|
+
plot_msgs.append(f"Forest plot saved: {p}")
|
|
270
|
+
except Exception as exc:
|
|
271
|
+
plot_msgs.append(f"Forest plot error: {exc}")
|
|
272
|
+
|
|
273
|
+
if funnel:
|
|
274
|
+
try:
|
|
275
|
+
p = _plot_funnel(effects, ses, res["pooled"], session.output_dir)
|
|
276
|
+
session.plot_paths.append(str(p))
|
|
277
|
+
plot_msgs.append(f"Funnel plot saved: {p}")
|
|
278
|
+
except Exception as exc:
|
|
279
|
+
plot_msgs.append(f"Funnel plot error: {exc}")
|
|
280
|
+
|
|
281
|
+
if plot_msgs:
|
|
282
|
+
out += "\n" + "\n".join(plot_msgs)
|
|
283
|
+
|
|
284
|
+
return out
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Multiple imputation commands: mi impute, mi estimate, mi describe."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import io
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from openstat.session import Session, ModelResult
|
|
13
|
+
from openstat.commands.base import command, CommandArgs, rich_to_str, friendly_error
|
|
14
|
+
from openstat.dsl.parser import parse_formula, ParseError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _parse_impute_specs(args: str) -> tuple[list[tuple[str, str]], dict]:
|
|
18
|
+
"""Parse imputation specification.
|
|
19
|
+
|
|
20
|
+
Formats:
|
|
21
|
+
chained (regress) x1 (logit) x2, add(5)
|
|
22
|
+
pmm x1 x2, add(10) knn(5)
|
|
23
|
+
|
|
24
|
+
Returns (specs, options).
|
|
25
|
+
"""
|
|
26
|
+
# Split on comma for options
|
|
27
|
+
if ',' in args:
|
|
28
|
+
spec_part, opt_part = args.split(',', 1)
|
|
29
|
+
else:
|
|
30
|
+
spec_part, opt_part = args, ""
|
|
31
|
+
|
|
32
|
+
# Parse options
|
|
33
|
+
options: dict = {}
|
|
34
|
+
m_add = re.search(r'add\((\d+)\)', opt_part)
|
|
35
|
+
if m_add:
|
|
36
|
+
options["m"] = int(m_add.group(1))
|
|
37
|
+
m_knn = re.search(r'knn\((\d+)\)', opt_part)
|
|
38
|
+
if m_knn:
|
|
39
|
+
options["knn"] = int(m_knn.group(1))
|
|
40
|
+
|
|
41
|
+
spec_part = spec_part.strip()
|
|
42
|
+
specs: list[tuple[str, str]] = []
|
|
43
|
+
|
|
44
|
+
if spec_part.startswith("chained"):
|
|
45
|
+
# Parse (method) var pairs
|
|
46
|
+
spec_str = spec_part[len("chained"):].strip()
|
|
47
|
+
pattern = r'\((\w+)\)\s+(\w+)'
|
|
48
|
+
for match in re.finditer(pattern, spec_str):
|
|
49
|
+
method = match.group(1)
|
|
50
|
+
col = match.group(2)
|
|
51
|
+
specs.append((method, col))
|
|
52
|
+
elif spec_part.startswith("pmm"):
|
|
53
|
+
# PMM for all listed variables
|
|
54
|
+
cols = spec_part[len("pmm"):].strip().split()
|
|
55
|
+
for col in cols:
|
|
56
|
+
specs.append(("pmm", col))
|
|
57
|
+
else:
|
|
58
|
+
# Default: regress for all variables
|
|
59
|
+
cols = spec_part.split()
|
|
60
|
+
for col in cols:
|
|
61
|
+
specs.append(("regress", col))
|
|
62
|
+
|
|
63
|
+
return specs, options
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@command("mi", usage="mi impute|estimate|describe ...")
|
|
67
|
+
def cmd_mi(session: Session, args: str) -> str:
|
|
68
|
+
"""Multiple imputation: impute missing data, estimate models, describe imputations."""
|
|
69
|
+
df = session.require_data()
|
|
70
|
+
|
|
71
|
+
parts = args.strip().split(None, 1)
|
|
72
|
+
subcmd = parts[0].lower() if parts else ""
|
|
73
|
+
rest = parts[1] if len(parts) > 1 else ""
|
|
74
|
+
|
|
75
|
+
if subcmd == "impute":
|
|
76
|
+
return _mi_impute(session, df, rest)
|
|
77
|
+
elif subcmd.startswith("estimate"):
|
|
78
|
+
# Handle "estimate:" or "estimate :"
|
|
79
|
+
model_cmd = rest
|
|
80
|
+
if model_cmd.startswith(":"):
|
|
81
|
+
model_cmd = model_cmd[1:].strip()
|
|
82
|
+
elif ":" in args:
|
|
83
|
+
model_cmd = args.split(":", 1)[1].strip()
|
|
84
|
+
return _mi_estimate(session, model_cmd)
|
|
85
|
+
elif subcmd == "describe":
|
|
86
|
+
return _mi_describe(session)
|
|
87
|
+
else:
|
|
88
|
+
return "Usage: mi impute|estimate|describe\n mi impute chained (regress) x1 (logit) x2, add(5)\n mi estimate: ols y ~ x1 + x2\n mi describe"
|
|
89
|
+
|
|
90
|
+
def _mi_impute(session: Session, df: pl.DataFrame, args: str) -> str:
    """Run MICE imputation.

    Parses the variable/method specification, validates the columns,
    runs `mice_impute`, and stashes the resulting datasets on the
    session for later `mi estimate` / `mi describe` calls.
    Returns a human-readable status message (never raises).
    """
    specs, options = _parse_impute_specs(args)
    if not specs:
        return "No variables specified for imputation."

    m = options.get("m", 5)
    # NOTE(review): knn is parsed from the spec but never forwarded to
    # mice_impute below — confirm whether mice_impute accepts a knn
    # parameter and whether it should be passed through.
    knn = options.get("knn", 5)

    # Every target column must exist before doing any work.
    for method, col in specs:
        if col not in df.columns:
            return f"Column not found: {col}"

    # Check for missing values (both null and NaN)
    has_missing = False
    for _, col in specs:
        if df[col].null_count() > 0:
            has_missing = True
            break
        # Float columns may encode missingness as NaN rather than null.
        if df[col].dtype.is_float() and df[col].is_nan().sum() > 0:
            has_missing = True
            break
    if not has_missing:
        return "No missing values found in specified columns."

    try:
        from openstat.stats.imputation import mice_impute

        datasets = mice_impute(df, specs, m=m)
        # Stash results on the session for 'mi estimate' / 'mi describe'.
        session._imputed_datasets = datasets
        session._mi_m = m

        lines = [f"Created {m} imputed datasets."]
        for method, col in specs:
            # NOTE(review): this count is nulls only; NaN values in float
            # columns (detected above) are not included in the reported
            # number — confirm intended.
            n_missing = df[col].null_count()
            lines.append(f" {col}: {n_missing} missing values imputed ({method})")
        lines.append(f"\nUse 'mi estimate: <model>' to run analysis across imputed datasets.")
        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "mi impute")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _mi_estimate(session: Session, model_cmd: str) -> str:
    """Run model on each imputed dataset and combine with Rubin's rules.

    `model_cmd` is e.g. "ols y ~ x1 + x2"; only ols and logit are
    supported. Returns a rendered coefficient table (string) or an
    error message.
    """
    if not session._imputed_datasets:
        return "No imputed datasets. Run 'mi impute' first."

    model_cmd = model_cmd.strip()
    if not model_cmd:
        return "Usage: mi estimate: ols y ~ x1 + x2"

    # Parse model type and formula: first token is the estimator name,
    # the remainder is the formula.
    parts = model_cmd.split(None, 1)
    model_type = parts[0].lower()
    formula_str = parts[1] if len(parts) > 1 else ""

    try:
        dep, indeps = parse_formula(formula_str)
    except ParseError as e:
        return f"Formula error: {e}"

    # Per-imputation coefficient and SE dicts, to be pooled below.
    estimates: list[dict[str, float]] = []
    std_errors: list[dict[str, float]] = []

    for i, imp_df in enumerate(session._imputed_datasets):
        try:
            if model_type == "ols":
                from openstat.stats.models import fit_ols
                result, _ = fit_ols(imp_df, dep, indeps)
            elif model_type == "logit":
                from openstat.stats.models import fit_logit
                result, _ = fit_logit(imp_df, dep, indeps)
            else:
                return f"Unsupported model for MI: {model_type}. Use ols or logit."

            estimates.append(dict(result.params))
            std_errors.append(dict(result.std_errors))
        except Exception as e:
            # A failure on any single imputation aborts the whole run.
            return f"Error fitting imputation {i + 1}: {e}"

    # Combine with Rubin's rules
    from openstat.stats.imputation import rubins_rules
    n_obs = session._imputed_datasets[0].height
    mi_result = rubins_rules(estimates, std_errors, n_obs)
    mi_result.model_type = f"MI ({mi_result.m} imputations): {model_type.upper()}"
    mi_result.formula = f"{dep} ~ {' + '.join(indeps)}"

    # Display as table (rendered to a plain string via rich_to_str).
    def render(console: Console) -> None:
        table = Table(title=mi_result.model_type)
        table.add_column("Variable", style="cyan")
        table.add_column("Coef", justify="right")
        table.add_column("MI Std.Err", justify="right")
        table.add_column("t", justify="right")
        table.add_column("P>|t|", justify="right")
        table.add_column("[95% CI Low]", justify="right")
        table.add_column("[95% CI High]", justify="right")
        table.add_column("FMI", justify="right")

        for var in mi_result.params:
            # Significance stars appended to the p-value cell.
            sig = ""
            pv = mi_result.p_values[var]
            if pv < 0.001:
                sig = " ***"
            elif pv < 0.01:
                sig = " **"
            elif pv < 0.05:
                sig = " *"

            table.add_row(
                var,
                f"{mi_result.params[var]:.4f}",
                f"{mi_result.std_errors[var]:.4f}",
                f"{mi_result.t_values[var]:.3f}",
                f"{pv:.4f}{sig}",
                f"{mi_result.conf_int_low[var]:.4f}",
                f"{mi_result.conf_int_high[var]:.4f}",
                f"{mi_result.fmi[var]:.3f}",
            )
        console.print(table)

    output = rich_to_str(render)
    output += f"\nN = {mi_result.n_obs} | Imputations = {mi_result.m}"
    return output
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _mi_describe(session: Session) -> str:
|
|
218
|
+
"""Describe imputed datasets."""
|
|
219
|
+
if not session._imputed_datasets:
|
|
220
|
+
return "No imputed datasets. Run 'mi impute' first."
|
|
221
|
+
|
|
222
|
+
lines = [
|
|
223
|
+
f"Multiple Imputation Summary:",
|
|
224
|
+
f" Number of imputations (m): {session._mi_m}",
|
|
225
|
+
f" Rows per dataset: {session._imputed_datasets[0].height}",
|
|
226
|
+
f" Columns: {session._imputed_datasets[0].width}",
|
|
227
|
+
]
|
|
228
|
+
return "\n".join(lines)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Mixed / hierarchical model commands: mixed, estat icc."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.session import Session, ModelResult
|
|
6
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
7
|
+
from openstat.dsl.parser import parse_formula, ParseError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_mixed_formula(raw: str) -> tuple[str, list[str], str, list[str]]:
    """Parse mixed model formula: y ~ x1 || group: x1.

    Returns (dep, fixed_effects, group_var, random_effects).
    Raises ParseError when '||' or the grouping variable is missing.
    """
    if '||' not in raw:
        raise ParseError("Mixed model requires '||' to specify grouping: y ~ x1 || group: [re_vars]")

    # Left of '||' is an ordinary fixed-effects formula.
    fixed_part, random_part = raw.split('||', 1)
    dep, fixed = parse_formula(fixed_part.strip())

    # Right of '||' is "group[: re_var ...]"; no colon means random
    # intercept only.
    group_str, colon, re_str = random_part.strip().partition(':')
    group_var = group_str.strip()
    re_vars = re_str.split() if colon and re_str.strip() else []

    if not group_var:
        raise ParseError("No grouping variable specified after '||'")

    return dep, fixed, group_var, re_vars
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@command("mixed", usage="mixed y ~ x1 || group: [re_vars]")
|
|
37
|
+
def cmd_mixed(session: Session, args: str) -> str:
|
|
38
|
+
"""Fit a mixed/hierarchical linear model with random intercepts and/or slopes."""
|
|
39
|
+
df = session.require_data()
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
dep, fixed, group_var, re_vars = _parse_mixed_formula(args.strip())
|
|
43
|
+
except ParseError as e:
|
|
44
|
+
return f"Formula error: {e}"
|
|
45
|
+
|
|
46
|
+
# Validate columns
|
|
47
|
+
all_cols = [dep] + fixed + [group_var] + re_vars
|
|
48
|
+
missing = [c for c in all_cols if c not in df.columns]
|
|
49
|
+
if missing:
|
|
50
|
+
return f"Columns not found: {', '.join(missing)}"
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
from openstat.stats.mixed import fit_mixed
|
|
54
|
+
|
|
55
|
+
result, raw = fit_mixed(df, dep, fixed, group_var, re_vars or None)
|
|
56
|
+
|
|
57
|
+
session._last_model = raw
|
|
58
|
+
session._last_model_vars = (dep, fixed)
|
|
59
|
+
session._last_fit_result = result
|
|
60
|
+
session._last_fit_kwargs = {"group_var": group_var, "re_vars": re_vars}
|
|
61
|
+
|
|
62
|
+
md = result.to_markdown() if hasattr(result, "to_markdown") else ""
|
|
63
|
+
session.results.append(ModelResult(
|
|
64
|
+
name="Mixed LM", formula=result.formula,
|
|
65
|
+
table=md, details={
|
|
66
|
+
"n_obs": result.n_obs,
|
|
67
|
+
"params": dict(result.params),
|
|
68
|
+
"aic": result.aic,
|
|
69
|
+
"bic": result.bic,
|
|
70
|
+
"log_likelihood": result.log_likelihood,
|
|
71
|
+
},
|
|
72
|
+
))
|
|
73
|
+
|
|
74
|
+
output = result.summary_table()
|
|
75
|
+
if result.warnings:
|
|
76
|
+
output += "\n" + "\n".join(result.warnings)
|
|
77
|
+
return output
|
|
78
|
+
except Exception as e:
|
|
79
|
+
return friendly_error(e, "mixed")
|