openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,284 @@
1
+ """Meta-analysis commands: meta, forest plot, funnel plot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ from scipy import stats
10
+
11
+ from openstat.commands.base import command
12
+ from openstat.session import Session
13
+
14
+
15
+ def _meta_fe(effects: np.ndarray, variances: np.ndarray) -> dict:
16
+ """Fixed-effects meta-analysis (inverse-variance weighting)."""
17
+ weights = 1.0 / variances
18
+ pooled = np.sum(weights * effects) / np.sum(weights)
19
+ se = math.sqrt(1.0 / np.sum(weights))
20
+ z = pooled / se
21
+ p = 2 * (1 - stats.norm.cdf(abs(z)))
22
+ ci_low = pooled - 1.96 * se
23
+ ci_high = pooled + 1.96 * se
24
+
25
+ # Cochran's Q
26
+ q = np.sum(weights * (effects - pooled) ** 2)
27
+ df = len(effects) - 1
28
+ q_pval = 1 - stats.chi2.cdf(q, df) if df > 0 else float("nan")
29
+ i2 = max(0.0, (q - df) / q * 100) if q > df else 0.0
30
+
31
+ return {
32
+ "pooled": pooled, "se": se, "z": z, "p": p,
33
+ "ci_low": ci_low, "ci_high": ci_high,
34
+ "Q": q, "Q_df": df, "Q_p": q_pval, "I2": i2,
35
+ }
36
+
37
+
38
+ def _meta_re(effects: np.ndarray, variances: np.ndarray) -> dict:
39
+ """Random-effects meta-analysis (DerSimonian-Laird)."""
40
+ weights_fe = 1.0 / variances
41
+ pooled_fe = np.sum(weights_fe * effects) / np.sum(weights_fe)
42
+ q = np.sum(weights_fe * (effects - pooled_fe) ** 2)
43
+ df = len(effects) - 1
44
+
45
+ # Between-study variance (tau^2) via DerSimonian-Laird
46
+ c = np.sum(weights_fe) - np.sum(weights_fe ** 2) / np.sum(weights_fe)
47
+ tau2 = max(0.0, (q - df) / c)
48
+
49
+ # RE weights
50
+ weights_re = 1.0 / (variances + tau2)
51
+ pooled_re = np.sum(weights_re * effects) / np.sum(weights_re)
52
+ se_re = math.sqrt(1.0 / np.sum(weights_re))
53
+ z = pooled_re / se_re
54
+ p = 2 * (1 - stats.norm.cdf(abs(z)))
55
+ ci_low = pooled_re - 1.96 * se_re
56
+ ci_high = pooled_re + 1.96 * se_re
57
+
58
+ q_pval = 1 - stats.chi2.cdf(q, df) if df > 0 else float("nan")
59
+ i2 = max(0.0, (q - df) / q * 100) if q > df else 0.0
60
+
61
+ return {
62
+ "pooled": pooled_re, "se": se_re, "z": z, "p": p,
63
+ "ci_low": ci_low, "ci_high": ci_high,
64
+ "tau2": tau2, "tau": math.sqrt(tau2),
65
+ "Q": q, "Q_df": df, "Q_p": q_pval, "I2": i2,
66
+ "model": "Random-effects (DerSimonian-Laird)",
67
+ }
68
+
69
+
70
+ def _sig(p: float) -> str:
71
+ if p < 0.001: return "***"
72
+ if p < 0.01: return "**"
73
+ if p < 0.05: return "*"
74
+ return ""
75
+
76
+
77
def _plot_forest(
    study_labels: list[str],
    effects: np.ndarray,
    ci_low: np.ndarray,
    ci_high: np.ndarray,
    pooled: float,
    pooled_ci_low: float,
    pooled_ci_high: float,
    output_dir: Path,
    title: str = "Forest Plot",
) -> Path:
    """Draw a forest plot (per-study CIs plus a pooled diamond) to a file.

    Returns the path of the saved image inside *output_dir*.
    """
    import matplotlib
    matplotlib.use("Agg")  # headless backend: never requires a display
    import matplotlib.pyplot as plt

    n_studies = len(study_labels)
    # Grow the figure with the number of studies so labels stay readable.
    height = max(4, n_studies * 0.45 + 2.5)
    fig, ax = plt.subplots(figsize=(10, height))

    # Studies are drawn top-to-bottom in input order.
    rows = np.arange(n_studies, 0, -1, dtype=float)

    ax.errorbar(
        effects, rows,
        xerr=[effects - ci_low, ci_high - effects],
        fmt="s", color="#4C72B0", ecolor="#4C72B0",
        capsize=3, markersize=5, linewidth=1.2, label="Studies",
    )

    # Pooled estimate as the conventional diamond, placed below the studies.
    ax.fill(
        [pooled_ci_low, pooled, pooled_ci_high, pooled, pooled_ci_low],
        [0.0, 0.25, 0.0, -0.25, 0.0],
        color="#E66100", zorder=5, label="Pooled",
    )
    ax.plot([pooled, pooled], [-0.3, n_studies + 0.3], color="#E66100",
            linestyle="--", linewidth=1, alpha=0.6)

    # Reference line at zero effect.
    ax.axvline(0, color="gray", linestyle="--", linewidth=1, alpha=0.7)

    ax.set_yticks(rows)
    ax.set_yticklabels(study_labels)
    ax.set_xlabel("Effect Size")
    ax.set_title(title)
    ax.legend(loc="upper right", fontsize=9)
    fig.tight_layout()

    output_dir.mkdir(parents=True, exist_ok=True)
    from openstat.plots.plotter import _unique_path
    saved = _unique_path(output_dir, "forest_plot")
    fig.savefig(saved, dpi=150)
    plt.close(fig)
    return saved
128
+
129
+
130
def _plot_funnel(
    effects: np.ndarray,
    se: np.ndarray,
    pooled: float,
    output_dir: Path,
) -> Path:
    """Draw a funnel plot (effect vs. SE with pseudo 95% CI bounds) to a file.

    Returns the path of the saved image inside *output_dir*.
    """
    import matplotlib
    matplotlib.use("Agg")  # headless backend: never requires a display
    import matplotlib.pyplot as plt

    # Pseudo-confidence funnel around the pooled estimate.
    se_grid = np.linspace(0, se.max() * 1.1, 100)
    lower_bound = pooled - 1.96 * se_grid
    upper_bound = pooled + 1.96 * se_grid

    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(effects, se, alpha=0.7, s=40, color="#4C72B0")
    ax.plot(lower_bound, se_grid, "r--", linewidth=1, label="95% CI")
    ax.plot(upper_bound, se_grid, "r--", linewidth=1)
    ax.axvline(pooled, color="gray", linestyle="--", linewidth=1, label=f"Pooled={pooled:.3f}")
    # Conventional orientation: most precise studies (small SE) at the top.
    ax.invert_yaxis()
    ax.set_xlabel("Effect Size")
    ax.set_ylabel("Standard Error")
    ax.set_title("Funnel Plot")
    ax.legend(fontsize=9)
    fig.tight_layout()

    output_dir.mkdir(parents=True, exist_ok=True)
    from openstat.plots.plotter import _unique_path
    saved = _unique_path(output_dir, "funnel_plot")
    fig.savefig(saved, dpi=150)
    plt.close(fig)
    return saved
163
+
164
+
165
+ @command("meta", usage="meta <effect_col> <se_col> [study=<label_col>] [--re|--fe] [--forest] [--funnel]")
166
+ def cmd_meta(session: Session, args: str) -> str:
167
+ """Meta-analysis: fixed-effects or random-effects pooling with forest/funnel plots.
168
+
169
+ Requires columns for effect sizes and their standard errors.
170
+ Uses DerSimonian-Laird for random-effects (default).
171
+
172
+ Examples:
173
+ meta es se
174
+ meta es se study=author --re
175
+ meta es se study=author --forest --funnel
176
+ meta logOR se_logOR study=trial --fe
177
+ """
178
+ import re
179
+ import polars as pl
180
+
181
+ df = session.require_data()
182
+ args = args.strip()
183
+
184
+ # Parse options
185
+ fe = "--fe" in args
186
+ re_flag = "--re" in args or not fe # default: random-effects
187
+ forest = "--forest" in args
188
+ funnel = "--funnel" in args
189
+ args_clean = re.sub(r"--\w+", "", args).strip()
190
+
191
+ # study=col
192
+ study_col = None
193
+ m = re.search(r"study[= ](\w+)", args_clean)
194
+ if m:
195
+ study_col = m.group(1)
196
+ args_clean = args_clean[:m.start()] + args_clean[m.end():]
197
+
198
+ tokens = args_clean.split()
199
+ if len(tokens) < 2:
200
+ return (
201
+ "Usage: meta <effect_col> <se_col> [study=<label_col>] [--re|--fe] [--forest] [--funnel]\n"
202
+ "Effect sizes must be pre-computed (e.g., Cohen's d, log OR, standardized mean diff)."
203
+ )
204
+
205
+ eff_col, se_col = tokens[0], tokens[1]
206
+ for c in [eff_col, se_col]:
207
+ if c not in df.columns:
208
+ return f"Column not found: {c}"
209
+
210
+ sub = df.select([c for c in [eff_col, se_col, study_col] if c]).drop_nulls()
211
+ effects = sub[eff_col].to_numpy().astype(float)
212
+ ses = sub[se_col].to_numpy().astype(float)
213
+ variances = ses ** 2
214
+ k = len(effects)
215
+
216
+ if k < 2:
217
+ return "Need at least 2 studies for meta-analysis."
218
+
219
+ labels = (
220
+ [str(v) for v in sub[study_col].to_list()]
221
+ if study_col
222
+ else [f"Study {i+1}" for i in range(k)]
223
+ )
224
+
225
+ # Run analysis
226
+ method = "FE" if fe else "RE"
227
+ res = _meta_fe(effects, variances) if fe else _meta_re(effects, variances)
228
+
229
+ lines = [
230
+ f"Studies (k): {k}",
231
+ f"Method: {'Fixed-effects (IV)' if fe else 'Random-effects (DerSimonian-Laird)'}",
232
+ "",
233
+ "Individual Studies:",
234
+ f" {'Study':<20} {'Effect':>8} {'SE':>7} {'95% CI':>18}",
235
+ " " + "-" * 55,
236
+ ]
237
+ ci_low_arr = effects - 1.96 * ses
238
+ ci_high_arr = effects + 1.96 * ses
239
+ for lbl, eff, se_i, lo, hi in zip(labels, effects, ses, ci_low_arr, ci_high_arr):
240
+ lines.append(f" {lbl:<20} {eff:>8.4f} {se_i:>7.4f} [{lo:>7.4f}, {hi:>7.4f}]")
241
+
242
+ lines += [
243
+ "",
244
+ "Pooled Estimate:",
245
+ f" Effect = {res['pooled']:.4f} (95% CI: [{res['ci_low']:.4f}, {res['ci_high']:.4f}])",
246
+ f" SE = {res['se']:.4f} z = {res['z']:.3f} p = {res['p']:.4f} {_sig(res['p'])}",
247
+ "",
248
+ "Heterogeneity:",
249
+ f" Q({res['Q_df']}) = {res['Q']:.3f} p = {res['Q_p']:.4f}",
250
+ f" I² = {res['I2']:.1f}%",
251
+ ]
252
+ if not fe:
253
+ lines.append(f" τ² = {res['tau2']:.4f} τ = {res['tau']:.4f}")
254
+
255
+ out = "\n" + "=" * 60 + "\nMeta-Analysis Results\n" + "=" * 60 + "\n"
256
+ out += "\n".join(lines) + "\n" + "=" * 60
257
+
258
+ # Plots
259
+ plot_msgs = []
260
+ if forest:
261
+ try:
262
+ p = _plot_forest(
263
+ labels, effects, ci_low_arr, ci_high_arr,
264
+ res["pooled"], res["ci_low"], res["ci_high"],
265
+ session.output_dir,
266
+ title=f"Forest Plot (pooled={res['pooled']:.3f})",
267
+ )
268
+ session.plot_paths.append(str(p))
269
+ plot_msgs.append(f"Forest plot saved: {p}")
270
+ except Exception as exc:
271
+ plot_msgs.append(f"Forest plot error: {exc}")
272
+
273
+ if funnel:
274
+ try:
275
+ p = _plot_funnel(effects, ses, res["pooled"], session.output_dir)
276
+ session.plot_paths.append(str(p))
277
+ plot_msgs.append(f"Funnel plot saved: {p}")
278
+ except Exception as exc:
279
+ plot_msgs.append(f"Funnel plot error: {exc}")
280
+
281
+ if plot_msgs:
282
+ out += "\n" + "\n".join(plot_msgs)
283
+
284
+ return out
@@ -0,0 +1,228 @@
1
+ """Multiple imputation commands: mi impute, mi estimate, mi describe."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import io
7
+
8
+ import polars as pl
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ from openstat.session import Session, ModelResult
13
+ from openstat.commands.base import command, CommandArgs, rich_to_str, friendly_error
14
+ from openstat.dsl.parser import parse_formula, ParseError
15
+
16
+
17
+ def _parse_impute_specs(args: str) -> tuple[list[tuple[str, str]], dict]:
18
+ """Parse imputation specification.
19
+
20
+ Formats:
21
+ chained (regress) x1 (logit) x2, add(5)
22
+ pmm x1 x2, add(10) knn(5)
23
+
24
+ Returns (specs, options).
25
+ """
26
+ # Split on comma for options
27
+ if ',' in args:
28
+ spec_part, opt_part = args.split(',', 1)
29
+ else:
30
+ spec_part, opt_part = args, ""
31
+
32
+ # Parse options
33
+ options: dict = {}
34
+ m_add = re.search(r'add\((\d+)\)', opt_part)
35
+ if m_add:
36
+ options["m"] = int(m_add.group(1))
37
+ m_knn = re.search(r'knn\((\d+)\)', opt_part)
38
+ if m_knn:
39
+ options["knn"] = int(m_knn.group(1))
40
+
41
+ spec_part = spec_part.strip()
42
+ specs: list[tuple[str, str]] = []
43
+
44
+ if spec_part.startswith("chained"):
45
+ # Parse (method) var pairs
46
+ spec_str = spec_part[len("chained"):].strip()
47
+ pattern = r'\((\w+)\)\s+(\w+)'
48
+ for match in re.finditer(pattern, spec_str):
49
+ method = match.group(1)
50
+ col = match.group(2)
51
+ specs.append((method, col))
52
+ elif spec_part.startswith("pmm"):
53
+ # PMM for all listed variables
54
+ cols = spec_part[len("pmm"):].strip().split()
55
+ for col in cols:
56
+ specs.append(("pmm", col))
57
+ else:
58
+ # Default: regress for all variables
59
+ cols = spec_part.split()
60
+ for col in cols:
61
+ specs.append(("regress", col))
62
+
63
+ return specs, options
64
+
65
+
66
+ @command("mi", usage="mi impute|estimate|describe ...")
67
+ def cmd_mi(session: Session, args: str) -> str:
68
+ """Multiple imputation: impute missing data, estimate models, describe imputations."""
69
+ df = session.require_data()
70
+
71
+ parts = args.strip().split(None, 1)
72
+ subcmd = parts[0].lower() if parts else ""
73
+ rest = parts[1] if len(parts) > 1 else ""
74
+
75
+ if subcmd == "impute":
76
+ return _mi_impute(session, df, rest)
77
+ elif subcmd.startswith("estimate"):
78
+ # Handle "estimate:" or "estimate :"
79
+ model_cmd = rest
80
+ if model_cmd.startswith(":"):
81
+ model_cmd = model_cmd[1:].strip()
82
+ elif ":" in args:
83
+ model_cmd = args.split(":", 1)[1].strip()
84
+ return _mi_estimate(session, model_cmd)
85
+ elif subcmd == "describe":
86
+ return _mi_describe(session)
87
+ else:
88
+ return "Usage: mi impute|estimate|describe\n mi impute chained (regress) x1 (logit) x2, add(5)\n mi estimate: ols y ~ x1 + x2\n mi describe"
89
+
90
def _mi_impute(session: Session, df: pl.DataFrame, args: str) -> str:
    """Run MICE imputation and stash the imputed datasets on the session.

    Returns a human-readable summary string (or an error message).
    """
    specs, options = _parse_impute_specs(args)
    if not specs:
        return "No variables specified for imputation."

    m = options.get("m", 5)
    # NOTE(review): knn(N) is parsed but never forwarded to mice_impute —
    # confirm whether the backend supports it and wire it through if so.
    knn = options.get("knn", 5)

    # Validate columns before doing any work.
    for method, col in specs:
        if col not in df.columns:
            return f"Column not found: {col}"

    def _missing_count(col: str) -> int:
        # Count both nulls and NaNs: float columns can carry either.
        n = df[col].null_count()
        if df[col].dtype.is_float():
            n += int(df[col].is_nan().sum())
        return n

    # Nothing to impute if every target column is fully observed.
    if not any(_missing_count(col) > 0 for _, col in specs):
        return "No missing values found in specified columns."

    try:
        from openstat.stats.imputation import mice_impute

        datasets = mice_impute(df, specs, m=m)
        session._imputed_datasets = datasets
        session._mi_m = m

        lines = [f"Created {m} imputed datasets."]
        for method, col in specs:
            # Report nulls + NaNs consistently with the detection above
            # (the original used null_count() only, under-reporting NaNs).
            n_missing = _missing_count(col)
            lines.append(f" {col}: {n_missing} missing values imputed ({method})")
        lines.append("\nUse 'mi estimate: <model>' to run analysis across imputed datasets.")
        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "mi impute")
131
+
132
+
133
def _mi_estimate(session: Session, model_cmd: str) -> str:
    """Run a model on each imputed dataset and combine with Rubin's rules.

    *model_cmd* is e.g. "ols y ~ x1 + x2"; only ols and logit are supported.
    Returns a rendered coefficient table string, or an error message.
    """
    if not session._imputed_datasets:
        return "No imputed datasets. Run 'mi impute' first."

    model_cmd = model_cmd.strip()
    if not model_cmd:
        return "Usage: mi estimate: ols y ~ x1 + x2"

    # First token selects the model family; the remainder is the formula.
    parts = model_cmd.split(None, 1)
    model_type = parts[0].lower()
    formula_str = parts[1] if len(parts) > 1 else ""

    try:
        dep, indeps = parse_formula(formula_str)
    except ParseError as e:
        return f"Formula error: {e}"

    # Per-imputation coefficient and SE dicts, pooled later by Rubin's rules.
    estimates: list[dict[str, float]] = []
    std_errors: list[dict[str, float]] = []

    for i, imp_df in enumerate(session._imputed_datasets):
        try:
            if model_type == "ols":
                from openstat.stats.models import fit_ols
                result, _ = fit_ols(imp_df, dep, indeps)
            elif model_type == "logit":
                from openstat.stats.models import fit_logit
                result, _ = fit_logit(imp_df, dep, indeps)
            else:
                return f"Unsupported model for MI: {model_type}. Use ols or logit."

            estimates.append(dict(result.params))
            std_errors.append(dict(result.std_errors))
        except Exception as e:
            # Abort on the first failing imputation; pooling a partial set
            # of fits would be statistically invalid.
            return f"Error fitting imputation {i + 1}: {e}"

    # Combine point estimates and variances across imputations (Rubin's rules).
    from openstat.stats.imputation import rubins_rules
    n_obs = session._imputed_datasets[0].height
    mi_result = rubins_rules(estimates, std_errors, n_obs)
    mi_result.model_type = f"MI ({mi_result.m} imputations): {model_type.upper()}"
    mi_result.formula = f"{dep} ~ {' + '.join(indeps)}"

    # Render the pooled coefficients as a rich table, captured to a string.
    def render(console: Console) -> None:
        table = Table(title=mi_result.model_type)
        table.add_column("Variable", style="cyan")
        table.add_column("Coef", justify="right")
        table.add_column("MI Std.Err", justify="right")
        table.add_column("t", justify="right")
        table.add_column("P>|t|", justify="right")
        table.add_column("[95% CI Low]", justify="right")
        table.add_column("[95% CI High]", justify="right")
        table.add_column("FMI", justify="right")

        for var in mi_result.params:
            # Conventional significance stars appended to the p-value cell.
            sig = ""
            pv = mi_result.p_values[var]
            if pv < 0.001:
                sig = " ***"
            elif pv < 0.01:
                sig = " **"
            elif pv < 0.05:
                sig = " *"

            table.add_row(
                var,
                f"{mi_result.params[var]:.4f}",
                f"{mi_result.std_errors[var]:.4f}",
                f"{mi_result.t_values[var]:.3f}",
                f"{pv:.4f}{sig}",
                f"{mi_result.conf_int_low[var]:.4f}",
                f"{mi_result.conf_int_high[var]:.4f}",
                f"{mi_result.fmi[var]:.3f}",
            )
        console.print(table)

    output = rich_to_str(render)
    output += f"\nN = {mi_result.n_obs} | Imputations = {mi_result.m}"
    return output
215
+
216
+
217
+ def _mi_describe(session: Session) -> str:
218
+ """Describe imputed datasets."""
219
+ if not session._imputed_datasets:
220
+ return "No imputed datasets. Run 'mi impute' first."
221
+
222
+ lines = [
223
+ f"Multiple Imputation Summary:",
224
+ f" Number of imputations (m): {session._mi_m}",
225
+ f" Rows per dataset: {session._imputed_datasets[0].height}",
226
+ f" Columns: {session._imputed_datasets[0].width}",
227
+ ]
228
+ return "\n".join(lines)
@@ -0,0 +1,79 @@
1
+ """Mixed / hierarchical model commands: mixed, estat icc."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from openstat.session import Session, ModelResult
6
+ from openstat.commands.base import command, CommandArgs, friendly_error
7
+ from openstat.dsl.parser import parse_formula, ParseError
8
+
9
+
10
def _parse_mixed_formula(raw: str) -> tuple[str, list[str], str, list[str]]:
    """Split a mixed-model formula of the form ``y ~ x1 || group: x1``.

    The part before '||' is an ordinary fixed-effects formula; after it
    comes the grouping variable, optionally followed by ':' and a list of
    random-slope variables.

    Returns (dep, fixed_effects, group_var, random_effects).
    Raises ParseError when '||' or the grouping variable is missing.
    """
    if '||' not in raw:
        raise ParseError("Mixed model requires '||' to specify grouping: y ~ x1 || group: [re_vars]")

    fixed_spec, _, random_spec = raw.partition('||')
    dep, fixed = parse_formula(fixed_spec.strip())

    random_spec = random_spec.strip()
    group_part, sep, slope_part = random_spec.partition(':')
    if sep:
        group_var = group_part.strip()
        re_vars = slope_part.split()
    else:
        group_var = random_spec
        re_vars = []

    if not group_var:
        raise ParseError("No grouping variable specified after '||'")

    return dep, fixed, group_var, re_vars
34
+
35
+
36
+ @command("mixed", usage="mixed y ~ x1 || group: [re_vars]")
37
+ def cmd_mixed(session: Session, args: str) -> str:
38
+ """Fit a mixed/hierarchical linear model with random intercepts and/or slopes."""
39
+ df = session.require_data()
40
+
41
+ try:
42
+ dep, fixed, group_var, re_vars = _parse_mixed_formula(args.strip())
43
+ except ParseError as e:
44
+ return f"Formula error: {e}"
45
+
46
+ # Validate columns
47
+ all_cols = [dep] + fixed + [group_var] + re_vars
48
+ missing = [c for c in all_cols if c not in df.columns]
49
+ if missing:
50
+ return f"Columns not found: {', '.join(missing)}"
51
+
52
+ try:
53
+ from openstat.stats.mixed import fit_mixed
54
+
55
+ result, raw = fit_mixed(df, dep, fixed, group_var, re_vars or None)
56
+
57
+ session._last_model = raw
58
+ session._last_model_vars = (dep, fixed)
59
+ session._last_fit_result = result
60
+ session._last_fit_kwargs = {"group_var": group_var, "re_vars": re_vars}
61
+
62
+ md = result.to_markdown() if hasattr(result, "to_markdown") else ""
63
+ session.results.append(ModelResult(
64
+ name="Mixed LM", formula=result.formula,
65
+ table=md, details={
66
+ "n_obs": result.n_obs,
67
+ "params": dict(result.params),
68
+ "aic": result.aic,
69
+ "bic": result.bic,
70
+ "log_likelihood": result.log_likelihood,
71
+ },
72
+ ))
73
+
74
+ output = result.summary_table()
75
+ if result.warnings:
76
+ output += "\n" + "\n".join(result.warnings)
77
+ return output
78
+ except Exception as e:
79
+ return friendly_error(e, "mixed")