openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,312 @@
1
+ """Advanced visualization: 3D, interactive, animated, missing, map."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from openstat.commands.base import command, CommandArgs, friendly_error
6
+ from openstat.session import Session
7
+
8
+
9
@command("plot3d", usage="plot3d <x> <y> <z> [--color=<col>]")
def cmd_plot3d(session: Session, args: str) -> str:
    """3D scatter plot.

    Rows with a null in any plotted column are dropped. With --color, one
    series is drawn per distinct value of that column.

    Examples:
      plot3d x y z
      plot3d age income educ --color=gender
    """
    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend: the figure is only written to disk
    import matplotlib.pyplot as plt
    # Kept from the original; presumably imported for its side effect of
    # registering the "3d" projection on older Matplotlib releases.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    import numpy as np
    import polars as pl

    ca = CommandArgs(args)
    if len(ca.positional) < 3:
        return "Usage: plot3d <x> <y> <z> [--color=<col>]"
    x_col, y_col, z_col = ca.positional[:3]
    color_col = ca.options.get("color")

    fig = None
    try:
        df = session.require_data()
        for c in (x_col, y_col, z_col):
            if c not in df.columns:
                return f"Column not found: {c}"

        wanted = [x_col, y_col, z_col]
        if color_col and color_col in df.columns:
            wanted.append(color_col)
        # De-duplicate while keeping order: selecting the same column name
        # twice (e.g. --color equal to one of the axes) is rejected by polars.
        cols = list(dict.fromkeys(wanted))
        sub = df.select(cols).drop_nulls()

        x = sub[x_col].to_numpy().astype(float)
        y = sub[y_col].to_numpy().astype(float)
        z = sub[z_col].to_numpy().astype(float)

        fig = plt.figure(figsize=(9, 7))
        ax = fig.add_subplot(111, projection="3d")

        if color_col and color_col in sub.columns:
            # Categories are compared as strings so any dtype can be used.
            cats = np.asarray(sub[color_col].cast(pl.Utf8).to_list(), dtype=object)
            unique_cats = sorted(set(cats.tolist()))
            cmap = plt.colormaps.get_cmap("tab10")
            n_cats = max(len(unique_cats), 1)
            for i, cat in enumerate(unique_cats):
                mask = cats == cat
                ax.scatter(x[mask], y[mask], z[mask],
                           label=str(cat), alpha=0.7, color=cmap(i / n_cats))
            ax.legend(title=color_col)
        else:
            ax.scatter(x, y, z, alpha=0.6, color="#4C72B0")

        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        ax.set_zlabel(z_col)
        ax.set_title(f"3D Scatter: {x_col} × {y_col} × {z_col}")
        fig.tight_layout()

        session.output_dir.mkdir(parents=True, exist_ok=True)
        path = session.output_dir / "scatter3d.png"
        fig.savefig(path, dpi=150)
        session.plot_paths.append(str(path))
        return f"3D scatter plot saved: {path}"
    except Exception as e:
        return friendly_error(e, "plot3d")
    finally:
        # Always release the figure, including on the error path; otherwise
        # pyplot keeps it registered for the life of the process.
        if fig is not None:
            plt.close(fig)
75
+
76
+
77
@command("plotmissing", usage="plotmissing [cols...]")
def cmd_plotmissing(session: Session, args: str) -> str:
    """Missing data heatmap: visualise missingness patterns across columns.

    Shows rows (sampled if large) × columns with missing/present colour coding.
    The percentages and the summary table are computed on the full dataset,
    not on the sample used for the picture.

    Examples:
      plotmissing
      plotmissing income age education
    """
    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend: the figure is only written to disk
    import matplotlib.pyplot as plt
    import numpy as np

    ca = CommandArgs(args)
    fig = None
    try:
        df = session.require_data()
        cols = ca.positional if ca.positional else df.columns

        missing_cols = [c for c in cols if c not in df.columns]
        if missing_cols:
            return f"Columns not found: {', '.join(missing_cols)}"

        n_rows = df.height
        if n_rows == 0:
            # Percentages below divide by the row count.
            return "Dataset has no rows: nothing to plot."

        # Count nulls once per column; reused for tick labels and the table.
        null_counts = {c: df[c].null_count() for c in cols}

        # Sample up to 500 rows for readability
        MAX_ROWS = 500
        if n_rows > MAX_ROWS:
            import random
            idx = sorted(random.sample(range(n_rows), MAX_ROWS))
            sub = df[idx].select(cols)
        else:
            sub = df.select(cols)

        # Build missing matrix: 1.0 where the cell is null, 0.0 otherwise
        mat = np.zeros((sub.height, len(cols)))
        for j, c in enumerate(cols):
            mat[:, j] = sub[c].is_null().to_numpy().astype(float)

        labels = [f"{c}\n({100 * null_counts[c] / n_rows:.0f}%)" for c in cols]

        fig, ax = plt.subplots(figsize=(max(8, len(cols) * 0.5 + 2), 6))
        # Transposed so that columns of the dataset run along the y axis.
        ax.imshow(mat.T, aspect="auto", cmap="RdYlGn_r", interpolation="nearest",
                  vmin=0, vmax=1)
        ax.set_yticks(range(len(cols)))
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlabel("Rows")
        ax.set_title("Missingness Heatmap (red=missing, green=present)")
        fig.tight_layout()

        session.output_dir.mkdir(parents=True, exist_ok=True)
        path = session.output_dir / "missing_heatmap.png"
        fig.savefig(path, dpi=150)
        session.plot_paths.append(str(path))

        # Summary table
        lines = [f"Missing data heatmap saved: {path}", ""]
        lines.append(f"  {'Column':<25} {'Missing':>8} {'%':>6}")
        lines.append("  " + "-" * 42)
        for c in cols:
            n = null_counts[c]
            p = 100 * n / n_rows
            lines.append(f"  {c:<25} {n:>8,} {p:>5.1f}%")
        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "plotmissing")
    finally:
        # Release the figure on the error path as well.
        if fig is not None:
            plt.close(fig)
146
+
147
+
148
@command("plotinteractive", usage="plotinteractive scatter|bar|line|hist|box|heatmap <args> [--out=plot.html]")
def cmd_plotinteractive(session: Session, args: str) -> str:
    """Interactive Plotly chart (saved as HTML — open in browser).

    Sub-commands: scatter, bar, line, hist, box, heatmap

    Options:
      --out=<path>    output HTML file (default: outputs/interactive_plot.html)
      --title=<txt>   chart title
      --color=<col>   colour grouping column (scatter and line only)

    Examples:
      plotinteractive scatter y x
      plotinteractive hist income --out=income_dist.html
      plotinteractive bar category value
      plotinteractive line date price --title="Price Over Time"
    """
    try:
        import plotly.express as px
        import plotly.io as pio
    except ImportError:
        return "plotly required. Install: pip install plotly"

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: plotinteractive scatter|bar|line|hist|box|heatmap <args>"

    subcmd = ca.positional[0].lower()
    out_path = ca.options.get("out", str(session.output_dir / "interactive_plot.html"))
    title = ca.options.get("title", f"Interactive {subcmd.title()}")

    try:
        df = session.require_data()
        # Plotly Express is driven from a pandas frame; convert once and reuse.
        pd_df = df.to_pandas()

        if subcmd == "scatter":
            if len(ca.positional) < 3:
                return "Usage: plotinteractive scatter <y> <x>"
            y_col, x_col = ca.positional[1], ca.positional[2]
            color_col = ca.options.get("color")
            fig = px.scatter(pd_df, x=x_col, y=y_col, color=color_col, title=title, trendline="ols")

        elif subcmd == "hist":
            if len(ca.positional) < 2:
                return "Usage: plotinteractive hist <col>"
            col = ca.positional[1]
            fig = px.histogram(pd_df, x=col, title=title, nbins=30)

        elif subcmd == "bar":
            if len(ca.positional) < 3:
                return "Usage: plotinteractive bar <category> <value>"
            cat_col, val_col = ca.positional[1], ca.positional[2]
            fig = px.bar(pd_df, x=cat_col, y=val_col, title=title)

        elif subcmd == "line":
            if len(ca.positional) < 3:
                return "Usage: plotinteractive line <x> <y>"
            x_col, y_col = ca.positional[1], ca.positional[2]
            color_col = ca.options.get("color")
            fig = px.line(pd_df, x=x_col, y=y_col, color=color_col, title=title)

        elif subcmd == "box":
            if len(ca.positional) < 2:
                return "Usage: plotinteractive box <col> [by <group>]"
            col = ca.positional[1]
            by_raw = ca.rest_after("by")
            group = by_raw.strip().split()[0] if by_raw else None
            fig = px.box(pd_df, y=col, x=group, title=title)

        elif subcmd == "heatmap":
            # Numeric columns are recognised by the lower-cased dtype name.
            numeric_names = {
                "float32", "float64",
                "int8", "int16", "int32", "int64",
                "uint8", "uint16", "uint32", "uint64",
            }
            num_cols = [c for c in df.columns if str(df[c].dtype).lower() in numeric_names]
            if len(num_cols) < 2:
                return "heatmap requires at least 2 numeric columns."
            corr = pd_df[num_cols].corr()
            # Honour --title here too; without it keep the historical caption.
            heat_title = ca.options.get("title", "Correlation Heatmap")
            fig = px.imshow(corr, title=heat_title, color_continuous_scale="RdBu", zmin=-1, zmax=1)

        else:
            return f"Unknown sub-command: {subcmd}"

        from pathlib import Path
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        pio.write_html(fig, out_path)
        return f"Interactive {subcmd} chart saved: {out_path}\nOpen in a web browser to view."
    except Exception as e:
        return friendly_error(e, "plotinteractive")
233
+
234
+
235
@command("plotanimated", usage="plotanimated <y> <x> <time_col> [--out=anim.gif]")
def cmd_plotanimated(session: Session, args: str) -> str:
    """Animated scatter plot over a time variable (saved as GIF).

    Shows how the relationship between x and y changes across time steps.
    One frame is drawn per distinct value of <time_col>, in ascending order
    of that column; at most 60 frames are produced.

    Examples:
      plotanimated sales month year --out=sales_trend.gif
      plotanimated life_exp gdp year
    """
    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend: output is only written to disk
    import matplotlib.pyplot as plt
    import matplotlib.animation as animation
    import numpy as np
    import polars as pl
    from pathlib import Path

    ca = CommandArgs(args)
    if len(ca.positional) < 3:
        return "Usage: plotanimated <y> <x> <time_col> [--out=anim.gif]"

    y_col, x_col, t_col = ca.positional[0], ca.positional[1], ca.positional[2]
    out_path = ca.options.get("out", str(session.output_dir / "animated.gif"))

    def _limits(lo: float, hi: float) -> tuple[float, float]:
        """Return axis limits padded by 5% of the data range."""
        span = hi - lo
        # A zero range would give identical limits; fall back to a fixed pad.
        pad = 0.05 * span if span > 0 else 0.5
        return lo - pad, hi + pad

    fig = None
    try:
        df = session.require_data()
        for c in [y_col, x_col, t_col]:
            if c not in df.columns:
                return f"Column not found: {c}"

        sub = df.select([t_col, x_col, y_col]).drop_nulls().sort(t_col)
        time_vals = np.asarray(sub[t_col].cast(pl.Utf8).to_list(), dtype=object)
        # `sub` is already sorted on the native dtype, so first-appearance
        # order is the true order. Sorting the string labels instead would
        # put "10" before "2".
        unique_times = list(dict.fromkeys(time_vals.tolist()))

        if len(unique_times) < 2:
            return f"Need at least 2 unique values in '{t_col}' for animation."
        if len(unique_times) > 60:
            unique_times = unique_times[:60]  # cap

        x_all = sub[x_col].to_numpy().astype(float)
        y_all = sub[y_col].to_numpy().astype(float)

        fig, ax = plt.subplots(figsize=(7, 5))
        scatter = ax.scatter([], [], alpha=0.7, color="#4C72B0", s=30)
        # Fixed limits so the axes do not jump between frames.
        ax.set_xlim(*_limits(x_all.min(), x_all.max()))
        ax.set_ylim(*_limits(y_all.min(), y_all.max()))
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        title_obj = ax.set_title("")

        def update(frame):
            t = unique_times[frame]
            mask = time_vals == t
            scatter.set_offsets(np.column_stack([x_all[mask], y_all[mask]]))
            title_obj.set_text(f"{t_col}: {t}")
            return scatter, title_obj

        ani = animation.FuncAnimation(fig, update, frames=len(unique_times),
                                      interval=300, blit=False)

        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        animated = True
        try:
            ani.save(out_path, writer="pillow", fps=3)
        except Exception:
            # Best-effort fallback when the animation cannot be written:
            # keep a static image of the first frame. The suffix is swapped
            # whatever extension the user supplied.
            animated = False
            out_path = str(Path(out_path).with_suffix(".png"))
            update(0)
            fig.savefig(out_path, dpi=120)
        session.plot_paths.append(out_path)
        if animated:
            return f"Animated plot saved: {out_path} ({len(unique_times)} frames)"
        return f"Animation could not be written; first frame saved instead: {out_path}"
    except Exception as e:
        return friendly_error(e, "plotanimated")
    finally:
        # Release the figure on every path, including errors.
        if fig is not None:
            plt.close(fig)
@@ -0,0 +1,251 @@
1
+ """Extra visualization commands: corrplot, pairplot, violin, qq, residplot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from openstat.commands.base import command
8
+ from openstat.session import Session
9
+
10
+
11
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
    """Split a Stata-style argument string into bare tokens and options.

    Every ``name(value)`` fragment becomes an entry in the options dict, keyed
    by the lower-cased name; a later duplicate overwrites an earlier one.
    What is left is split on whitespace, commas are trimmed from both ends of
    each token, and empty tokens are discarded.

    Returns:
        ``(positional_tokens, options)``.
    """
    found: dict[str, str] = {}

    def _collect(match: re.Match) -> str:
        # Record the option and blank it out of the remaining text.
        found[match.group(1).lower()] = match.group(2)
        return ''

    remainder = re.sub(r'(\w+)\(([^)]*)\)', _collect, raw)

    tokens: list[str] = []
    for piece in remainder.split():
        cleaned = piece.strip(',')
        if cleaned:
            tokens.append(cleaned)
    return tokens, found
18
+
19
+
20
+ def _save_or_show(fig, path: str | None, default_name: str) -> str:
21
+ import os
22
+ if path:
23
+ os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
24
+ fig.savefig(path, dpi=150, bbox_inches="tight")
25
+ return path
26
+ os.makedirs("outputs", exist_ok=True)
27
+ out = f"outputs/{default_name}"
28
+ fig.savefig(out, dpi=150, bbox_inches="tight")
29
+ return out
30
+
31
+
32
@command("corrplot", usage="corrplot [var1 var2 ...] [saving(path.png)]")
def cmd_corrplot(session: Session, args: str) -> str:
    """Correlation matrix heatmap."""
    # Implementation notes:
    #  * Named variables that are not columns of the dataset are ignored.
    #  * With no usable variable names, the first 12 Float64/Float32/Int64/
    #    Int32 columns are used.
    #  * Rows with a null in any selected column are dropped before computing
    #    the correlations.
    try:
        import matplotlib.pyplot as plt
        import numpy as np
    except ImportError:
        return "matplotlib not installed."
    df = session.require_data()
    positional, opts = _stata_opts(args)
    # De-duplicate (order preserved): polars rejects selecting a name twice.
    cols = list(dict.fromkeys(c for c in positional if c in df.columns))
    if not cols:
        import polars as pl
        numeric_types = (pl.Float64, pl.Float32, pl.Int64, pl.Int32)
        cols = [c for c in df.columns if df[c].dtype in numeric_types][:12]
    if len(cols) < 2:
        return "corrplot requires at least 2 numeric variables."

    fig = None
    try:
        data = df.select(cols).drop_nulls().to_numpy().astype(float)
        if data.shape[0] < 2:
            # A correlation needs at least two observations per variable.
            return "corrplot requires at least 2 complete observations."
        corr = np.corrcoef(data.T)
        n = len(cols)
        fig, ax = plt.subplots(figsize=(max(6, n), max(5, n - 1)))
        im = ax.imshow(corr, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
        plt.colorbar(im, ax=ax, shrink=0.8)
        ax.set_xticks(range(n))
        ax.set_yticks(range(n))
        ax.set_xticklabels(cols, rotation=45, ha="right", fontsize=9)
        ax.set_yticklabels(cols, fontsize=9)
        for i in range(n):
            for j in range(n):
                # White text on strongly coloured cells keeps labels legible.
                ax.text(j, i, f"{corr[i,j]:.2f}", ha="center", va="center", fontsize=7,
                        color="white" if abs(corr[i, j]) > 0.6 else "black")
        ax.set_title("Correlation Matrix")
        fig.tight_layout()
        out = _save_or_show(fig, opts.get("saving"), "corrplot.png")
        return f"Correlation plot saved: {out}"
    except Exception as exc:
        return f"corrplot error: {exc}"
    finally:
        # Release the figure on the error path too.
        if fig is not None:
            plt.close(fig)
73
+
74
+
75
@command("pairplot", usage="pairplot [var1 var2 ...] [saving(path.png)]")
def cmd_pairplot(session: Session, args: str) -> str:
    """Scatter matrix (pairplot) of numeric variables."""
    # Implementation notes:
    #  * Named variables that are not columns of the dataset are ignored.
    #  * With no usable variable names, the first 6 Float64/Float32/Int64/
    #    Int32 columns are used.
    #  * Diagonal cells show histograms; off-diagonal cells show scatters of
    #    the row variable (y) against the column variable (x).
    try:
        import matplotlib.pyplot as plt
        import numpy as np  # noqa: F401  (kept from the original availability probe)
    except ImportError:
        return "matplotlib not installed."
    df = session.require_data()
    positional, opts = _stata_opts(args)
    # De-duplicate (order preserved): polars rejects selecting a name twice.
    cols = list(dict.fromkeys(c for c in positional if c in df.columns))
    if not cols:
        import polars as pl
        numeric_types = (pl.Float64, pl.Float32, pl.Int64, pl.Int32)
        cols = [c for c in df.columns if df[c].dtype in numeric_types][:6]
    if len(cols) < 2:
        return "pairplot requires at least 2 numeric variables."

    fig = None
    try:
        data = df.select(cols).drop_nulls().to_numpy().astype(float)
        if data.shape[0] == 0:
            return "pairplot: no complete observations for the selected variables."
        k = len(cols)
        # k >= 2 here, so `axes` is always a 2-D array of Axes.
        fig, axes = plt.subplots(k, k, figsize=(2.5 * k, 2.5 * k))
        for i, row_name in enumerate(cols):
            for j, col_name in enumerate(cols):
                ax = axes[i, j]
                if i == j:
                    ax.hist(data[:, i], bins=20, color="steelblue", alpha=0.7)
                    ax.set_ylabel("")
                else:
                    ax.scatter(data[:, j], data[:, i], alpha=0.3, s=10, color="steelblue")
                # Only the outer edge of the grid carries axis labels.
                if i == k - 1:
                    ax.set_xlabel(col_name, fontsize=8)
                if j == 0:
                    ax.set_ylabel(row_name, fontsize=8)
                ax.tick_params(labelsize=6)
        fig.suptitle("Scatter Matrix", fontsize=12)
        fig.tight_layout()
        out = _save_or_show(fig, opts.get("saving"), "pairplot.png")
        return f"Pair plot saved: {out}"
    except Exception as exc:
        return f"pairplot error: {exc}"
    finally:
        # Release the figure on the error path too.
        if fig is not None:
            plt.close(fig)
118
+
119
+
120
@command("violin", usage="violin var [by(groupvar)] [saving(path.png)]")
def cmd_violin(session: Session, args: str) -> str:
    """Violin plot for distribution visualization."""
    # Implementation notes:
    #  * With by(groupvar), one violin is drawn per distinct non-null group
    #    value, in sorted order; groups with no non-missing data are skipped
    #    because matplotlib's violinplot cannot draw an empty dataset.
    #  * Nulls in the plotted variable are dropped.
    try:
        import matplotlib.pyplot as plt
        import numpy as np  # noqa: F401  (kept from the original availability probe)
    except ImportError:
        return "matplotlib not installed."
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if not positional:
        return "Usage: violin var [by(groupvar)] [saving(path.png)]"
    col = positional[0]
    if col not in df.columns:
        return f"Column '{col}' not found."
    by = opts.get("by")
    if by and by not in df.columns:
        # Report a mistyped group variable instead of silently plotting ungrouped.
        return f"Column '{by}' not found."

    fig = None
    try:
        if by:
            import polars as pl
            labels = []
            datasets = []
            for g in df[by].drop_nulls().unique().sort().to_list():
                values = df.filter(pl.col(by) == g)[col].drop_nulls().to_numpy().astype(float)
                if values.size:
                    labels.append(str(g))
                    datasets.append(values)
        else:
            values = df[col].drop_nulls().to_numpy().astype(float)
            labels = [col]
            datasets = [values] if values.size else []

        if not datasets:
            return f"No non-missing values in '{col}'."

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.violinplot(datasets, showmedians=True)
        # violinplot places the violins at positions 1..n by default.
        ax.set_xticks(range(1, len(labels) + 1))
        ax.set_xticklabels(labels)
        if by:
            ax.set_xlabel(str(by))
        ax.set_ylabel(col)
        ax.set_title(f"Violin Plot: {col}")
        fig.tight_layout()
        out = _save_or_show(fig, opts.get("saving"), f"violin_{col}.png")
        return f"Violin plot saved: {out}"
    except Exception as exc:
        return f"violin error: {exc}"
    finally:
        # Release the figure on the error path too.
        if fig is not None:
            plt.close(fig)
160
+
161
+
162
@command("qqplot", usage="qqplot var [saving(path.png)]")
def cmd_qqplot(session: Session, args: str) -> str:
    """Quantile-Quantile (Q-Q) normality plot."""
    # Implementation notes:
    #  * Nulls are dropped before plotting.
    #  * The reference line is the least-squares fit returned by
    #    scipy.stats.probplot; its R² is reported in the result message.
    #  * A Shapiro-Wilk p-value is annotated only for 3..5000 observations
    #    (the test needs at least 3 points; 5000 is the original upper bound).
    try:
        import matplotlib.pyplot as plt
        import numpy as np
        from scipy import stats as sp_stats
    except ImportError:
        return "matplotlib or scipy not installed."
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if not positional:
        return "Usage: qqplot var [saving(path.png)]"
    col = positional[0]
    if col not in df.columns:
        return f"Column '{col}' not found."

    fig = None
    try:
        data = df[col].drop_nulls().to_numpy().astype(float)
        if data.size < 2:
            # The fitted reference line needs at least two points.
            return "qqplot requires at least 2 non-missing observations."
        fig, ax = plt.subplots(figsize=(6, 6))
        (osm, osr), (slope, intercept, r) = sp_stats.probplot(data, dist="norm")
        ax.scatter(osm, osr, alpha=0.5, s=15, color="steelblue", label="Data")
        x_line = np.array([osm.min(), osm.max()])
        ax.plot(x_line, slope * x_line + intercept, "r-", lw=2, label="Normal line")
        ax.set_xlabel("Theoretical quantiles")
        ax.set_ylabel("Sample quantiles")
        ax.set_title(f"Q-Q Plot: {col}")
        ax.legend()
        # Shapiro-Wilk stat: skipped for n < 3, where shapiro() raises and
        # would otherwise abort the whole plot.
        if 3 <= data.size <= 5000:
            _, sw_p = sp_stats.shapiro(data)
            ax.text(0.05, 0.95, f"Shapiro-Wilk p = {sw_p:.4f}", transform=ax.transAxes,
                    fontsize=9, verticalalignment="top")
        fig.tight_layout()
        out = _save_or_show(fig, opts.get("saving"), f"qqplot_{col}.png")
        return f"Q-Q plot saved: {out} (R²={r**2:.4f})"
    except Exception as exc:
        return f"qqplot error: {exc}"
    finally:
        # Release the figure on the error path too.
        if fig is not None:
            plt.close(fig)
201
+
202
+
203
@command("residplot", usage="residplot dep var1 [var2 ...] [saving(path.png)]")
def cmd_residplot(session: Session, args: str) -> str:
    """Residual vs fitted and scale-location plots for OLS."""
    # Implementation notes:
    #  * Predictors that are not columns of the dataset are ignored.
    #  * Rows with a null in any used column are dropped.
    #  * The model is fitted by least squares with an intercept column.
    #  * "Standardized" residuals are residuals divided by their standard
    #    deviation (floored at 1e-10 to avoid dividing by zero).
    try:
        import matplotlib.pyplot as plt
        import numpy as np
    except ImportError:
        return "matplotlib not installed."
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: residplot dep var1 [var2 ...]"
    dep = positional[0]
    indeps = [c for c in positional[1:] if c in df.columns]
    if dep not in df.columns:
        return f"Column '{dep}' not found."
    if not indeps:
        return "No valid predictor variables."

    fig = None
    try:
        sub = df.select([dep] + indeps).drop_nulls()
        if sub.height == 0:
            return "residplot: no complete observations for the selected variables."
        y = sub[dep].to_numpy().astype(float)
        predictors = sub.select(indeps).to_numpy().astype(float)
        X = np.column_stack([np.ones(len(y)), predictors])
        beta = np.linalg.lstsq(X, y, rcond=None)[0]
        fitted = X @ beta
        resid = y - fitted
        std_resid = resid / max(resid.std(), 1e-10)

        fig, (ax_resid, ax_scale) = plt.subplots(1, 2, figsize=(12, 5))

        # Residuals vs Fitted
        ax_resid.scatter(fitted, resid, alpha=0.4, s=15, color="steelblue")
        ax_resid.axhline(0, color="red", lw=1)
        ax_resid.set_xlabel("Fitted values")
        ax_resid.set_ylabel("Residuals")
        ax_resid.set_title("Residuals vs Fitted")

        # Scale-Location
        ax_scale.scatter(fitted, np.sqrt(np.abs(std_resid)), alpha=0.4, s=15, color="steelblue")
        ax_scale.set_xlabel("Fitted values")
        ax_scale.set_ylabel("√|Standardized residuals|")
        ax_scale.set_title("Scale-Location")

        fig.tight_layout()
        out = _save_or_show(fig, opts.get("saving"), f"residplot_{dep}.png")
        return f"Residual plots saved: {out}"
    except Exception as exc:
        return f"residplot error: {exc}"
    finally:
        # Release the figure on the error path too.
        if fig is not None:
            plt.close(fig)
@@ -0,0 +1,69 @@
1
+ """Watch command: re-run a script automatically when the file changes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ import os
7
+
8
+ from openstat.commands.base import command, CommandArgs
9
+ from openstat.session import Session
10
+
11
+
12
@command("watch", usage="watch <script.ost> [--interval=2]")
def cmd_watch(session: Session, args: str) -> str:
    """Watch a script file and re-run it whenever it changes.

    Monitors the file's modification time every N seconds (default 2).
    The interval must be a positive, finite number of seconds.
    Press Ctrl+C to stop watching.

    Examples:
      watch analysis.ost
      watch pipeline.ost --interval=5
    """
    import math

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: watch <script.ost> [--interval=2]"

    script_path = ca.positional[0]

    # Validate the interval up front. A non-numeric value would raise, zero
    # would poll in a tight loop, and negative/NaN values make time.sleep()
    # raise inside the loop.
    raw_interval = ca.options.get("interval", 2)
    try:
        interval = float(raw_interval)
    except (TypeError, ValueError):
        return f"Invalid interval: {raw_interval!r} (expected a positive number of seconds)"
    if not math.isfinite(interval) or interval <= 0:
        return f"Invalid interval: {raw_interval!r} (expected a positive number of seconds)"

    if not os.path.exists(script_path):
        return f"File not found: {script_path}"

    from openstat.script_runner import run_script_advanced

    last_mtime = os.path.getmtime(script_path)

    print(f"Watching {script_path} (interval={interval}s). Press Ctrl+C to stop.")

    try:
        # Run once immediately. A failing script must not end the watch: the
        # user is expected to fix the file, which triggers a re-run below.
        try:
            run_script_advanced(script_path, session)
            print("[Initial run complete]")
        except Exception as exc:
            print(f"[{_timestamp()}] Error: {exc}")

        while True:
            time.sleep(interval)
            try:
                current_mtime = os.path.getmtime(script_path)
            except FileNotFoundError:
                print(f"File removed: {script_path}. Stopping watch.")
                break

            if current_mtime == last_mtime:
                continue

            last_mtime = current_mtime
            print(f"\n[{_timestamp()}] File changed — re-running {script_path}...")
            try:
                run_script_advanced(script_path, session)
                print(f"[{_timestamp()}] Done.")
            except Exception as exc:
                print(f"[{_timestamp()}] Error: {exc}")

    except KeyboardInterrupt:
        # Ctrl+C is the documented way to leave the watch loop.
        pass

    return f"Watch stopped: {script_path}"
65
+
66
+
67
def _timestamp() -> str:
    """Return the current local wall-clock time formatted as ``HH:MM:SS``."""
    import datetime as _dt

    # datetime's format() support delegates to strftime with the same spec.
    return format(_dt.datetime.now(), "%H:%M:%S")