openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/repl.py ADDED
@@ -0,0 +1,457 @@
1
+ """Interactive REPL for OpenStat."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from prompt_toolkit import PromptSession
8
+ from prompt_toolkit.completion import Completer, Completion
9
+ from prompt_toolkit.completion import PathCompleter
10
+ from prompt_toolkit.document import Document
11
+ from prompt_toolkit.history import FileHistory
12
+ from rich.console import Console
13
+
14
+ from openstat import __version__
15
+ from openstat.session import Session
16
+ from openstat.commands import COMMANDS
17
+ from openstat.logging_config import get_logger
18
+
19
# Shared Rich console used for REPL and script output in this module.
console = Console()
# Module logger obtained from the project's logging configuration.
log = get_logger("repl")

# Location of the prompt history file, under the user's home directory.
_HISTORY_DIR = Path.home() / ".openstat"
_HISTORY_FILE = _HISTORY_DIR / "history"

# Startup banner (Rich markup); printed once by run_repl().
_BANNER = f"""\
[bold cyan]OpenStat v{__version__}[/bold cyan] — Open-source statistical analysis tool
Type [green]help[/green] for commands, [green]quit[/green] to exit.
"""

# Command words that make _dispatch() return the quit sentinel.
_EXIT_COMMANDS = {"quit", "exit", "q"}


# Commands whose argument is completed as a filesystem path.
_FILE_COMMANDS = {"load", "save", "merge", "run", "append"}
# Values offered by the completer for the third word of `fillna`.
_FILLNA_STRATEGIES = ("mean", "median", "mode", "forward", "backward")
# Values offered by the completer for the third word of `cast`.
_CAST_TYPES = ("int", "float", "str", "bool")
36
+
37
+
38
class _DynamicCompleter(Completer):
    """Tab-complete command names, column names, file paths, and sub-options.

    The completer keeps a reference to the live ``Session`` so that column
    names of the currently loaded dataset can be offered as a fallback.
    """

    def __init__(self, session: Session) -> None:
        """Store the session and pre-compute the sorted list of command names."""
        self.session = session
        self._commands = sorted(set(COMMANDS.keys()) | _EXIT_COMMANDS)
        self._path_completer = PathCompleter(expanduser=True)

    def get_completions(self, document: Document, complete_event):  # type: ignore[override]
        """Yield ``Completion`` objects for the text before the cursor.

        While the first word is being typed, command names are offered.
        Afterwards the command word selects which option / sub-command
        lists apply; column names of the loaded dataset are always offered
        last as a fallback. Commands in ``_FILE_COMMANDS`` get filesystem
        path completion for their first argument instead.
        """
        text = document.text_before_cursor
        words = text.split()
        # NOTE(review): this uses the default word boundary of
        # get_word_before_cursor(); several branches below test prefixes that
        # contain punctuation ("--c", "--at=", "tau=") — confirm the default
        # boundary can actually produce such a `word` (WORD=True may be needed).
        word = document.get_word_before_cursor()

        if len(words) <= 1 and not text.endswith(" "):
            # First word → command names
            for cmd in self._commands:
                if cmd.startswith(word):
                    yield Completion(cmd, start_position=-len(word))
        else:
            cmd = words[0].lower() if words else ""

            # File path completion for load/save/merge/run/append
            if cmd in _FILE_COMMANDS and len(words) <= 2:
                # PathCompleter treats *all* text before the cursor as the
                # path, so hand it only the argument, not "load <path>".
                path_text = text.lstrip()[len(words[0]):].lstrip()
                path_document = Document(path_text, cursor_position=len(path_text))
                yield from self._path_completer.get_completions(
                    path_document, complete_event
                )
                return

            # Sub-commands for 'plot'
            if cmd == "plot" and len(words) <= 2:
                for sub in ("hist", "scatter", "line", "box", "bar", "heatmap", "diagnostics"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # Strategy completion for 'fillna'
            if cmd == "fillna" and len(words) == 3:
                for s in _FILLNA_STRATEGIES:
                    if s.startswith(word):
                        yield Completion(s, start_position=-len(word))

            # Type completion for 'cast'
            if cmd == "cast" and len(words) == 3:
                for t in _CAST_TYPES:
                    if t.startswith(word):
                        yield Completion(t, start_position=-len(word))

            # Option completions for new v0.2.0 commands
            if cmd == "margins" and word.startswith("--at="):
                prefix = "--at="
                for opt in ("means", "average"):
                    full = prefix + opt
                    if full.startswith(word):
                        yield Completion(full, start_position=-len(word))

            if cmd == "margins" and word.startswith("--a"):
                yield Completion("--at=", start_position=-len(word))

            if cmd == "quantreg" and word.startswith("tau"):
                for tau in ("tau=0.25", "tau=0.5", "tau=0.75", "tau=0.9"):
                    if tau.startswith(word):
                        yield Completion(tau, start_position=-len(word))

            if cmd == "bootstrap":
                if word.startswith("n"):
                    for opt in ("n=100", "n=500", "n=1000"):
                        if opt.startswith(word):
                            yield Completion(opt, start_position=-len(word))
                if word.startswith("ci"):
                    for opt in ("ci=90", "ci=95", "ci=99"):
                        if opt.startswith(word):
                            yield Completion(opt, start_position=-len(word))

            if cmd in ("ols", "logit", "probit", "poisson", "negbin",
                       "tobit", "mlogit", "ologit", "oprobit", "did"):
                if word.startswith("--c"):
                    yield Completion("--cluster=", start_position=-len(word))
                if word.startswith("--r"):
                    yield Completion("--robust", start_position=-len(word))

            # Tobit limit completions
            if cmd == "tobit":
                if word.startswith("ll"):
                    yield Completion("ll(0)", start_position=-len(word))
                if word.startswith("ul"):
                    yield Completion("ul()", start_position=-len(word))

            # psmatch option completions
            if cmd == "psmatch":
                if word.startswith("treat"):
                    yield Completion("treatment()", start_position=-len(word))
                if word.startswith("cal"):
                    yield Completion("caliper(0.2)", start_position=-len(word))
                if word.startswith("nn"):
                    for sub in ("nn(1)", "nn(3)", "nn(5)"):
                        if sub.startswith(word):
                            yield Completion(sub, start_position=-len(word))

            # egen function completions
            if cmd == "egen" and "=" in text:
                for fn in ("mean", "sum", "min", "max", "median", "count",
                           "rank", "group", "rowtotal", "rowmean"):
                    if fn.startswith(word):
                        yield Completion(fn + "(", start_position=-len(word))

            if cmd == "poisson" and word.startswith("--e"):
                yield Completion("--exposure=", start_position=-len(word))

            if cmd == "estat" and len(words) <= 2:
                for sub in ("hettest", "ovtest", "linktest", "ic", "all",
                            "icc", "lrtest", "firststage", "overid", "endogtest",
                            "phtest", "deff", "screeplot", "loadings"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # power sub-commands
            if cmd == "power" and len(words) <= 2:
                for sub in ("onemean", "twomeans", "oneprop", "twoprop", "ols"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # power option completions
            if cmd == "power" and len(words) > 2:
                for opt in ("n(", "alpha(0.05)", "power(0.80)", "delta(", "sd(1)",
                            "p0(", "pa(", "p1(", "p2(", "f2(", "k(", "ratio(1)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # sampsi option completions
            if cmd == "sampsi" and len(words) > 2:
                for opt in ("sd(1)", "alpha(0.05)", "power(0.80)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # report sub-commands
            if cmd == "report" and len(words) <= 2:
                for sub in ("eda",):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # export format completions
            if cmd == "export" and len(words) <= 2:
                for fmt in ("docx", "pptx"):
                    if fmt.startswith(word):
                        yield Completion(fmt, start_position=-len(word))

            # factor / pca option completions
            if cmd in ("factor", "pca") and len(words) > 1:
                for opt in ("n(", "method(pc)", "method(ml)", "--norotate"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # estat loadings blanks option
            if cmd == "estat" and len(words) > 2 and words[1] == "loadings":
                for opt in ("blanks(0.3)", "blanks(0.5)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # nonparametric tests
            if cmd in ("ranksum", "kwallis") and "by" not in text:
                if "by".startswith(word):
                    yield Completion("by(", start_position=-len(word))

            if cmd == "spearman" and self.session.df is not None:
                pass  # handled by column name fallback below

            # ML commands
            if cmd in ("lasso", "ridge", "elasticnet"):
                for opt in ("alpha(", "cv(5)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            if cmd == "elasticnet":
                if "l1ratio".startswith(word):
                    yield Completion("l1ratio(0.5)", start_position=-len(word))

            if cmd == "cart":
                for opt in ("depth(5)", "task(regression)", "task(classification)", "minleaf(5)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            if cmd == "crossval":
                for opt in ("method(ols)", "method(lasso)", "method(ridge)", "method(cart)",
                            "k(5)", "k(10)", "scoring(r2)", "scoring(neg_mean_squared_error)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # clustering
            if cmd == "cluster" and len(words) <= 2:
                for sub in ("kmeans", "hierarchical"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            if cmd == "cluster":
                for opt in ("k(3)", "k(5)", "linkage(ward)", "linkage(complete)", "linkage(average)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            if cmd == "discriminant":
                for opt in ("method(lda)", "method(qda)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # MANOVA / ANOVA2
            if cmd == "manova" and "=" not in text:
                if "=".startswith(word):
                    yield Completion("=", start_position=-len(word))

            # ARCH/GARCH
            if cmd in ("arch", "garch"):
                for opt in ("p(1)", "q(1)", "dist(normal)", "dist(t)",
                            "model(GARCH)", "model(EGARCH)", "model(GJR-GARCH)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # Bayesian
            if cmd == "bayes" and len(words) <= 2:
                if "ols".startswith(word) or ":".startswith(word):
                    yield Completion(": ols", start_position=-len(word))

            if cmd == "bayes":
                for opt in ("samples(4000)", "priorscale(10)", "ci(0.95)"):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            # reshape / collapse / encode
            if cmd == "reshape" and len(words) <= 2:
                for sub in ("wide", "long"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            if cmd in ("reshape", "collapse"):
                for opt in ("i(", "j(", "by("):
                    if opt.startswith(word):
                        yield Completion(opt, start_position=-len(word))

            if cmd == "collapse" and len(words) <= 2:
                for stat in ("(mean)", "(sum)", "(count)", "(median)", "(std)",
                             "(min)", "(max)"):
                    if stat.startswith(word):
                        yield Completion(stat, start_position=-len(word))

            if cmd == "estimates" and len(words) <= 2:
                if "table".startswith(word):
                    yield Completion("table", start_position=-len(word))

            # xtreg estimator completions
            if cmd == "xtreg":
                for sub in ("fe", "re", "be", "--robust", "--cluster="):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # arima order completion
            if cmd == "arima" and word.startswith("order"):
                for sub in ("order(1,0,0)", "order(1,1,0)", "order(1,1,1)", "order(2,1,1)"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # var lags completion
            if cmd == "var" and word.startswith("lags"):
                for sub in ("lags(1)", "lags(2)", "lags(3)", "lags(4)"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # mi sub-commands
            if cmd == "mi" and len(words) <= 2:
                for sub in ("impute", "estimate:", "describe"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # svy: sub-commands
            if cmd == "svy:" and len(words) <= 2:
                for sub in ("summarize", "ols", "logit"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # sts sub-commands
            if cmd == "sts" and len(words) <= 2:
                for sub in ("graph", "test"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # plugin sub-commands
            if cmd == "plugin" and len(words) <= 2:
                for sub in ("list", "info"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # set sub-commands
            if cmd == "set" and len(words) <= 2:
                if "backend".startswith(word):
                    yield Completion("backend", start_position=-len(word))
            if cmd == "set" and len(words) == 3:
                for sub in ("polars", "duckdb"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # plot sub-commands expanded
            if cmd == "plot" and len(words) <= 2:
                for sub in ("acf", "pacf"):
                    if sub.startswith(word):
                        yield Completion(sub, start_position=-len(word))

            # Column names (if data loaded) — default fallback
            if self.session.df is not None:
                for col in self.session.df.columns:
                    if col.startswith(word):
                        yield Completion(col, start_position=-len(word))
343
+
344
+
345
def _dispatch(session: Session, line: str) -> str | None:
    """Parse and execute a single command line.

    Parameters
    ----------
    session : Session
        Active session; the line is recorded on it and passed to the handler.
    line : str
        Raw input line. Blank lines and lines starting with ``#`` are ignored.

    Returns
    -------
    str | None
        ``None`` for ignored lines, the sentinel ``"__QUIT__"`` for an exit
        command, an "Unknown command" message when no handler is registered,
        the handler's result otherwise, or an "Internal error" message when
        the handler raises.
    """
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    # Expand aliases before parsing
    try:
        from openstat.commands.alias_cmds import resolve_alias
        line = resolve_alias(line).strip()
    except ImportError:
        pass

    # An alias may expand to nothing; treat that like a blank line.
    if not line:
        return None

    parts = line.split(None, 2)
    cmd_name = parts[0].lower()
    args = ""
    # Try two-word command first (e.g. "export pdf", "import do")
    if len(parts) >= 2:
        two_word = f"{cmd_name} {parts[1].lower()}"
        if two_word in COMMANDS:
            cmd_name = two_word
            args = parts[2] if len(parts) > 2 else ""
        else:
            # Take everything after the command word verbatim so that
            # whitespace inside the arguments (e.g. quoted paths) survives.
            args = line.split(None, 1)[1]

    if cmd_name in _EXIT_COMMANDS:
        return "__QUIT__"

    handler = COMMANDS.get(cmd_name)
    if handler is None:
        return f"Unknown command: {cmd_name}. Type 'help' for available commands."

    session.record(line)
    log.debug("dispatch: %s (args=%r)", cmd_name, args)
    try:
        result = handler(session, args)
        log.debug("result length: %d", len(result) if result else 0)
        # Write to session log file if active
        if session._log_file is not None:
            try:
                import re as _re
                # Strip Rich markup tags so the log file holds plain text.
                plain = _re.sub(r"\[/?[^\]]*\]", "", result or "")
                session._log_file.write(f". {line}\n{plain}\n\n")
                session._log_file.flush()
            except Exception:
                # Logging to file is best-effort; never fail the command for it.
                log.debug("Could not write to session log file", exc_info=True)
        return result
    except Exception as e:
        log.exception("Unhandled error in command '%s'", cmd_name)
        import logging
        import traceback
        msg = f"Internal error: {e}"
        # Only show the traceback to the user when debug logging is enabled.
        if logging.getLogger("openstat").isEnabledFor(logging.DEBUG):
            msg += "\n" + traceback.format_exc()
        return msg
402
+
403
+
404
def run_repl(session: Session | None = None) -> None:
    """Start the interactive REPL.

    Prints the banner, loads plugins (best-effort), then reads lines from a
    prompt with history and tab completion until an exit command, EOF, or
    Ctrl-C.

    Parameters
    ----------
    session : Session | None
        Session to operate on; a fresh ``Session`` is created when omitted.
    """
    if session is None:
        session = Session()

    console.print(_BANNER)

    # Discover and load plugins
    try:
        from openstat.commands.plugin_cmds import init_plugins
        loaded = init_plugins()
        if loaded:
            console.print(f"[dim]Plugins loaded: {', '.join(loaded)}[/dim]")
    except Exception:
        # Plugins are optional: keep the REPL usable, but leave a trace.
        log.debug("Plugin discovery failed", exc_info=True)

    completer = _DynamicCompleter(session)

    try:
        _HISTORY_DIR.mkdir(parents=True, exist_ok=True)
        history = FileHistory(str(_HISTORY_FILE))
    except OSError:
        # E.g. read-only home directory: PromptSession falls back to an
        # in-memory history when none is supplied.
        log.warning("Cannot create %s; command history will not be saved", _HISTORY_DIR)
        history = None

    prompt_session: PromptSession[str] = PromptSession(
        history=history,
        completer=completer,
    )

    while True:
        try:
            line = prompt_session.prompt("openstat> ")
        except (EOFError, KeyboardInterrupt):
            console.print("\nBye!")
            break

        result = _dispatch(session, line)
        if result == "__QUIT__":
            console.print("Bye!")
            break
        if result:
            console.print(result)
441
+
442
+
443
def run_script(
    path: str, session: Session | None = None, *, strict: bool = False
) -> None:
    """Run an .ost script file through the advanced script runner.

    A new ``Session`` is created when none is given. The actual execution is
    delegated to ``run_script_advanced``, which receives this module's
    console and ``_dispatch`` function.

    Supports foreach, forvalues, and if/else control flow.
    If strict=True, stop on first error and raise SystemExit(1).
    """
    active_session = Session() if session is None else session

    log.info("Running script: %s (strict=%s)", path, strict)

    from openstat.script_runner import run_script_advanced

    run_script_advanced(
        path,
        active_session,
        console,
        _dispatch,
        strict=strict,
    )
File without changes
@@ -0,0 +1,208 @@
1
+ """Automated EDA report generation (self-contained HTML)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import io
7
+ import os
8
+ from pathlib import Path
9
+
10
+
11
def _b64_fig(fig) -> str:
    """Render *fig* as PNG in memory and return it base64-encoded as text."""
    with io.BytesIO() as stream:
        fig.savefig(stream, format="png", dpi=80, bbox_inches="tight")
        png_bytes = stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode()
16
+
17
+
18
def _color_corr(val: float) -> str:
    """Map a correlation coefficient to a background hex color.

    Strong (|r| >= 0.7) and moderate (|r| >= 0.4) correlations get green
    shades when positive and red shades when negative; everything else is
    white.
    """
    if val >= 0.4:
        return "#2ecc71" if val >= 0.7 else "#a8e6b1"
    if val <= -0.4:
        return "#e74c3c" if val <= -0.7 else "#f5b7b1"
    return "#ffffff"
29
+
30
+
31
def generate_eda_report(session, path: str) -> str:
    """Generate a self-contained HTML EDA report.

    The report contains, in order: dataset overview, missing values,
    numeric summary, categorical summary (top 10 values per column),
    correlation matrix, histogram plots (at most 20 numeric columns,
    skipped when plotting fails) and any model results stored on the
    session. All data-derived text is HTML-escaped before being embedded.

    Parameters
    ----------
    session : Session
        Must provide ``require_data()``, ``results`` and ``dataset_name``.
    path : output file path (HTML); parent directories are created.

    Returns
    -------
    Absolute path of the generated file.
    """
    from html import escape

    df = session.require_data()
    import polars as pl

    def esc(value) -> str:
        """HTML-escape the string form of *value*."""
        return escape(str(value))

    # ── Section helpers ────────────────────────────────────────────────
    sections: list[str] = []

    # 1. Dataset Overview
    n_rows, n_cols = df.shape
    null_counts = {c: df[c].null_count() for c in df.columns}
    n_missing = sum(null_counts.values())
    mem_kb = df.estimated_size() / 1024
    dtypes_html = "".join(
        f"<tr><td>{esc(c)}</td><td>{esc(df[c].dtype)}</td></tr>" for c in df.columns
    )
    sections.append(f"""
<h2>1. Dataset Overview</h2>
<table>
<tr><th>Rows</th><td>{n_rows:,}</td></tr>
<tr><th>Columns</th><td>{n_cols}</td></tr>
<tr><th>Missing cells</th><td>{n_missing:,}</td></tr>
<tr><th>Memory (approx)</th><td>{mem_kb:.1f} KB</td></tr>
</table>
<h3>Column Types</h3>
<table><tr><th>Column</th><th>Type</th></tr>{dtypes_html}</table>
""")

    # 2. Missing Values
    miss_rows = "".join(
        f"<tr><td>{esc(c)}</td><td>{cnt}</td>"
        f"<td>{(cnt / n_rows * 100 if n_rows else 0):.1f}%</td></tr>"
        for c, cnt in null_counts.items()
    )
    sections.append(f"""
<h2>2. Missing Values</h2>
<table>
<tr><th>Column</th><th>Missing</th><th>%</th></tr>
{miss_rows}
</table>
""")

    # 3. Numeric Summary
    numeric_cols = [c for c in df.columns if df[c].dtype in (
        pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
    )]
    if numeric_cols:
        import numpy as np

        num_rows = ""
        for c in numeric_cols:
            s = df[c].drop_nulls()
            if s.len() == 0:
                continue
            arr = s.to_numpy().astype(float)
            as_series = pl.Series(arr)
            num_rows += (
                f"<tr><td>{esc(c)}</td>"
                f"<td>{arr.min():.4g}</td><td>{arr.max():.4g}</td>"
                f"<td>{arr.mean():.4g}</td><td>{np.median(arr):.4g}</td>"
                f"<td>{arr.std():.4g}</td>"
                # skew()/kurtosis() may return None; show 0 in that case.
                f"<td>{float(as_series.skew() or 0):.3f}</td>"
                f"<td>{float(as_series.kurtosis() or 0):.3f}</td>"
                "</tr>"
            )
        sections.append(f"""
<h2>3. Numeric Summary</h2>
<table>
<tr><th>Column</th><th>Min</th><th>Max</th><th>Mean</th>
<th>Median</th><th>Std</th><th>Skew</th><th>Kurt</th></tr>
{num_rows}
</table>
""")

    # 4. Categorical Summary
    cat_cols = [
        c for c in df.columns
        if df[c].dtype in (pl.Utf8, pl.String, pl.Categorical, pl.Boolean)
    ]
    if cat_cols:
        cat_html = ""
        for c in cat_cols:
            vc = df[c].value_counts().sort("count", descending=True).head(10)
            rows = "".join(
                f"<tr><td>{esc(row[c])}</td><td>{row['count']}</td></tr>"
                for row in vc.iter_rows(named=True)
            )
            cat_html += (
                f"<h3>{esc(c)}</h3>"
                f"<table><tr><th>Value</th><th>Count</th></tr>{rows}</table>"
            )
        sections.append(f"<h2>4. Categorical Summary</h2>{cat_html}")

    # 5. Correlation Matrix
    if len(numeric_cols) >= 2:
        import numpy as np

        mat = df.select(numeric_cols).to_numpy().astype(float)
        # Keep only rows without any NaN (list-wise deletion).
        mask = ~np.isnan(mat).any(axis=1)
        mat = mat[mask]
        corr = np.corrcoef(mat.T) if mat.shape[0] > 1 else np.eye(len(numeric_cols))
        header = (
            "<tr><th></th>"
            + "".join(f"<th>{esc(c)}</th>" for c in numeric_cols)
            + "</tr>"
        )
        corr_rows = ""
        for i, ci in enumerate(numeric_cols):
            corr_rows += f"<tr><th>{esc(ci)}</th>"
            for j in range(len(numeric_cols)):
                v = corr[i, j]
                bg = _color_corr(v)
                corr_rows += f'<td style="background:{bg}">{v:.3f}</td>'
            corr_rows += "</tr>"
        sections.append(f"""
<h2>5. Correlation Matrix</h2>
<table>{header}{corr_rows}</table>
""")

    # 6. Distribution Plots
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        plot_imgs = ""
        for c in numeric_cols[:20]:  # cap at 20 cols
            arr = df[c].drop_nulls().to_numpy().astype(float)
            if len(arr) < 2:
                continue
            fig, ax = plt.subplots(figsize=(4, 2.5))
            try:
                ax.hist(arr, bins=min(30, max(5, len(arr) // 10)), edgecolor="white")
                ax.set_title(c, fontsize=9)
                ax.tick_params(labelsize=7)
                b64 = _b64_fig(fig)
            finally:
                # Always release the figure, even if rendering failed.
                plt.close(fig)
            plot_imgs += f'<img src="data:image/png;base64,{b64}" style="margin:4px" />'
        if plot_imgs:
            sections.append(f"<h2>6. Distribution Plots</h2><div>{plot_imgs}</div>")
    except Exception:
        # Plots are optional (matplotlib may be missing or fail to render);
        # the rest of the report is still produced.
        pass

    # 7. Model Results
    if session.results:
        model_html = ""
        for mr in session.results:
            model_html += (
                f"<h3>{esc(mr.name)} — {esc(mr.formula)}</h3>"
                f"<pre>{esc(mr.table)}</pre>"
            )
        sections.append(f"<h2>7. Model Results</h2>{model_html}")

    # ── Assemble HTML ──────────────────────────────────────────────────
    body = "\n".join(sections)
    dataset_name = esc(session.dataset_name or "Dataset")
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>EDA Report — {dataset_name}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 30px; color: #333; }}
h1 {{ color: #2c3e50; }}
h2 {{ color: #34495e; border-bottom: 2px solid #bdc3c7; padding-bottom: 4px; }}
table {{ border-collapse: collapse; margin: 12px 0; font-size: 13px; }}
th, td {{ border: 1px solid #ddd; padding: 6px 10px; text-align: right; }}
th {{ background: #ecf0f1; text-align: left; }}
td:first-child {{ text-align: left; }}
pre {{ background: #f8f9fa; padding: 12px; border-radius: 4px; font-size: 12px; }}
img {{ border: 1px solid #ddd; border-radius: 4px; }}
</style>
</head>
<body>
<h1>EDA Report — {dataset_name}</h1>
<p>{n_rows:,} rows × {n_cols} columns &nbsp;|&nbsp; Generated by OpenStat</p>
{body}
</body>
</html>"""

    abs_path = os.path.abspath(path)
    os.makedirs(os.path.dirname(abs_path), exist_ok=True)
    Path(abs_path).write_text(page, encoding="utf-8")
    return abs_path