openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/repl.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
"""Interactive REPL for OpenStat."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from prompt_toolkit import PromptSession
|
|
8
|
+
from prompt_toolkit.completion import Completer, Completion
|
|
9
|
+
from prompt_toolkit.completion import PathCompleter
|
|
10
|
+
from prompt_toolkit.document import Document
|
|
11
|
+
from prompt_toolkit.history import FileHistory
|
|
12
|
+
from rich.console import Console
|
|
13
|
+
|
|
14
|
+
from openstat import __version__
|
|
15
|
+
from openstat.session import Session
|
|
16
|
+
from openstat.commands import COMMANDS
|
|
17
|
+
from openstat.logging_config import get_logger
|
|
18
|
+
|
|
19
|
+
# Shared rich console used for all REPL output.
console = Console()
# Module logger, obtained from the project's logging configuration under the name "repl".
log = get_logger("repl")

# Prompt history lives in a per-user directory; run_repl() creates it on start-up.
_HISTORY_DIR = Path.home() / ".openstat"
_HISTORY_FILE = _HISTORY_DIR / "history"

# Start-up banner (rich markup) printed once by run_repl().
_BANNER = f"""\
[bold cyan]OpenStat v{__version__}[/bold cyan] — Open-source statistical analysis tool
Type [green]help[/green] for commands, [green]quit[/green] to exit.
"""

# Command words that make _dispatch() return the "__QUIT__" sentinel.
_EXIT_COMMANDS = {"quit", "exit", "q"}


# Commands for which the completer offers file-path completion.
_FILE_COMMANDS = {"load", "save", "merge", "run", "append"}
# Completion candidates offered for the strategy argument of `fillna`.
_FILLNA_STRATEGIES = ("mean", "median", "mode", "forward", "backward")
# Completion candidates offered for the type argument of `cast`.
_CAST_TYPES = ("int", "float", "str", "bool")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _DynamicCompleter(Completer):
    """Tab-complete command names, column names, file paths, and sub-options.

    Candidates depend on the first word of the line (the command) and on
    which token is currently being typed. Column names of the loaded
    dataset are always offered last as a fallback.
    """

    def __init__(self, session: Session) -> None:
        """Bind the completer to *session*, whose ``df`` supplies column names."""
        self.session = session
        self._commands = sorted(set(COMMANDS.keys()) | _EXIT_COMMANDS)
        self._path_completer = PathCompleter(expanduser=True)

    def get_completions(self, document: Document, complete_event):  # type: ignore[override]
        """Yield ``Completion`` objects for the text before the cursor.

        Two views of the text under the cursor are used:

        * ``token`` - the whole whitespace-delimited token. Options such as
          ``--cluster=``, ``tau=0.5`` or ``n(`` contain punctuation, so they
          must be matched against the full token.
        * ``word``  - the trailing identifier-like fragment (prompt_toolkit's
          default word). Used for column names so that a column can still be
          completed inside ``by(gro`` or after ``--cluster=``.
        """
        text = document.text_before_cursor
        words = text.split()
        word = document.get_word_before_cursor()
        token = document.get_word_before_cursor(WORD=True)

        # 1-based index of the token being typed: a trailing space means the
        # user has finished the previous token and is starting a new one.
        at_new_token = text[-1:].isspace()
        arg_pos = len(words) + (1 if at_new_token else 0)

        def matching(options, *, after=""):
            """Yield options extending ``token``, once ``token`` starts with *after*."""
            if not token.startswith(after):
                return
            for option in options:
                if option.startswith(token):
                    yield Completion(option, start_position=-len(token))

        if len(words) <= 1 and not at_new_token:
            # First word -> command names
            yield from matching(self._commands)
            return

        cmd = words[0].lower() if words else ""

        # File path completion. PathCompleter treats everything before the
        # cursor as the path, so give it a document holding only the argument.
        if cmd in _FILE_COMMANDS and arg_pos == 2:
            pieces = text.split(None, 1)
            path_text = pieces[1] if len(pieces) > 1 else ""
            path_doc = Document(path_text, cursor_position=len(path_text))
            yield from self._path_completer.get_completions(path_doc, complete_event)
            return

        # Sub-commands for 'plot'
        if cmd == "plot" and arg_pos == 2:
            yield from matching(
                ("hist", "scatter", "line", "box", "bar", "heatmap",
                 "diagnostics", "acf", "pacf")
            )

        # Strategy completion for 'fillna'
        if cmd == "fillna" and arg_pos == 3:
            yield from matching(_FILLNA_STRATEGIES)

        # Type completion for 'cast'
        if cmd == "cast" and arg_pos == 3:
            yield from matching(_CAST_TYPES)

        # margins: first complete the flag, then its value
        if cmd == "margins":
            if token.startswith("--at="):
                yield from matching(("--at=means", "--at=average"))
            else:
                yield from matching(("--at=",), after="--a")

        if cmd == "quantreg":
            yield from matching(
                ("tau=0.25", "tau=0.5", "tau=0.75", "tau=0.9"), after="tau"
            )

        if cmd == "bootstrap":
            yield from matching(("n=100", "n=500", "n=1000"), after="n")
            yield from matching(("ci=90", "ci=95", "ci=99"), after="ci")

        if cmd in ("ols", "logit", "probit", "poisson", "negbin",
                   "tobit", "mlogit", "ologit", "oprobit", "did"):
            yield from matching(("--cluster=",), after="--c")
            yield from matching(("--robust",), after="--r")

        # Tobit limit completions
        if cmd == "tobit":
            yield from matching(("ll(0)",), after="ll")
            yield from matching(("ul()",), after="ul")

        # psmatch option completions
        if cmd == "psmatch":
            yield from matching(("treatment()",), after="treat")
            yield from matching(("caliper(0.2)",), after="cal")
            yield from matching(("nn(1)", "nn(3)", "nn(5)"), after="nn")

        # egen function completions: match the text after the last "=" so
        # both "x = me" and "x=me" complete to "mean(".
        if cmd == "egen" and "=" in text:
            fragment = token.rpartition("=")[2]
            for fn in ("mean", "sum", "min", "max", "median", "count",
                       "rank", "group", "rowtotal", "rowmean"):
                if fn.startswith(fragment):
                    yield Completion(fn + "(", start_position=-len(fragment))

        if cmd == "poisson":
            yield from matching(("--exposure=",), after="--e")

        if cmd == "estat" and arg_pos == 2:
            yield from matching(
                ("hettest", "ovtest", "linktest", "ic", "all",
                 "icc", "lrtest", "firststage", "overid", "endogtest",
                 "phtest", "deff", "screeplot", "loadings")
            )

        # power sub-commands
        if cmd == "power" and arg_pos == 2:
            yield from matching(("onemean", "twomeans", "oneprop", "twoprop", "ols"))

        # power option completions
        if cmd == "power" and arg_pos > 2:
            yield from matching(
                ("n(", "alpha(0.05)", "power(0.80)", "delta(", "sd(1)",
                 "p0(", "pa(", "p1(", "p2(", "f2(", "k(", "ratio(1)")
            )

        # sampsi option completions
        if cmd == "sampsi" and arg_pos > 2:
            yield from matching(("sd(1)", "alpha(0.05)", "power(0.80)"))

        # report sub-commands
        if cmd == "report" and arg_pos == 2:
            yield from matching(("eda",))

        # export format completions
        if cmd == "export" and arg_pos == 2:
            yield from matching(("docx", "pptx"))

        # factor / pca option completions
        if cmd in ("factor", "pca"):
            yield from matching(("n(", "method(pc)", "method(ml)", "--norotate"))

        # estat loadings blanks option
        if cmd == "estat" and arg_pos > 2 and words[1] == "loadings":
            yield from matching(("blanks(0.3)", "blanks(0.5)"))

        # nonparametric tests: offer by( until the line already mentions it
        if cmd in ("ranksum", "kwallis") and "by" not in text:
            yield from matching(("by(",))

        # ML commands
        if cmd in ("lasso", "ridge", "elasticnet"):
            yield from matching(("alpha(", "cv(5)"))

        if cmd == "elasticnet":
            yield from matching(("l1ratio(0.5)",))

        if cmd == "cart":
            yield from matching(
                ("depth(5)", "task(regression)", "task(classification)", "minleaf(5)")
            )

        if cmd == "crossval":
            yield from matching(
                ("method(ols)", "method(lasso)", "method(ridge)", "method(cart)",
                 "k(5)", "k(10)", "scoring(r2)", "scoring(neg_mean_squared_error)")
            )

        # clustering
        if cmd == "cluster" and arg_pos == 2:
            yield from matching(("kmeans", "hierarchical"))

        if cmd == "cluster":
            yield from matching(
                ("k(3)", "k(5)", "linkage(ward)", "linkage(complete)", "linkage(average)")
            )

        if cmd == "discriminant":
            yield from matching(("method(lda)", "method(qda)"))

        # MANOVA / ANOVA2: offer the separator until one has been typed
        if cmd == "manova" and "=" not in text:
            yield from matching(("=",))

        # ARCH/GARCH
        if cmd in ("arch", "garch"):
            yield from matching(
                ("p(1)", "q(1)", "dist(normal)", "dist(t)",
                 "model(GARCH)", "model(EGARCH)", "model(GJR-GARCH)")
            )

        # Bayesian
        if cmd == "bayes" and arg_pos == 2:
            if "ols".startswith(token) or ":".startswith(token):
                yield Completion(": ols", start_position=-len(token))

        if cmd == "bayes":
            yield from matching(("samples(4000)", "priorscale(10)", "ci(0.95)"))

        # reshape / collapse / encode
        if cmd == "reshape" and arg_pos == 2:
            yield from matching(("wide", "long"))

        if cmd in ("reshape", "collapse"):
            yield from matching(("i(", "j(", "by("))

        if cmd == "collapse" and arg_pos == 2:
            yield from matching(
                ("(mean)", "(sum)", "(count)", "(median)", "(std)", "(min)", "(max)")
            )

        if cmd == "estimates" and arg_pos == 2:
            yield from matching(("table",))

        # xtreg estimator completions
        if cmd == "xtreg":
            yield from matching(("fe", "re", "be", "--robust", "--cluster="))

        # arima order completion
        if cmd == "arima":
            yield from matching(
                ("order(1,0,0)", "order(1,1,0)", "order(1,1,1)", "order(2,1,1)"),
                after="order",
            )

        # var lags completion
        if cmd == "var":
            yield from matching(
                ("lags(1)", "lags(2)", "lags(3)", "lags(4)"), after="lags"
            )

        # mi sub-commands
        if cmd == "mi" and arg_pos == 2:
            yield from matching(("impute", "estimate:", "describe"))

        # svy: sub-commands
        if cmd == "svy:" and arg_pos == 2:
            yield from matching(("summarize", "ols", "logit"))

        # sts sub-commands
        if cmd == "sts" and arg_pos == 2:
            yield from matching(("graph", "test"))

        # plugin sub-commands
        if cmd == "plugin" and arg_pos == 2:
            yield from matching(("list", "info"))

        # set sub-commands
        if cmd == "set" and arg_pos == 2:
            yield from matching(("backend",))
        if cmd == "set" and arg_pos == 3:
            yield from matching(("polars", "duckdb"))

        # Column names (if data loaded) - default fallback. Uses the short
        # word so columns complete inside option syntax as well.
        frame = self.session.df
        if frame is not None:
            for col in frame.columns:
                if col.startswith(word):
                    yield Completion(col, start_position=-len(word))
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _dispatch(session: Session, line: str) -> str | None:
    """Parse and execute a single command line.

    Parameters
    ----------
    session : the session passed to the command handler; the line is also
        recorded on it and, if it has an open log file, echoed there.
    line : raw user input. Blank lines and lines starting with ``#`` are
        ignored.

    Returns
    -------
    ``None`` for ignored input, the sentinel ``"__QUIT__"`` for an exit
    command, an "Unknown command" message when no handler is registered,
    otherwise whatever the handler returns. Exceptions raised by a handler
    are logged and converted into an "Internal error" message (with a
    traceback appended when the ``openstat`` logger is at DEBUG level).
    """
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    # Expand aliases before parsing
    try:
        from openstat.commands.alias_cmds import resolve_alias
        line = resolve_alias(line)
    except ImportError:
        pass

    parts = line.split(None, 2)
    if not parts:
        # Alias expansion produced an empty line; nothing to run.
        return None

    head = parts[0].lower()
    # Try two-word command first (e.g. "export pdf", "import do")
    two_word = f"{head} {parts[1].lower()}" if len(parts) >= 2 else None
    if two_word is not None and two_word in COMMANDS:
        cmd_name = two_word
        args = parts[2] if len(parts) > 2 else ""
    else:
        cmd_name = head
        # Split once more on the first gap only, so whitespace inside the
        # arguments (e.g. a quoted file name with double spaces) is passed
        # to the handler exactly as typed.
        remainder = line.split(None, 1)
        args = remainder[1] if len(remainder) > 1 else ""

    if cmd_name in _EXIT_COMMANDS:
        return "__QUIT__"

    handler = COMMANDS.get(cmd_name)
    if handler is None:
        return f"Unknown command: {cmd_name}. Type 'help' for available commands."

    session.record(line)
    log.debug("dispatch: %s (args=%r)", cmd_name, args)
    try:
        result = handler(session, args)
        log.debug("result length: %d", len(result) if result else 0)
        # Write to session log file if active
        if session._log_file is not None:
            try:
                import re as _re
                # Drops every [...] span, which removes rich markup tags.
                # NOTE(review): this also removes literal bracketed text
                # such as "[1, 2]" from the logged output - confirm intended.
                plain = _re.sub(r"\[/?[^\]]*\]", "", result or "")
                session._log_file.write(f". {line}\n{plain}\n\n")
                session._log_file.flush()
            except Exception:
                # Logging to file is best-effort and must never fail the
                # command, but leave a trace for troubleshooting.
                log.debug("Could not write to session log file", exc_info=True)
        return result
    except Exception as e:
        log.exception("Unhandled error in command '%s'", cmd_name)
        import logging
        import traceback
        msg = f"Internal error: {e}"
        if logging.getLogger("openstat").isEnabledFor(logging.DEBUG):
            msg += "\n" + traceback.format_exc()
        return msg
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def run_repl(session: Session | None = None) -> None:
    """Start the interactive REPL.

    Prints the banner, loads plugins (best-effort), then reads lines from a
    prompt with persistent history and tab completion until the user issues
    an exit command or sends EOF / Ctrl-C.

    Parameters
    ----------
    session : session to operate on; a fresh ``Session`` is created when
        omitted.
    """
    if session is None:
        session = Session()

    console.print(_BANNER)

    # Discover and load plugins
    try:
        from openstat.commands.plugin_cmds import init_plugins
        loaded = init_plugins()
        if loaded:
            console.print(f"[dim]Plugins loaded: {', '.join(loaded)}[/dim]")
    except Exception:
        # Plugins are optional: a broken plugin must not prevent the REPL
        # from starting, but record why loading failed.
        log.debug("Plugin discovery failed", exc_info=True)

    completer = _DynamicCompleter(session)

    _HISTORY_DIR.mkdir(parents=True, exist_ok=True)
    prompt_session: PromptSession[str] = PromptSession(
        history=FileHistory(str(_HISTORY_FILE)),
        completer=completer,
    )

    while True:
        try:
            line = prompt_session.prompt("openstat> ")
        except (EOFError, KeyboardInterrupt):
            console.print("\nBye!")
            break

        result = _dispatch(session, line)
        if result == "__QUIT__":
            console.print("Bye!")
            break
        if result:
            console.print(result)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def run_script(
    path: str, session: Session | None = None, *, strict: bool = False
) -> None:
    """Run the commands in an .ost script file.

    The work is delegated to ``run_script_advanced`` from
    ``openstat.script_runner``, which receives the path, the session, the
    module console, this module's ``_dispatch`` and the ``strict`` flag.

    Supports foreach, forvalues, and if/else control flow.
    If strict=True, stop on first error and raise SystemExit(1).
    """
    active_session = Session() if session is None else session

    log.info("Running script: %s (strict=%s)", path, strict)

    # Imported at call time, after the log line, exactly as before.
    from openstat.script_runner import run_script_advanced as _run

    _run(path, active_session, console, _dispatch, strict=strict)
|
|
File without changes
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Automated EDA report generation (self-contained HTML)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import io
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _b64_fig(fig) -> str:
    """Render *fig* to PNG in memory and return it as a base64 string.

    ``fig`` only needs a ``savefig(file, format=..., dpi=..., bbox_inches=...)``
    method; it is saved at 80 dpi with a tight bounding box.
    """
    with io.BytesIO() as sink:
        fig.savefig(sink, format="png", dpi=80, bbox_inches="tight")
        png_bytes = sink.getvalue()
    return base64.b64encode(png_bytes).decode()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _color_corr(val: float) -> str:
    """Return background hex color for a correlation value.

    Green shades mark positive values and red shades negative ones; the
    darker shade is used from a magnitude of 0.7, the lighter from 0.4.
    Anything weaker (and NaN) maps to white.
    """
    magnitude = abs(val)
    # Written as "not >=" rather than "<" so that NaN also ends up white.
    if not magnitude >= 0.4:
        return "#ffffff"

    strong = magnitude >= 0.7
    if val > 0:
        return "#2ecc71" if strong else "#a8e6b1"
    return "#e74c3c" if strong else "#f5b7b1"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def generate_eda_report(session, path: str) -> str:
    """Generate a self-contained HTML EDA report.

    The report contains, in order: dataset overview, missing values,
    numeric summary, categorical summary (top 10 values per column),
    correlation matrix, histograms (first 20 numeric columns, embedded as
    base64 PNG) and any model results stored on the session. All text taken
    from the data (column names, cell values, dataset name, model output)
    is HTML-escaped before being written.

    Parameters
    ----------
    session : Session
        Must provide ``require_data()``, ``results`` and ``dataset_name``.
    path : output file path (HTML); missing parent directories are created.

    Returns
    -------
    Absolute path of the generated file.
    """
    from html import escape

    df = session.require_data()
    import numpy as np
    import polars as pl

    def esc(value) -> str:
        """HTML-escape the string form of *value*."""
        return escape(str(value))

    # ── Section helpers ────────────────────────────────────────────────
    sections: list[str] = []

    # 1. Dataset Overview
    n_rows, n_cols = df.shape
    n_missing = sum(df[c].null_count() for c in df.columns)
    mem_kb = df.estimated_size() / 1024
    dtypes_html = "".join(
        f"<tr><td>{esc(c)}</td><td>{esc(df[c].dtype)}</td></tr>" for c in df.columns
    )
    sections.append(f"""
<h2>1. Dataset Overview</h2>
<table>
<tr><th>Rows</th><td>{n_rows:,}</td></tr>
<tr><th>Columns</th><td>{n_cols}</td></tr>
<tr><th>Missing cells</th><td>{n_missing:,}</td></tr>
<tr><th>Memory (approx)</th><td>{mem_kb:.1f} KB</td></tr>
</table>
<h3>Column Types</h3>
<table><tr><th>Column</th><th>Type</th></tr>{dtypes_html}</table>
""")

    # 2. Missing Values
    miss_parts = []
    for c in df.columns:
        cnt = df[c].null_count()
        pct = cnt / n_rows * 100 if n_rows else 0
        miss_parts.append(f"<tr><td>{esc(c)}</td><td>{cnt}</td><td>{pct:.1f}%</td></tr>")
    miss_rows = "".join(miss_parts)
    sections.append(f"""
<h2>2. Missing Values</h2>
<table>
<tr><th>Column</th><th>Missing</th><th>%</th></tr>
{miss_rows}
</table>
""")

    # 3. Numeric Summary
    numeric_cols = [c for c in df.columns if df[c].dtype in (
        pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
    )]
    if numeric_cols:
        num_parts = []
        for c in numeric_cols:
            s = df[c].drop_nulls()
            if s.len() == 0:
                # Nothing to summarise for an all-null column.
                continue
            arr = s.to_numpy().astype(float)
            as_series = pl.Series(arr)
            num_parts.append(
                f"<tr><td>{esc(c)}</td>"
                f"<td>{arr.min():.4g}</td><td>{arr.max():.4g}</td>"
                f"<td>{arr.mean():.4g}</td><td>{np.median(arr):.4g}</td>"
                f"<td>{arr.std():.4g}</td>"
                f"<td>{float(as_series.skew() or 0):.3f}</td>"
                f"<td>{float(as_series.kurtosis() or 0):.3f}</td>"
                "</tr>"
            )
        num_rows = "".join(num_parts)
        sections.append(f"""
<h2>3. Numeric Summary</h2>
<table>
<tr><th>Column</th><th>Min</th><th>Max</th><th>Mean</th>
<th>Median</th><th>Std</th><th>Skew</th><th>Kurt</th></tr>
{num_rows}
</table>
""")

    # 4. Categorical Summary
    cat_cols = [c for c in df.columns if df[c].dtype in (pl.Utf8, pl.String, pl.Categorical, pl.Boolean)]
    if cat_cols:
        cat_parts = []
        for c in cat_cols:
            vc = df[c].value_counts().sort("count", descending=True).head(10)
            rows = "".join(
                f"<tr><td>{esc(row[c])}</td><td>{row['count']}</td></tr>"
                for row in vc.iter_rows(named=True)
            )
            cat_parts.append(
                f"<h3>{esc(c)}</h3><table><tr><th>Value</th><th>Count</th></tr>{rows}</table>"
            )
        cat_html = "".join(cat_parts)
        sections.append(f"<h2>4. Categorical Summary</h2>{cat_html}")

    # 5. Correlation Matrix
    if len(numeric_cols) >= 2:
        mat = df.select(numeric_cols).to_numpy().astype(float)
        # Keep only rows without any NaN so every pair uses the same rows.
        mask = ~np.isnan(mat).any(axis=1)
        mat = mat[mask]
        corr = np.corrcoef(mat.T) if mat.shape[0] > 1 else np.eye(len(numeric_cols))
        header = "<tr><th></th>" + "".join(f"<th>{esc(c)}</th>" for c in numeric_cols) + "</tr>"
        corr_parts = []
        for i, ci in enumerate(numeric_cols):
            cells = "".join(
                f'<td style="background:{_color_corr(corr[i, j])}">{corr[i, j]:.3f}</td>'
                for j in range(len(numeric_cols))
            )
            corr_parts.append(f"<tr><th>{esc(ci)}</th>{cells}</tr>")
        corr_rows = "".join(corr_parts)
        sections.append(f"""
<h2>5. Correlation Matrix</h2>
<table>{header}{corr_rows}</table>
""")

    # 6. Distribution Plots
    # Best-effort: if matplotlib is unavailable or rendering fails, the
    # report is still produced without this section.
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        plot_parts = []
        for c in numeric_cols[:20]:  # cap at 20 cols
            arr = df[c].drop_nulls().to_numpy().astype(float)
            if len(arr) < 2:
                continue
            fig, ax = plt.subplots(figsize=(4, 2.5))
            try:
                ax.hist(arr, bins=min(30, max(5, len(arr) // 10)), edgecolor="white")
                ax.set_title(c, fontsize=9)
                ax.tick_params(labelsize=7)
                b64 = _b64_fig(fig)
            finally:
                # Always release the figure, even when drawing fails.
                plt.close(fig)
            plot_parts.append(f'<img src="data:image/png;base64,{b64}" style="margin:4px" />')
        plot_imgs = "".join(plot_parts)
        if plot_imgs:
            sections.append(f"<h2>6. Distribution Plots</h2><div>{plot_imgs}</div>")
    except Exception:
        pass

    # 7. Model Results
    if session.results:
        model_html = "".join(
            f"<h3>{esc(mr.name)} — {esc(mr.formula)}</h3><pre>{esc(mr.table)}</pre>"
            for mr in session.results
        )
        sections.append(f"<h2>7. Model Results</h2>{model_html}")

    # ── Assemble HTML ──────────────────────────────────────────────────
    body = "\n".join(sections)
    dataset_name = esc(session.dataset_name or "Dataset")
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>EDA Report — {dataset_name}</title>
<style>
  body {{ font-family: Arial, sans-serif; margin: 30px; color: #333; }}
  h1 {{ color: #2c3e50; }}
  h2 {{ color: #34495e; border-bottom: 2px solid #bdc3c7; padding-bottom: 4px; }}
  table {{ border-collapse: collapse; margin: 12px 0; font-size: 13px; }}
  th, td {{ border: 1px solid #ddd; padding: 6px 10px; text-align: right; }}
  th {{ background: #ecf0f1; text-align: left; }}
  td:first-child {{ text-align: left; }}
  pre {{ background: #f8f9fa; padding: 12px; border-radius: 4px; font-size: 12px; }}
  img {{ border: 1px solid #ddd; border-radius: 4px; }}
</style>
</head>
<body>
<h1>EDA Report — {dataset_name}</h1>
<p>{n_rows:,} rows × {n_cols} columns | Generated by OpenStat</p>
{body}
</body>
</html>"""

    target = os.path.abspath(path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    Path(path).write_text(page, encoding="utf-8")
    return target
|