openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/config.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Configuration management for OpenStat.
|
|
2
|
+
|
|
3
|
+
Loads settings from ~/.openstat/config.toml (if exists) with sensible defaults.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
_CONFIG_DIR = Path.home() / ".openstat"
|
|
12
|
+
_CONFIG_FILE = _CONFIG_DIR / "config.toml"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Config:
    """OpenStat configuration with defaults.

    Each field below holds the built-in default. ``Config.load()`` overrides
    them with values found in the user's TOML config file, when one exists.
    """

    # Data
    output_dir: str = "outputs"
    csv_separator: str = ","
    infer_schema_length: int = 10_000

    # Display
    tabulate_limit: int = 50
    head_default: int = 10

    # Undo
    max_undo_stack: int = 20
    max_undo_memory_mb: int = 500  # adaptive: skip snapshots if exceeds

    # Plotting
    plot_dpi: int = 150
    plot_figsize_w: float = 8.0
    plot_figsize_h: float = 5.0
    plot_style: str = "default"

    # Model
    condition_threshold: int = 30
    min_obs_per_predictor: int = 5
    bootstrap_iterations: int = 1000

    @staticmethod
    def _coerce_value(value: object, default: object) -> object:
        """Return *value* converted to the type of *default*.

        The field's default value is used as the type reference. Accepted
        conversions: ``int`` -> ``float`` for float fields, and integral
        floats (``150.0``) -> ``int`` for int fields. Booleans are never
        accepted for numeric fields even though ``bool`` subclasses ``int``.

        Raises:
            TypeError: if *value* does not fit the field's type.
        """
        if isinstance(default, bool):
            if isinstance(value, bool):
                return value
        elif isinstance(default, str):
            if isinstance(value, str):
                return value
        elif isinstance(default, (int, float)):
            if isinstance(value, bool):
                pass  # True/False is not a meaningful number here
            elif isinstance(default, float) and isinstance(value, (int, float)):
                return float(value)
            elif isinstance(value, int):
                return value
            elif isinstance(value, float) and value.is_integer():
                return int(value)
        else:
            # Unknown field type: nothing to validate against, keep as given.
            return value
        raise TypeError(
            f"expected {type(default).__name__}, got {type(value).__name__}"
        )

    @classmethod
    def load(cls) -> "Config":
        """Load config from the TOML file, falling back to defaults.

        Loading is best-effort: a missing file, a missing TOML parser, an
        unreadable or malformed file, and individual values of the wrong
        type all leave the corresponding defaults in place instead of
        raising.

        Keys inside a TOML table are matched as ``<table>_<key>``, so
        ``[plot] dpi = 200`` sets ``plot_dpi``. Top-level keys are matched
        by their own name.
        """
        cfg = cls()
        if not _CONFIG_FILE.exists():
            return cfg

        try:
            import tomllib
        except ImportError:
            try:
                import tomli as tomllib  # type: ignore[no-redef]
            except ImportError:
                return cfg  # no TOML parser available, use defaults

        try:
            with open(_CONFIG_FILE, "rb") as f:
                data = tomllib.load(f)
        except Exception:
            # Deliberately broad: a broken config file must never prevent
            # the application from starting.
            return cfg  # malformed config, use defaults

        # Flatten one level of sections into "<section>_<key>" names.
        flat: dict[str, object] = {}
        for section_key, section_val in data.items():
            if isinstance(section_val, dict):
                for k, v in section_val.items():
                    flat[f"{section_key}_{k}"] = v
            else:
                flat[section_key] = section_val

        # Apply known keys only; unknown keys in the file are ignored.
        for key in cfg.__dataclass_fields__:
            if key not in flat:
                continue
            try:
                setattr(cfg, key, cls._coerce_value(flat[key], getattr(cfg, key)))
            except (TypeError, ValueError):
                pass  # ignore invalid values, keep the default

        return cfg
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Singleton — created lazily on the first get_config() call, not at import time
|
|
85
|
+
_config: Config | None = None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def get_config() -> Config:
    """Return the shared configuration, loading it the first time it is needed."""
    global _config
    if _config is not None:
        return _config
    # First access: read the config file (or defaults) and cache the result.
    _config = Config.load()
    return _config
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def reset_config(override: Config | None = None) -> Config:
    """Replace the global config singleton and return the replacement.

    When *override* is provided it is installed as-is; when it is omitted a
    new ``Config()`` holding only the built-in defaults is installed (the
    config file is not read).

    Meant for tests that must not leak configuration into each other.
    """
    global _config
    if override is None:
        override = Config()
    _config = override
    return _config
|
openstat/dsl/__init__.py
ADDED
|
File without changes
|
openstat/dsl/parser.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""Safe recursive-descent parser: expression string -> Polars expression.
|
|
2
|
+
|
|
3
|
+
Grammar:
|
|
4
|
+
expr -> or_expr
|
|
5
|
+
or_expr -> and_expr ('or' and_expr)*
|
|
6
|
+
and_expr -> not_expr ('and' not_expr)*
|
|
7
|
+
not_expr -> 'not' not_expr | compare
|
|
8
|
+
compare -> add (comp_op add)?
|
|
9
|
+
add -> mul (('+' | '-') mul)*
|
|
10
|
+
mul -> power (('*' | '/' | '%') power)*
|
|
11
|
+
power -> unary ('**' unary)?
|
|
12
|
+
unary -> '-' unary | atom
|
|
13
|
+
atom -> NUMBER | STRING | func_call | IDENT | '(' expr ')'
|
|
14
|
+
func_call -> IDENT '(' args? ')'
|
|
15
|
+
args -> expr (',' expr)*
|
|
16
|
+
|
|
17
|
+
Produces a polars.Expr. No Python eval is ever used.
|
|
18
|
+
|
|
19
|
+
Supported functions (whitelisted):
|
|
20
|
+
Math: log, log10, sqrt, abs, round, exp
|
|
21
|
+
String: upper, lower, len_chars, strip, contains
|
|
22
|
+
Null: is_null, is_not_null, fill_null
|
|
23
|
+
Type: cast_float, cast_int, cast_str
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import math
|
|
29
|
+
|
|
30
|
+
import polars as pl
|
|
31
|
+
|
|
32
|
+
from openstat.dsl.tokenizer import TT, Token, tokenize
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ParseError(Exception):
    """Raised when an expression or formula string cannot be parsed."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ── Whitelisted functions ────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
def _apply_function(name: str, args: list[pl.Expr]) -> pl.Expr:
    """Apply a whitelisted function to already-parsed Polars expressions.

    Args:
        name: Function name exactly as written in the expression.
        args: Parsed argument expressions, in call order.

    Returns:
        The Polars expression representing the call.

    Raises:
        ParseError: if *name* is not whitelisted, if it is called with an
            unsupported number of arguments, or if the second argument of
            ``round()`` cannot be evaluated to an integer.
    """
    argc = len(args)

    # round() is the only function accepting either one or two arguments.
    if name == "round" and argc in (1, 2):
        decimals = 0
        if argc == 2:
            try:
                # The argument is evaluated on its own, without any data,
                # so it has to be a literal rather than a column reference.
                decimals = int(pl.select(args[1]).item())
            except Exception as exc:
                raise ParseError(
                    "round() second argument must be a literal integer"
                ) from exc
        return args[0].round(decimals)

    one_arg = {
        # Math
        "log": lambda e: e.log(math.e),
        "log10": lambda e: e.log(10),
        "sqrt": lambda e: e.sqrt(),
        "abs": lambda e: e.abs(),
        "exp": lambda e: e.exp(),
        # String (operate on the column)
        "upper": lambda e: e.str.to_uppercase(),
        "lower": lambda e: e.str.to_lowercase(),
        "len_chars": lambda e: e.str.len_chars(),
        "strip": lambda e: e.str.strip_chars(),
        # Null checks
        "is_null": lambda e: e.is_null(),
        "is_not_null": lambda e: e.is_not_null(),
        # Casts
        "cast_float": lambda e: e.cast(pl.Float64),
        "cast_int": lambda e: e.cast(pl.Int64),
        "cast_str": lambda e: e.cast(pl.Utf8),
    }
    two_arg = {
        "contains": lambda e, other: e.str.contains(other),
        "fill_null": lambda e, other: e.fill_null(other),
    }

    if argc == 1 and name in one_arg:
        return one_arg[name](args[0])
    if argc == 2 and name in two_arg:
        return two_arg[name](args[0], args[1])

    available = (
        "log, log10, sqrt, abs, exp, round, "
        "upper, lower, len_chars, strip, contains, "
        "is_null, is_not_null, fill_null, "
        "cast_float, cast_int, cast_str"
    )

    # A known name reaching this point was called with the wrong arity;
    # say so instead of claiming the function does not exist.
    if name == "round":
        expected = "1 or 2"
    elif name in one_arg:
        expected = "1"
    elif name in two_arg:
        expected = "2"
    else:
        expected = None
    if expected is not None:
        raise ParseError(
            f"Function '{name}' expects {expected} argument(s), got {argc}. "
            f"Available: {available}"
        )

    raise ParseError(
        f"Unknown function '{name}' with {argc} argument(s). "
        f"Available: {available}"
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ── Parser ───────────────────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
class _Parser:
    """Recursive-descent parser turning a token list into a ``pl.Expr``.

    Each grammar rule is one method; a rule consumes tokens from
    ``self.tokens`` starting at ``self.pos`` and returns the expression it
    recognised. The token list is expected to end with an ``EOF`` token.
    """

    # Comparison operators mapped to a builder, so that only the requested
    # comparison expression is constructed.
    _COMPARATORS = {
        ">": lambda lhs, rhs: lhs > rhs,
        "<": lambda lhs, rhs: lhs < rhs,
        ">=": lambda lhs, rhs: lhs >= rhs,
        "<=": lambda lhs, rhs: lhs <= rhs,
        "==": lambda lhs, rhs: lhs == rhs,
        "!=": lambda lhs, rhs: lhs != rhs,
    }

    def __init__(self, tokens: list[Token]) -> None:
        self.tokens = tokens
        self.pos = 0

    # -- helpers ---------------------------------------------------------

    def _peek(self) -> Token:
        """Return the current token without consuming it."""
        return self.tokens[self.pos]

    def _advance(self) -> Token:
        """Consume and return the current token."""
        tok = self.tokens[self.pos]
        self.pos += 1
        return tok

    def _expect(self, tt: TT, value: str | None = None) -> Token:
        """Consume the current token, requiring type *tt* (and *value*, if given).

        Raises:
            ParseError: if the consumed token does not match.
        """
        tok = self._advance()
        if tok.type != tt or (value is not None and tok.value != value):
            # Mention the value only when one was actually required.
            wanted = tt.name if value is None else f"{tt.name} {value!r}"
            raise ParseError(f"Expected {wanted}, got {tok}")
        return tok

    def _match_op(self, *ops: str) -> str | None:
        """Consume the current token if it is one of the operators *ops*.

        Returns the operator text, or ``None`` (consuming nothing) otherwise.
        """
        tok = self._peek()
        if tok.type == TT.OP and tok.value in ops:
            self._advance()
            return tok.value
        return None

    # -- grammar ---------------------------------------------------------

    def parse(self) -> pl.Expr:
        """Parse the whole token list; trailing tokens are an error."""
        expr = self._or_expr()
        if self._peek().type != TT.EOF:
            raise ParseError(f"Unexpected token: {self._peek()}")
        return expr

    def _or_expr(self) -> pl.Expr:
        """or_expr -> and_expr ('or' and_expr)*"""
        left = self._and_expr()
        while self._peek().type == TT.OR:
            self._advance()
            right = self._and_expr()
            left = left | right
        return left

    def _and_expr(self) -> pl.Expr:
        """and_expr -> not_expr ('and' not_expr)*"""
        left = self._not_expr()
        while self._peek().type == TT.AND:
            self._advance()
            right = self._not_expr()
            left = left & right
        return left

    def _not_expr(self) -> pl.Expr:
        """not_expr -> 'not' not_expr | compare"""
        if self._peek().type == TT.NOT:
            self._advance()
            return ~self._not_expr()
        return self._compare()

    def _compare(self) -> pl.Expr:
        """compare -> add (comp_op add)?  (comparisons do not chain)"""
        left = self._add()
        op = self._match_op(*self._COMPARATORS)
        if op is None:
            return left
        right = self._add()
        return self._COMPARATORS[op](left, right)

    def _add(self) -> pl.Expr:
        """add -> mul (('+' | '-') mul)*"""
        left = self._mul()
        while True:
            op = self._match_op("+", "-")
            if op is None:
                break
            right = self._mul()
            left = left + right if op == "+" else left - right
        return left

    def _mul(self) -> pl.Expr:
        """mul -> power (('*' | '/' | '%') power)*"""
        left = self._power()
        while True:
            op = self._match_op("*", "/", "%")
            if op is None:
                break
            right = self._power()
            if op == "*":
                left = left * right
            elif op == "/":
                left = left / right
            else:
                left = left % right
        return left

    def _power(self) -> pl.Expr:
        """power -> unary ('**' unary)?

        Both operands are ``unary``, so a leading minus binds to the base
        (``-x ** 2`` is ``(-x) ** 2``) and ``**`` does not chain.
        """
        base = self._unary()
        if self._match_op("**"):
            exp = self._unary()
            return base.pow(exp)
        return base

    def _unary(self) -> pl.Expr:
        """unary -> '-' unary | atom"""
        if self._match_op("-"):
            return -self._unary()
        return self._atom()

    def _atom(self) -> pl.Expr:
        """atom -> NUMBER | STRING | func_call | IDENT | '(' expr ')'"""
        tok = self._peek()

        if tok.type == TT.NUMBER:
            self._advance()
            # A decimal point selects float; everything else is an int.
            val = float(tok.value) if "." in tok.value else int(tok.value)
            return pl.lit(val)

        if tok.type == TT.STRING:
            self._advance()
            return pl.lit(tok.value)

        if tok.type == TT.IDENT:
            # An identifier directly followed by '(' is a function call;
            # otherwise it names a column.
            next_pos = self.pos + 1
            if next_pos < len(self.tokens) and self.tokens[next_pos].type == TT.LPAREN:
                return self._func_call()
            self._advance()
            return pl.col(tok.value)

        if tok.type == TT.LPAREN:
            self._advance()
            expr = self._or_expr()
            self._expect(TT.RPAREN)
            return expr

        raise ParseError(f"Unexpected token: {tok}")

    def _func_call(self) -> pl.Expr:
        """Parse function_name(arg1, arg2, ...)."""
        name_tok = self._advance()  # IDENT
        self._expect(TT.LPAREN)

        args: list[pl.Expr] = []
        if self._peek().type != TT.RPAREN:
            args.append(self._or_expr())
            while self._peek().type == TT.COMMA:
                self._advance()  # skip comma
                args.append(self._or_expr())

        self._expect(TT.RPAREN)
        return _apply_function(name_tok.value, args)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def parse_expression(text: str) -> pl.Expr:
    """Turn an expression string into a Polars Expr without using eval.

    The text is tokenized first and the resulting tokens are handed to the
    recursive-descent parser.
    """
    parser = _Parser(tokenize(text))
    return parser.parse()
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def parse_formula(text: str) -> tuple[str, list[str]]:
    """Split a formula such as 'y ~ x1 + x2 + x3' into (dep_var, [indep_vars]).

    In a formula '+' separates predictors; it is not arithmetic addition.

    Interaction syntax:
    - x1:x2  → interaction only (product term)
    - x1*x2  → full factorial = x1 + x2 + x1:x2

    Predictors are returned in first-appearance order without duplicates.

    Raises:
        ParseError: if '~' is missing, or either side of it is empty.
    """
    formula = text.strip()
    if "~" not in formula:
        raise ParseError("Formula must contain '~', e.g. y ~ x1 + x2")

    # Only the first '~' separates the two sides.
    lhs, _, rhs = formula.partition("~")
    dep = lhs.strip()
    if not dep:
        raise ParseError("Missing dependent variable before '~'")

    # x1*x2 has to become x1 + x2 + x1:x2 before the '+' split below.
    rhs = _expand_star_interactions(rhs)

    terms = [piece for piece in (raw.strip() for raw in rhs.split("+")) if piece]
    if not terms:
        raise ParseError("Missing independent variables after '~'")

    # Remove whitespace around ':' so 'x1 : x2' and 'x1:x2' are the same term.
    normalized = [
        ":".join(part.strip() for part in term.split(":")) for term in terms
    ]

    # dict keys keep insertion order, which gives an order-preserving dedup.
    return dep, list(dict.fromkeys(normalized))
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _expand_star_interactions(rhs: str) -> str:
|
|
311
|
+
"""Expand full-factorial ``*`` terms in a formula RHS string.
|
|
312
|
+
|
|
313
|
+
- ``x1*x2`` → ``x1 + x2 + x1:x2``
|
|
314
|
+
- ``x1*x2*x3`` → ``x1 + x2 + x3 + x1:x2 + x1:x3 + x2:x3 + x1:x2:x3``
|
|
315
|
+
"""
|
|
316
|
+
from itertools import combinations
|
|
317
|
+
|
|
318
|
+
terms = [t.strip() for t in rhs.split("+")]
|
|
319
|
+
expanded: list[str] = []
|
|
320
|
+
for term in terms:
|
|
321
|
+
if "*" in term and ":" not in term:
|
|
322
|
+
parts = [p.strip() for p in term.split("*")]
|
|
323
|
+
# Generate all subsets of size 1..len(parts)
|
|
324
|
+
for r in range(1, len(parts) + 1):
|
|
325
|
+
for combo in combinations(parts, r):
|
|
326
|
+
if r == 1:
|
|
327
|
+
expanded.append(combo[0])
|
|
328
|
+
else:
|
|
329
|
+
expanded.append(":".join(combo))
|
|
330
|
+
else:
|
|
331
|
+
expanded.append(term)
|
|
332
|
+
return " + ".join(expanded)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Safe tokenizer for OpenStat expressions.
|
|
2
|
+
|
|
3
|
+
Produces a flat list of typed tokens from a string expression.
|
|
4
|
+
No Python eval is ever used.
|
|
5
|
+
|
|
6
|
+
Supports:
|
|
7
|
+
- Backtick-quoted identifiers: `Column Name`, `income ($)`
|
|
8
|
+
- Function calls: log(x), sqrt(x), is_null(x)
|
|
9
|
+
- Standard operators and boolean keywords
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from enum import Enum, auto
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TT(Enum):
    """Kinds of token produced by the tokenizer."""

    # Literals and names
    NUMBER = 1
    STRING = 2
    IDENT = 3
    # Operators and punctuation
    OP = 4
    LPAREN = 5
    RPAREN = 6
    COMMA = 7
    # Boolean keywords
    AND = 8
    OR = 9
    NOT = 10
    # End-of-input marker appended after the last real token
    EOF = 11
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class Token:
    """An immutable token: its kind plus the text it carries."""

    type: TT
    value: str

    def __repr__(self) -> str:
        # Show the kind by name rather than the full enum repr.
        return "Token({}, {!r})".format(self.type.name, self.value)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Order matters: longer operators first.
|
|
45
|
+
_TOKEN_SPEC: list[tuple[TT | None, str]] = [
|
|
46
|
+
(None, r"\s+"), # skip whitespace
|
|
47
|
+
(TT.NUMBER, r"\d+(?:\.\d+)?"),
|
|
48
|
+
(TT.STRING, r'"[^"]*"|\'[^\']*\''),
|
|
49
|
+
(TT.IDENT, r"`[^`]+`"), # backtick-quoted identifiers
|
|
50
|
+
(TT.OP, r">=|<=|!=|==|>|<|\+|-|\*\*|\*|/|%"),
|
|
51
|
+
(TT.LPAREN, r"\("),
|
|
52
|
+
(TT.RPAREN, r"\)"),
|
|
53
|
+
(TT.COMMA, r","),
|
|
54
|
+
(TT.IDENT, r"[A-Za-z_][A-Za-z0-9_]*"),
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
_KEYWORDS = {"and": TT.AND, "or": TT.OR, "not": TT.NOT}
|
|
58
|
+
|
|
59
|
+
_PATTERN = re.compile(
|
|
60
|
+
"|".join(f"(?P<G{i}>{pat})" for i, (_, pat) in enumerate(_TOKEN_SPEC))
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def tokenize(text: str) -> list[Token]:
    """Split an expression string into Tokens, ending with an EOF token.

    Whitespace is discarded, quotes are removed from string literals and
    backticks from quoted identifiers. The bare words and/or/not are
    recognised case-insensitively and emitted as keyword tokens.

    Raises ValueError if the input contains unrecognized characters.
    """
    result: list[Token] = []
    cursor = 0
    for match in _PATTERN.finditer(text):
        # finditer silently skips text no pattern matches, so any gap
        # before this match is invalid input.
        begin = match.start()
        if begin > cursor:
            skipped = text[cursor:begin]
            raise ValueError(
                f"Unexpected character(s) at position {cursor}: {skipped!r}"
            )
        cursor = match.end()

        # Groups are named G<index>; the index selects the _TOKEN_SPEC entry.
        kind = _TOKEN_SPEC[int(match.lastgroup[1:])][0]
        if kind is None:
            continue  # whitespace — skip

        lexeme = match.group()
        if kind == TT.STRING:
            result.append(Token(kind, lexeme[1:-1]))  # strip quotes
        elif kind != TT.IDENT:
            result.append(Token(kind, lexeme))
        elif lexeme.startswith("`") and lexeme.endswith("`"):
            # Backtick-quoted names are never treated as keywords.
            result.append(Token(TT.IDENT, lexeme[1:-1]))
        else:
            lowered = lexeme.lower()
            if lowered in _KEYWORDS:
                result.append(Token(_KEYWORDS[lowered], lowered))
            else:
                result.append(Token(kind, lexeme))

    # Anything left after the final match (other than whitespace) is invalid.
    leftover = text[cursor:].strip()
    if leftover:
        raise ValueError(
            f"Unexpected character(s) at position {cursor}: {leftover!r}"
        )

    result.append(Token(TT.EOF, ""))
    return result
|
openstat/i18n.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Lightweight i18n: translate UI strings for OpenStat.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from openstat.i18n import t, set_locale
|
|
6
|
+
set_locale("tr")
|
|
7
|
+
print(t("no_data"))  # → "Veri kümesi yüklenmedi. Kullanım: load <yol>"
|
|
8
|
+
|
|
9
|
+
Supported locales: en (default), tr.
|
|
10
|
+
Additional locales can be registered at runtime via ``register_locale()``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
_LOCALE: str = "en"
|
|
16
|
+
|
|
17
|
+
_STRINGS: dict[str, dict[str, str]] = {
|
|
18
|
+
"en": {
|
|
19
|
+
# Generic errors
|
|
20
|
+
"no_data": "No dataset loaded. Use: load <path>",
|
|
21
|
+
"col_not_found": "Column not found: {col}",
|
|
22
|
+
"unknown_subcmd": "Unknown sub-command: {subcmd}",
|
|
23
|
+
# Data commands
|
|
24
|
+
"load_ok": "Loaded {rows:,} rows × {cols} columns from {path}",
|
|
25
|
+
"save_ok": "Saved to: {path}",
|
|
26
|
+
"describe_header": "Dataset: {name} | {rows:,} rows × {cols} columns",
|
|
27
|
+
"summarize_header": "Summary Statistics",
|
|
28
|
+
# Model results
|
|
29
|
+
"model_fitted": "{model} fitted. {info}",
|
|
30
|
+
"model_none": "No model fitted yet.",
|
|
31
|
+
# Export
|
|
32
|
+
"export_docx_ok": "Word document saved: {path}",
|
|
33
|
+
"export_pptx_ok": "PowerPoint saved: {path}",
|
|
34
|
+
# Session
|
|
35
|
+
"session_info_header": "Session Information",
|
|
36
|
+
"seed_set": "Seed set to {seed}. Reproducible random operations enabled.",
|
|
37
|
+
"seed_none": "No seed set.",
|
|
38
|
+
# Dashboard
|
|
39
|
+
"dashboard_closed": "Dashboard closed.",
|
|
40
|
+
"dashboard_missing": (
|
|
41
|
+
"textual is required for the dashboard.\n"
|
|
42
|
+
"Install: pip install textual"
|
|
43
|
+
),
|
|
44
|
+
# Misc
|
|
45
|
+
"undo_ok": "Undo successful. Restored previous dataset.",
|
|
46
|
+
"undo_fail": "Nothing to undo.",
|
|
47
|
+
},
|
|
48
|
+
"tr": {
|
|
49
|
+
# Generic errors
|
|
50
|
+
"no_data": "Veri kümesi yüklenmedi. Kullanım: load <yol>",
|
|
51
|
+
"col_not_found": "Sütun bulunamadı: {col}",
|
|
52
|
+
"unknown_subcmd": "Bilinmeyen alt komut: {subcmd}",
|
|
53
|
+
# Data commands
|
|
54
|
+
"load_ok": "{path} dosyasından {rows:,} satır × {cols} sütun yüklendi",
|
|
55
|
+
"save_ok": "Kaydedildi: {path}",
|
|
56
|
+
"describe_header": "Veri kümesi: {name} | {rows:,} satır × {cols} sütun",
|
|
57
|
+
"summarize_header": "Özet İstatistikler",
|
|
58
|
+
# Model results
|
|
59
|
+
"model_fitted": "{model} tahmin edildi. {info}",
|
|
60
|
+
"model_none": "Henüz model tahmin edilmedi.",
|
|
61
|
+
# Export
|
|
62
|
+
"export_docx_ok": "Word belgesi kaydedildi: {path}",
|
|
63
|
+
"export_pptx_ok": "PowerPoint kaydedildi: {path}",
|
|
64
|
+
# Session
|
|
65
|
+
"session_info_header": "Oturum Bilgisi",
|
|
66
|
+
"seed_set": "Başlangıç değeri {seed} olarak ayarlandı. Tekrarlanabilir rastgele işlemler etkin.",
|
|
67
|
+
"seed_none": "Başlangıç değeri ayarlanmadı.",
|
|
68
|
+
# Dashboard
|
|
69
|
+
"dashboard_closed": "Gösterge paneli kapatıldı.",
|
|
70
|
+
"dashboard_missing": (
|
|
71
|
+
"Gösterge paneli için textual gereklidir.\n"
|
|
72
|
+
"Kurulum: pip install textual"
|
|
73
|
+
),
|
|
74
|
+
# Misc
|
|
75
|
+
"undo_ok": "Geri alma başarılı. Önceki veri kümesi geri yüklendi.",
|
|
76
|
+
"undo_fail": "Geri alınacak bir şey yok.",
|
|
77
|
+
},
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def set_locale(locale: str) -> None:
    """Make *locale* (e.g. 'en', 'tr') the active locale.

    Raises ValueError if no strings are registered for *locale*.
    """
    global _LOCALE
    if locale in _STRINGS:
        _LOCALE = locale
        return
    known = ", ".join(_STRINGS)
    raise ValueError(f"Locale '{locale}' not available. Available: {known}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_locale() -> str:
    """Return the code of the locale currently used for translation."""
    return _LOCALE
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def register_locale(locale: str, strings: dict[str, str]) -> None:
    """Add translations for *locale*, creating the locale if it is new.

    For an existing locale the given entries are merged in, replacing any
    entries with the same key. Keys the locale does not define are looked
    up in English at translation time.
    """
    _STRINGS.setdefault(locale, {}).update(strings)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def t(key: str, **kwargs: object) -> str:
    """Translate *key* using the active locale, with optional format args.

    Lookup order: the active locale, then English, then the key itself.
    An empty translation counts as missing.

    When *kwargs* are given they are substituted into the template with
    ``str.format``. If substitution fails for any reason — a placeholder
    with no matching argument, a positional placeholder, or an argument
    whose type does not support the placeholder's format spec — the
    unformatted template is returned, so producing a UI message never
    raises.
    """
    active = _STRINGS.get(_LOCALE, {})
    template = active.get(key) or _STRINGS["en"].get(key) or key
    if not kwargs:
        return template
    try:
        return template.format(**kwargs)
    except (LookupError, ValueError, TypeError, AttributeError):
        # LookupError: missing named/positional field; ValueError/TypeError:
        # format spec not applicable to the value; AttributeError: "{a.b}".
        return template
|
openstat/io/__init__.py
ADDED
|
File without changes
|