openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""UX helpers: bookmark, history search, timer, multiline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import time
|
|
5
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# ── Bookmark ─────────────────────────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
@command("bookmark", usage="bookmark save|load|list|rm [name] [command]")
def cmd_bookmark(session: Session, args: str) -> str:
    """Save and recall frequently used commands as bookmarks.

    Bookmarks are kept in a dict stored on the session object
    (``session._bookmarks``), created on first use.

    Sub-commands:
      bookmark save <name> <command>  — save a command as bookmark
      bookmark load <name>            — execute a bookmarked command
      bookmark list                   — list all bookmarks
      bookmark rm <name>              — remove a bookmark

    Examples:
      bookmark save myols "ols income educ age"
      bookmark load myols
      bookmark list
      bookmark rm myols

    Returns:
        A status message, or — for ``load`` — whatever the bookmarked
        command returns when executed through ``run_command``.
    """
    if not hasattr(session, "_bookmarks"):
        session._bookmarks = {}  # type: ignore[attr-defined]
    bm: dict = session._bookmarks  # type: ignore[attr-defined]

    # At most three pieces: sub-command, name, and the remaining command text.
    tokens = args.strip().split(None, 2)
    if not tokens:
        return "Usage: bookmark save|load|list|rm [name] [command]"

    subcmd = tokens[0].lower()

    if subcmd == "list":
        if not bm:
            return "No bookmarks saved. Use: bookmark save <name> <command>"
        lines = ["Bookmarks:"]
        lines.extend(f" {name:<20} {cmd}" for name, cmd in bm.items())
        return "\n".join(lines)

    if subcmd == "rm":
        if len(tokens) < 2:
            return "Usage: bookmark rm <name>"
        name = tokens[1]
        if name in bm:
            del bm[name]
            return f"Bookmark '{name}' removed."
        return f"Bookmark '{name}' not found."

    if subcmd == "save":
        if len(tokens) < 3:
            return "Usage: bookmark save <name> <command>"
        name = tokens[1]
        cmd_str = tokens[2]
        # Remove only a matching pair of surrounding quotes.  A blanket
        # strip of quote characters would corrupt commands that merely end
        # (or start) with a quoted argument, e.g.  gen s = "abc".
        if len(cmd_str) >= 2 and cmd_str[0] == cmd_str[-1] and cmd_str[0] in "\"'":
            cmd_str = cmd_str[1:-1]
        if not cmd_str.strip():
            return "Usage: bookmark save <name> <command>"
        bm[name] = cmd_str
        return f"Bookmark '{name}' saved: {cmd_str}"

    if subcmd == "load":
        if len(tokens) < 2:
            return "Usage: bookmark load <name>"
        name = tokens[1]
        if name not in bm:
            return f"Bookmark '{name}' not found. Use 'bookmark list' to see saved."
        # Imported here, as in the original, rather than at module top.
        from openstat.commands.base import run_command
        return run_command(session, bm[name])

    return f"Unknown sub-command: {subcmd}"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── History search ────────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
@command("history search", usage="history search <pattern> [--n=20]")
def cmd_history_search(session: Session, args: str) -> str:
    """Search command history for matching entries.

    The match is a case-insensitive substring test against each entry of
    ``session.history``.  When more than ``--n`` entries match, the most
    recent ones are shown; the header still reports the total match count.

    Options:
      --n=<k>  maximum results to show (default: 20, must be >= 1)

    Examples:
      history search ols
      history search "export" --n=10
      history search plot

    Returns:
        The formatted result list, or a usage / error message.
    """
    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: history search <pattern>"

    pattern = " ".join(ca.positional).lower()

    try:
        top_n = int(ca.options.get("n", 20))
    except (TypeError, ValueError):
        return "Invalid --n value: expected a positive integer."
    # A limit of 0 would make the slice below ([-0:]) return *everything*,
    # and a negative limit would drop the oldest matches instead of limiting.
    if top_n < 1:
        return "Invalid --n value: expected a positive integer."

    history = getattr(session, "history", [])
    if not history:
        return "History is empty."

    # Keep the 1-based position of each entry so results can be referenced.
    matches = [
        (position, entry)
        for position, entry in enumerate(history, 1)
        if pattern in entry.lower()
    ]

    if not matches:
        return f"No history entries matching: {pattern}"

    lines = [f"History search: '{pattern}' ({len(matches)} matches)"]
    lines.extend(f" {idx:>5} {cmd}" for idx, cmd in matches[-top_n:])
    return "\n".join(lines)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@command("history show", usage="history show [--n=20]")
def cmd_history_show(session: Session, args: str) -> str:
    """Show recent command history.

    Entries are read from ``session.history`` and numbered by their 1-based
    position in the full history, not by their position in the output.

    Options:
      --n=<k>  number of recent commands (default: 20, must be >= 1)

    Examples:
      history show
      history show --n=50

    Returns:
        The formatted history listing, or an error message.
    """
    ca = CommandArgs(args)

    try:
        top_n = int(ca.options.get("n", 20))
    except (TypeError, ValueError):
        return "Invalid --n value: expected a positive integer."
    # history[-0:] is the whole list, and a negative limit slices from the
    # wrong end, so anything below 1 is rejected up front.
    if top_n < 1:
        return "Invalid --n value: expected a positive integer."

    history = getattr(session, "history", [])
    if not history:
        return "History is empty."

    recent = history[-top_n:]
    first_index = len(history) - len(recent) + 1
    lines = [f"Recent history ({len(recent)} of {len(history)} entries):"]
    lines.extend(f" {i:>5} {cmd}" for i, cmd in enumerate(recent, first_index))
    return "\n".join(lines)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ── Timer ────────────────────────────────────────────────────────────────────
|
|
136
|
+
|
|
137
|
+
@command("timer", usage="timer <command with args>")
def cmd_timer(session: Session, args: str) -> str:
    """Run a command and report how long it took (wall-clock).

    The command text is passed to ``run_command``; its output is returned
    followed by a separator and an ``Elapsed: ...`` footer.  The duration is
    shown in milliseconds below one second, in seconds below one minute, and
    as minutes plus seconds otherwise.

    Examples:
      timer ols income educ age
      timer bootstrap ols income educ age --reps=500
      timer describe
    """
    target = args.strip()
    if not target:
        return "Usage: timer <command> [args]"

    from openstat.commands.base import run_command

    t0 = time.perf_counter()
    output = run_command(session, target)
    seconds = time.perf_counter() - t0

    # Pick the display unit, largest first.
    if seconds >= 60:
        minutes, remainder = divmod(seconds, 60)
        shown = f"{int(minutes)}m {remainder:.1f}s"
    elif seconds >= 1:
        shown = f"{seconds:.3f} s"
    else:
        shown = f"{seconds * 1000:.1f} ms"

    # Only the first 60 characters of the command are echoed in the footer.
    footer = f"Elapsed: {shown} (command: {target[:60]})"
    return "\n".join([output, "", "-" * 50, footer])
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ── Multiline input helper ────────────────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
@command("multiline", usage="multiline")
def cmd_multiline(session: Session, args: str) -> str:
    """Show instructions for multiline input in the REPL.

    In the OpenStat REPL, you can use backslash continuation:
      ols income educ age \\
        --robust \\
        --cluster=state

    Or use a semicolon to chain commands on one line:
      describe; summarize income educ

    Or write commands to a .ost script file and run:
      run my_analysis.ost

    For interactive multiline, use the pipeline command:
      pipeline define myflow "ols y x" | "plot coef" | "export pdf"
      pipeline run myflow
    """
    # The docstring above doubles as the command's output, so its text is
    # user-facing.  Fall back to a short hint when no docstring is available.
    help_text = cmd_multiline.__doc__
    if help_text:
        return help_text
    return "See: pipeline define / run"
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Data validation commands: validate, fuzzyjoin."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re as _re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ── validate ─────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
@command("validate", usage="validate <col> <rule> [<rule> ...]")
def cmd_validate(session: Session, args: str) -> str:
    """Validate a column against one or more rules.

    Every rule is evaluated independently against the non-null values of the
    column (except ``notnull``, which counts the nulls).  A malformed rule is
    reported as a failed check and does not stop the remaining rules.

    Rules:
      min=<N>          — all values ≥ N
      max=<N>          — all values ≤ N
      notnull          — no missing values
      unique           — all values distinct
      positive         — all values > 0
      nonneg           — all values ≥ 0
      regex=<pattern>  — all values match regex (string columns)
      oneof=a,b,c      — all values in the allowed set
      between=lo,hi    — all values in [lo, hi]

    Examples:
      validate age min=0 max=120 notnull
      validate gender oneof=male,female,other
      validate email regex=^[^@]+@[^@]+\\.[^@]+$
      validate score between=0,100

    Returns:
        A PASS/FAIL report, a usage message, or the text produced by
        ``friendly_error`` when an unexpected exception occurs.
    """
    import polars as pl

    ca = CommandArgs(args)
    if len(ca.positional) < 2:
        return "Usage: validate <col> <rule> [<rule> ...]"

    col_name = ca.positional[0]
    rules = ca.positional[1:]

    try:
        df = session.require_data()
        if col_name not in df.columns:
            return f"Column not found: {col_name}"

        col = df[col_name]
        n_total = df.height
        failures: list[str] = []
        passes: list[str] = []

        NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                   pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)

        # Loop-invariant values, computed once instead of per rule.
        non_null = col.drop_nulls()
        is_numeric = col.dtype in NUMERIC

        for rule in rules:
            rule = rule.strip()

            if rule == "notnull":
                n_miss = col.null_count()
                if n_miss > 0:
                    failures.append(f"notnull: {n_miss} missing values")
                else:
                    passes.append("notnull ✓")

            elif rule == "unique":
                # Compare against the non-null count: measuring against the
                # full row count would report every null as a duplicate.
                n_dup = non_null.len() - non_null.n_unique()
                if n_dup > 0:
                    failures.append(f"unique: {n_dup} duplicate values")
                else:
                    passes.append("unique ✓")

            elif rule == "positive":
                if not is_numeric:
                    failures.append("positive: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null <= 0).len()
                    if n_bad:
                        failures.append(f"positive: {n_bad} values ≤ 0")
                    else:
                        passes.append("positive ✓")

            elif rule == "nonneg":
                if not is_numeric:
                    failures.append("nonneg: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null < 0).len()
                    if n_bad:
                        failures.append(f"nonneg: {n_bad} negative values")
                    else:
                        passes.append("nonneg ✓")

            elif rule.startswith("min="):
                try:
                    val = float(rule[4:])
                except ValueError:
                    failures.append(f"min: invalid number '{rule[4:]}'")
                    continue
                if not is_numeric:
                    failures.append(f"min={val}: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null < val).len()
                    if n_bad:
                        failures.append(f"min={val}: {n_bad} values below minimum")
                    else:
                        passes.append(f"min={val} ✓")

            elif rule.startswith("max="):
                try:
                    val = float(rule[4:])
                except ValueError:
                    failures.append(f"max: invalid number '{rule[4:]}'")
                    continue
                if not is_numeric:
                    failures.append(f"max={val}: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null > val).len()
                    if n_bad:
                        failures.append(f"max={val}: {n_bad} values above maximum")
                    else:
                        passes.append(f"max={val} ✓")

            elif rule.startswith("between="):
                parts = rule[8:].split(",")
                if len(parts) != 2:
                    failures.append("between: invalid format (use between=lo,hi)")
                    continue
                try:
                    lo, hi = float(parts[0]), float(parts[1])
                except ValueError:
                    failures.append("between: invalid format (use between=lo,hi)")
                    continue
                if not is_numeric:
                    failures.append(f"between={lo},{hi}: column is not numeric")
                else:
                    n_bad = non_null.filter((non_null < lo) | (non_null > hi)).len()
                    if n_bad:
                        failures.append(f"between={lo},{hi}: {n_bad} values out of range")
                    else:
                        passes.append(f"between={lo},{hi} ✓")

            elif rule.startswith("oneof="):
                allowed = set(rule[6:].split(","))
                # Values are compared as strings so the rule also works on
                # non-string columns.
                vals = non_null.cast(pl.Utf8).to_list()
                bad = [v for v in vals if v not in allowed]
                if bad:
                    sample = bad[:5]
                    failures.append(f"oneof: {len(bad)} invalid values, e.g. {sample}")
                else:
                    passes.append(f"oneof={rule[6:]} ✓")

            elif rule.startswith("regex="):
                pattern = rule[6:]
                try:
                    compiled = _re.compile(pattern)
                except _re.error as exc:
                    failures.append(f"regex: invalid pattern — {exc}")
                    continue
                vals = non_null.cast(pl.Utf8).to_list()
                # search() matches anywhere in the value; anchor the pattern
                # with ^...$ to require a full match.
                bad = [v for v in vals if not compiled.search(v)]
                if bad:
                    sample = bad[:3]
                    failures.append(f"regex: {len(bad)} non-matching values, e.g. {sample}")
                else:
                    passes.append("regex ✓")

            else:
                failures.append(f"Unknown rule: {rule}")

        lines = [f"Validation: {col_name} (N={n_total})", "=" * 50]
        lines.extend(f" PASS {msg}" for msg in passes)
        lines.extend(f" FAIL {msg}" for msg in failures)
        lines.append("=" * 50)
        if failures:
            lines.append(f"Result: {len(failures)} check(s) FAILED, {len(passes)} passed")
        else:
            lines.append(f"Result: All {len(passes)} check(s) PASSED")
        return "\n".join(lines)

    except Exception as e:
        return friendly_error(e, "validate")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ── fuzzyjoin ────────────────────────────────────────────────────────────────
|
|
176
|
+
|
|
177
|
+
@command("fuzzyjoin", usage="fuzzyjoin <other_file> on(<col>) [--threshold=80] [--method=ratio]")
def cmd_fuzzyjoin(session: Session, args: str) -> str:
    """Fuzzy (approximate) string join with another dataset.

    Matches rows by similarity of a string column, useful for messy data.
    For every value of the join column in the current dataset the best
    scoring value in the other dataset is looked up; it is accepted when its
    score reaches the threshold.  The current dataset gains ``_fuzzy_match``
    and ``_fuzzy_score`` columns and is left-joined with the other dataset,
    whose remaining columns are prefixed with ``_r_``.

    Options:
      on(<col>)        — column to match on (must exist in both datasets)
      --threshold=80   — minimum similarity score (0-100, default 80)
      --method=ratio   — scoring: ratio, partial_ratio, token_sort_ratio

    Examples:
      fuzzyjoin companies.csv on(name) --threshold=85
      fuzzyjoin lookup.parquet on(city) --method=token_sort_ratio

    Returns:
        A summary of the join, a usage / error message, or the text produced
        by ``friendly_error`` when loading or joining fails.
    """
    try:
        from rapidfuzz import fuzz, process
    except ImportError:
        return (
            "rapidfuzz is required for fuzzyjoin.\n"
            "Install: pip install rapidfuzz"
        )

    import polars as pl
    from openstat.io.loader import load_file

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: fuzzyjoin <other_file> on(<col>) [--threshold=80]"

    other_path = ca.positional[0]
    on_raw = ca.rest_after("on")
    if not on_raw:
        return "Specify join column: on(<col>)"
    on_col = on_raw.strip().strip("()")

    # Parse the threshold defensively: a malformed value used to escape as
    # an uncaught exception because it was converted outside any try block.
    try:
        threshold = float(ca.options.get("threshold", 80))
    except (TypeError, ValueError):
        return "Invalid --threshold value: expected a number between 0 and 100."

    scorers = {
        "ratio": fuzz.ratio,
        "partial_ratio": fuzz.partial_ratio,
        "token_sort_ratio": fuzz.token_sort_ratio,
    }
    method_name = ca.options.get("method", "ratio")
    scorer = scorers.get(method_name)
    method_note = ""
    if scorer is None:
        # Keep the historical fallback to plain ratio, but say so instead of
        # silently using a scorer the user did not ask for.
        scorer = fuzz.ratio
        method_note = f" Unknown method '{method_name}'; used 'ratio'."

    try:
        df_left = session.require_data()
        df_right = load_file(other_path)

        if on_col not in df_left.columns:
            return f"Column '{on_col}' not in current dataset."
        if on_col not in df_right.columns:
            return f"Column '{on_col}' not in {other_path}."

        left_vals = df_left[on_col].cast(pl.Utf8).to_list()
        right_vals = df_right[on_col].cast(pl.Utf8).to_list()

        # For each left value find the best match in right.  Results are
        # cached per distinct value so repeated keys are scored only once.
        cache: dict = {}
        best_match = []
        best_score = []
        for lv in left_vals:
            if lv is None:
                best_match.append(None)
                best_score.append(0.0)
                continue
            if lv not in cache:
                hit = process.extractOne(lv, right_vals, scorer=scorer)
                if hit is None:
                    cache[lv] = (None, 0.0)
                elif hit[1] >= threshold:
                    cache[lv] = (hit[0], float(hit[1]))
                else:
                    # Below threshold: no match, but keep the score for
                    # inspection in the _fuzzy_score column.
                    cache[lv] = (None, float(hit[1]))
            matched, score = cache[lv]
            best_match.append(matched)
            best_score.append(score)

        # Explicit dtypes: with no matches at all the match column would
        # otherwise be inferred as Null and could not be joined on.
        df_left = df_left.with_columns([
            pl.Series("_fuzzy_match", best_match, dtype=pl.Utf8),
            pl.Series("_fuzzy_score", best_score, dtype=pl.Float64),
        ])

        # Join right dataset on the matched value.  Matches are strings, so
        # the right-hand key is cast to string as well to keep the join key
        # dtypes identical.
        df_right_renamed = (
            df_right.rename(
                {c: f"_r_{c}" for c in df_right.columns if c != on_col}
            )
            .rename({on_col: "_fuzzy_match"})
            .with_columns(pl.col("_fuzzy_match").cast(pl.Utf8))
        )

        joined = df_left.join(df_right_renamed, on="_fuzzy_match", how="left")
        session.snapshot()
        session.df = joined

        # Count actual matches; comparing scores to the threshold would count
        # null rows (score 0.0) as matched whenever the threshold is <= 0.
        n_matched = sum(1 for m in best_match if m is not None)
        return (
            f"Fuzzy join complete. {n_matched}/{df_left.height} rows matched "
            f"(threshold={threshold:.0f}). New shape: {session.shape_str}"
            f"{method_note}"
        )

    except Exception as e:
        return friendly_error(e, "fuzzyjoin")
|