openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,996 @@
|
|
|
1
|
+
"""Data manipulation commands: load, describe, head, filter, select, derive, dropna, sort, rename, count, tail, merge, pivot, melt, sample, replace, duplicates, unique, encode, recode."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
from openstat.session import Session
|
|
12
|
+
from openstat.config import get_config
|
|
13
|
+
from openstat.io.loader import load_file, save_file
|
|
14
|
+
from openstat.dsl.parser import parse_expression, ParseError
|
|
15
|
+
from openstat.commands.base import command, CommandArgs, rich_to_str, friendly_error
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@command("load", usage="load <path> [sheet=<name|index|list>]")
def cmd_load(session: Session, args: str) -> str:
    """Load a dataset from CSV, Parquet, Stata (.dta), or Excel (.xlsx) file.

    For Excel files, specify a sheet with sheet=<name|index>.
    Use sheet=list to see available sheet names.

    Examples:
        load data.csv
        load results.xlsx
        load workbook.xlsx sheet=Sheet2
        load workbook.xlsx sheet=1
        load workbook.xlsx sheet=list
    """
    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: load <path> [sheet=<name|index|list>]"
    path = ca.positional[0]
    # Optional sheet= selector; only honoured for Excel extensions below.
    sheet = ca.options.get("sheet")

    if sheet is not None and path.lower().endswith((".xlsx", ".xls")):
        from openstat.io.loader import _load_excel
        from pathlib import Path as _Path
        # sheet=list → just show sheet names
        if sheet == "list":
            try:
                _load_excel(_Path(path), sheet="list")
            except ValueError as exc:
                # The sheet listing is delivered via this ValueError's message.
                # NOTE(review): if _load_excel ever returns normally for
                # sheet="list", control falls through and "list" is treated as
                # a literal sheet name below — confirm against the loader.
                return str(exc)
        # numeric index
        sheet_arg: str | int = int(sheet) if sheet.isdigit() else sheet
        session.df = _load_excel(_Path(path), sheet=sheet_arg)
    else:
        # Non-Excel (or no sheet option): delegate format detection to loader.
        session.df = load_file(path, session=session)

    session.dataset_path = path
    # NOTE(review): assumes '/' path separators; a Windows-style path with
    # backslashes keeps the full string as the dataset name.
    session.dataset_name = path.split("/")[-1]
    # A fresh load invalidates any prior undo history.
    session._undo_stack.clear()
    return f"Loaded {session.shape_str} from {path}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@command("labels", usage="labels [column]")
def cmd_labels(session: Session, args: str) -> str:
    """Show variable/value labels from SAS, SPSS, or Stata files.

    With a column argument, lists that column's value-label mapping;
    without one, renders a summary table of all labelled columns.
    """
    labels = session._variable_labels
    if not labels:
        return "No variable labels available. Labels are loaded from .dta, .sav, or .sas7bdat files."
    col = args.strip()
    if col:
        if col not in labels:
            return f"No labels for column '{col}'. Columns with labels: {', '.join(labels.keys())}"
        mapping = labels[col]
        out = [f"Labels for '{col}':"]
        # Sort by stringified value so mixed-type keys compare safely.
        out.extend(
            f" {val} = {lbl}"
            for val, lbl in sorted(mapping.items(), key=lambda x: str(x[0]))
        )
        return "\n".join(out)

    def render(console: Console) -> None:
        table = Table(title="Variable Value Labels")
        table.add_column("Column", style="cyan")
        table.add_column("# Labels", justify="right")
        table.add_column("Sample", style="dim")
        for var, mapping in sorted(labels.items()):
            # Preview at most three value=label pairs per column.
            preview = list(mapping.items())[:3]
            sample = ", ".join(f"{k}={v}" for k, v in preview)
            if len(mapping) > 3:
                sample += ", ..."
            table.add_row(var, str(len(mapping)), sample)
        console.print(table)

    return rich_to_str(render)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@command("describe", usage="describe")
def cmd_describe(session: Session, args: str) -> str:
    """Show dataset structure: columns, types, missing values."""
    df = session.require_data()

    def render(console: Console) -> None:
        table = Table(title=f"Dataset: {session.dataset_name or '(unnamed)'}")
        # Header layout: name/type styled, the three counts right-aligned.
        for header, kwargs in (
            ("Column", {"style": "cyan"}),
            ("Type", {"style": "green"}),
            ("Non-null", {"justify": "right"}),
            ("Missing", {"justify": "right"}),
            ("Unique", {"justify": "right"}),
        ):
            table.add_column(header, **kwargs)

        for name in df.columns:
            series = df[name]
            missing = series.null_count()
            table.add_row(
                name,
                str(series.dtype),
                str(df.height - missing),
                str(missing),
                str(series.n_unique()),
            )
        console.print(table)
        console.print(f"Shape: {df.height:,} rows x {df.width} columns")

    return rich_to_str(render)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@command("head", usage="head [N]")
def cmd_head(session: Session, args: str) -> str:
    """Show first N rows (default from config)."""
    df = session.require_data()
    raw = args.strip()
    if raw:
        try:
            n = int(raw)
        except ValueError:
            return "Usage: head [N]"
    else:
        # No explicit count: fall back to the configured default.
        n = get_config().head_default

    def render(console: Console) -> None:
        table = Table(title=f"First {min(n, df.height)} rows")
        for name in df.columns:
            table.add_column(name)
        for record in df.head(n).iter_rows():
            table.add_row(*map(str, record))
        console.print(table)

    return rich_to_str(render)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@command("tail", usage="tail [N]")
def cmd_tail(session: Session, args: str) -> str:
    """Show last N rows (default 10)."""
    df = session.require_data()
    raw = args.strip()
    if raw:
        try:
            n = int(raw)
        except ValueError:
            return "Usage: tail [N]"
    else:
        n = 10  # fixed default, unlike head which reads config

    def render(console: Console) -> None:
        table = Table(title=f"Last {min(n, df.height)} rows")
        for name in df.columns:
            table.add_column(name)
        for record in df.tail(n).iter_rows():
            table.add_row(*map(str, record))
        console.print(table)

    return rich_to_str(render)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@command("count", usage="count")
def cmd_count(session: Session, args: str) -> str:
    """Show the number of rows and columns."""
    data = session.require_data()
    return f"{data.height:,} rows x {data.width} columns"
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@command("filter", usage="filter <expression>")
def cmd_filter(session: Session, args: str) -> str:
    """Filter rows using an expression. Use 'undo' to revert."""
    df = session.require_data()
    if not args.strip():
        return "Usage: filter <expression> (e.g. filter age > 30)"
    try:
        predicate = parse_expression(args)
        rows_before = df.height
        # Snapshot only after the expression parses, so a bad expression
        # does not pollute the undo stack.
        session.snapshot()
        session.df = df.filter(predicate)
        rows_after = session.df.height
        return (
            f"Filtered: {rows_before:,} -> {rows_after:,} rows "
            f"({rows_before - rows_after:,} dropped). Use 'undo' to revert."
        )
    except ParseError as e:
        return f"Parse error: {e}"
    except Exception as e:
        return friendly_error(e, "Filter error")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@command("select", usage="select <col1> <col2> ...")
def cmd_select(session: Session, args: str) -> str:
    """Select specific columns. Use 'undo' to revert."""
    df = session.require_data()
    wanted = args.split()
    if not wanted:
        return "Usage: select <col1> <col2> ..."
    unknown = [c for c in wanted if c not in df.columns]
    if unknown:
        return f"Columns not found: {', '.join(unknown)}"
    session.snapshot()
    session.df = df.select(wanted)
    return f"Selected {len(wanted)} columns. Shape: {session.shape_str}"
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@command("derive", usage="derive <newcol> = <expression>")
def cmd_derive(session: Session, args: str) -> str:
    """Create a new column from an expression. Use 'undo' to revert."""
    df = session.require_data()
    if "=" not in args:
        return "Usage: derive <newcol> = <expression>"
    # Split on the FIRST '=' only: the expression itself may contain '='.
    target, _, rhs = args.partition("=")
    target = target.strip()
    if not target:
        return "Usage: derive <newcol> = <expression>"
    try:
        expr = parse_expression(rhs.strip())
        session.snapshot()
        session.df = df.with_columns(expr.alias(target))
        return f"Created column '{target}'. Shape: {session.shape_str}"
    except ParseError as e:
        return f"Parse error: {e}"
    except Exception as e:
        return friendly_error(e, "Derive error")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@command("dropna", usage="dropna [col1 col2 ...]")
def cmd_dropna(session: Session, args: str) -> str:
    """Drop rows with missing values. Use 'undo' to revert.

    With column arguments, only nulls in those columns count;
    without arguments, a row with any null is dropped.
    """
    df = session.require_data()
    subset = args.split() if args.strip() else None
    rows_before = df.height
    if subset:
        unknown = [c for c in subset if c not in df.columns]
        if unknown:
            return f"Columns not found: {', '.join(unknown)}"
    session.snapshot()
    session.df = df.drop_nulls(subset=subset) if subset else df.drop_nulls()
    rows_after = session.df.height
    return f"Dropped nulls: {rows_before:,} -> {rows_after:,} rows ({rows_before - rows_after:,} removed)"
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@command("sort", usage="sort <col> [--desc]")
def cmd_sort(session: Session, args: str) -> str:
    """Sort dataset by one or more columns. Use --desc for descending."""
    df = session.require_data()
    ca = CommandArgs(args)
    desc = ca.has_flag("--desc")
    sort_cols = ca.positional
    if not sort_cols:
        return "Usage: sort <col1> [col2 ...] [--desc]"
    unknown = [c for c in sort_cols if c not in df.columns]
    if unknown:
        return f"Columns not found: {', '.join(unknown)}"
    session.snapshot()
    # The single --desc flag applies to every sort key.
    session.df = df.sort(sort_cols, descending=desc)
    label = "descending" if desc else "ascending"
    return f"Sorted by {', '.join(sort_cols)} ({label}). {session.shape_str}"
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@command("rename", usage="rename <old> <new>")
def cmd_rename(session: Session, args: str) -> str:
    """Rename a column."""
    df = session.require_data()
    tokens = args.split()
    if len(tokens) != 2:
        return "Usage: rename <old_name> <new_name>"
    src, dst = tokens
    if src not in df.columns:
        return f"Column not found: {src}"
    # Refuse to silently clobber an existing column.
    if dst in df.columns:
        return f"Column already exists: {dst}"
    session.snapshot()
    session.df = df.rename({src: dst})
    return f"Renamed '{src}' -> '{dst}'"
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
@command("undo", usage="undo")
def cmd_undo(session: Session, args: str) -> str:
    """Undo the last data-modifying command (filter, select, derive, dropna, sort, rename)."""
    # Session.undo() reports whether anything was restored.
    if not session.undo():
        return "Nothing to undo."
    return f"Undone. Data restored: {session.shape_str} (undo stack: {session.undo_depth} remaining)"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@command("save", usage="save <path.csv|path.parquet>")
def cmd_save(session: Session, args: str) -> str:
    """Save current dataset to CSV or Parquet."""
    df = session.require_data()
    target = args.strip()
    if not target:
        return "Usage: save <path.csv|path.parquet>"
    try:
        written = save_file(df, target)
    except Exception as e:
        return f"Save error: {e}"
    return f"Saved {session.shape_str} to {written}"
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
@command("merge", usage="merge <path> on <key> [how=left|right|inner|outer]")
def cmd_merge(session: Session, args: str) -> str:
    """Merge (join) current dataset with another file on a key column.

    Syntax: ``merge <path> on <key_col> [how=...]``. The join type defaults
    to inner; "outer" is translated to polars' "full".
    """
    df = session.require_data()
    ca = CommandArgs(args)
    how = ca.get_option("how", "inner")
    on_rest = ca.rest_after("on")
    if not on_rest:
        return "Usage: merge <path> on <key_col> [how=left|right|inner|outer]"

    # file_path is everything before "on" keyword (with options stripped)
    # NOTE(review): a file path containing the word "on" (or spaces) will be
    # split incorrectly here — confirm whether quoting is supported upstream.
    clean = ca.strip_flags_and_options()
    before_on = re.split(r"\bon\b", clean, maxsplit=1)
    file_path = before_on[0].strip()
    key_col = on_rest.split()[0] if on_rest.strip() else ""
    if not file_path or not key_col:
        return "Usage: merge <path> on <key_col> [how=left|right|inner|outer]"

    # Polars uses "full" instead of "outer"
    if how == "outer":
        how = "full"
    valid_how = {"left", "right", "inner", "full", "cross"}
    if how not in valid_how:
        return f"Invalid join type: {how}. Use: {', '.join(sorted(valid_how))}"

    try:
        # Unlike cmd_load, no session is passed — variable labels from the
        # merge file are not captured.
        other = load_file(file_path)
    except Exception as e:
        return f"Cannot load merge file: {e}"

    if key_col not in df.columns:
        return f"Key column '{key_col}' not found in current dataset"
    if key_col not in other.columns:
        return f"Key column '{key_col}' not found in merge file"

    session.snapshot()
    before = df.height
    # Overlapping non-key columns from the right side get a "_right" suffix.
    session.df = df.join(other, on=key_col, how=how, suffix="_right")
    after = session.df.height

    return (
        f"Merged ({how}): {before:,} + {other.height:,} -> {after:,} rows, "
        f"{session.df.width} columns. Use 'undo' to revert."
    )
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
@command("pivot", usage="pivot <value_col> by <col_col> [over <row_col>] [agg=mean|sum|count|first]")
def cmd_pivot(session: Session, args: str) -> str:
    """Pivot (reshape to wide format). Optional aggregation function.

    Syntax: ``pivot <value_col> by <column_col> [over <index_col>] [agg=...]``.
    Without ``over``, all remaining columns serve as the row index.
    """
    df = session.require_data()
    ca = CommandArgs(args)
    agg_func = ca.get_option("agg", "first")

    # User-facing agg names -> polars aggregate_function names.
    AGG_MAP = {
        "first": "first",
        "mean": "mean",
        "sum": "sum",
        "count": "len",
        "min": "min",
        "max": "max",
    }
    if agg_func not in AGG_MAP:
        return f"Unknown aggregation: {agg_func}. Available: {', '.join(AGG_MAP)}"

    # Work with cleaned args (options/flags removed), split on word-boundary "by"
    clean = ca.strip_flags_and_options()
    by_parts = re.split(r"\bby\b", clean, maxsplit=1)
    if len(by_parts) < 2:
        return "Usage: pivot <value_col> by <column_col> [over <index_col>] [agg=mean|sum|count|first]"

    value_col = by_parts[0].strip()
    rest = by_parts[1].strip()

    # Check for "over" keyword with word boundaries
    over_parts = re.split(r"\bover\b", rest, maxsplit=1)
    col_col = over_parts[0].strip()
    index_col = over_parts[1].strip() if len(over_parts) > 1 else None

    if not value_col or not col_col:
        return "Usage: pivot <value_col> by <column_col> [over <index_col>] [agg=mean|sum|count|first]"

    for c in [value_col, col_col] + ([index_col] if index_col else []):
        if c not in df.columns:
            return f"Column not found: {c}"

    # Resolve the row index BEFORE snapshotting: previously the
    # "no index column" early-return happened after snapshot(), leaving a
    # spurious no-op entry on the undo stack.
    pivot_kwargs: dict = dict(
        on=col_col, values=value_col,
        aggregate_function=AGG_MAP[agg_func],
    )
    if index_col:
        pivot_kwargs["index"] = index_col
    else:
        others = [c for c in df.columns if c not in (value_col, col_col)]
        if not others:
            return "Need at least one column to serve as row index"
        pivot_kwargs["index"] = others

    session.snapshot()
    try:
        session.df = df.pivot(**pivot_kwargs)
        return f"Pivoted to wide format: {session.shape_str}. Use 'undo' to revert."
    except Exception as e:
        # Roll back the snapshot so a failed pivot leaves no undo entry.
        session.undo()
        return friendly_error(e, "Pivot error")
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
@command("melt", usage="melt <id_cols>, <value_cols> [var_name=X] [value_name=Y]")
def cmd_melt(session: Session, args: str) -> str:
    """Melt (reshape to long format). Separate id and value cols with a comma."""
    df = session.require_data()
    ca = CommandArgs(args)
    var_name = ca.get_option("var_name", "variable")
    value_name = ca.get_option("value_name", "value")

    usage = "Usage: melt <id_col1> <id_col2>, <val_col1> <val_col2> [var_name=X] [value_name=Y]"
    clean = ca.strip_flags_and_options()
    if "," not in clean:
        return usage

    # The comma separates identifier columns from value columns.
    id_part, _, val_part = clean.partition(",")
    id_cols = id_part.split()
    val_cols = val_part.split()
    if not id_cols or not val_cols:
        return usage

    for name in id_cols + val_cols:
        if name not in df.columns:
            return f"Column not found: {name}"

    session.snapshot()
    try:
        session.df = df.unpivot(
            on=val_cols,
            index=id_cols,
            variable_name=var_name,
            value_name=value_name,
        )
        return f"Melted to long format: {session.shape_str}. Use 'undo' to revert."
    except Exception as e:
        # Roll back the snapshot taken above on failure.
        session.undo()
        return friendly_error(e, "Melt error")
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
@command("sample", usage="sample <N|N%>")
def cmd_sample(session: Session, args: str) -> str:
    """Take a random sample: N rows or N% of data."""
    df = session.require_data()
    spec = args.strip()
    if not spec:
        return "Usage: sample <N> or sample <N%>"

    try:
        if spec.endswith("%"):
            share = float(spec[:-1])
            if not (0 < share <= 100):
                return "Percentage must be between 0 and 100"
            # At least one row, even for tiny percentages.
            n = max(1, int(df.height * share / 100))
        else:
            n = int(spec)
            if n <= 0:
                return "Sample size must be positive"
    except ValueError:
        return "Usage: sample <N> or sample <N%>"

    n = min(n, df.height)  # never request more rows than exist
    session.snapshot()
    session.df = df.sample(n=n, shuffle=True)
    return f"Sampled {n:,} rows from {df.height:,}. {session.shape_str}. Use 'undo' to revert."
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
@command("replace", usage="replace <col> <old_value> <new_value>")
def cmd_replace(session: Session, args: str) -> str:
    """Replace values in a column.

    Values are compared as floats for numeric columns and as raw strings
    otherwise; surrounding quotes on either value are stripped first.
    """
    df = session.require_data()
    parts = args.split(None, 2)
    if len(parts) < 3:
        return "Usage: replace <col> <old_value> <new_value>"

    col, old_val, new_val = parts[0], parts[1], parts[2]
    if col not in df.columns:
        return f"Column not found: {col}"

    # Strip quotes from values
    old_val = old_val.strip("\"'")
    new_val = new_val.strip("\"'")

    dtype = df[col].dtype
    session.snapshot()

    try:
        if dtype in (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64):
            old_v = float(old_val)
            new_v = float(new_val)
            # Count matches first so the report reflects the pre-replacement state.
            count = df.filter(pl.col(col) == old_v).height
            # NOTE(review): the replacement literal is always a float — this
            # may promote an integer column to a float dtype; confirm whether
            # that is intended.
            session.df = df.with_columns(
                pl.when(pl.col(col) == old_v).then(pl.lit(new_v)).otherwise(pl.col(col)).alias(col)
            )
        else:
            count = df.filter(pl.col(col) == old_val).height
            session.df = df.with_columns(
                pl.when(pl.col(col) == old_val).then(pl.lit(new_val)).otherwise(pl.col(col)).alias(col)
            )
        return f"Replaced {count:,} occurrence(s) in '{col}'. Use 'undo' to revert."
    except Exception as e:
        # Pop the snapshot taken above so a failed replace leaves no undo entry.
        session.undo()
        return friendly_error(e, "Replace error")
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
@command("duplicates", usage="duplicates [drop] [col1 col2 ...]")
def cmd_duplicates(session: Session, args: str) -> str:
    """Find or drop duplicate rows. Use 'drop' to remove them.

    With column arguments, duplicates are judged on those columns only;
    otherwise the whole row must match.
    """
    df = session.require_data()
    tokens = args.split()
    do_drop = "drop" in tokens
    cols = [t for t in tokens if t != "drop"]

    subset = None
    if cols:
        unknown = [c for c in cols if c not in df.columns]
        if unknown:
            return f"Columns not found: {', '.join(unknown)}"
        subset = cols

    if not do_drop:
        # Report-only mode: count without modifying the session.
        n_dups = df.height - df.unique(subset=subset, keep="first").height
        if n_dups == 0:
            suffix = f" (on {', '.join(cols)})" if cols else ""
            return f"No duplicates found{suffix}."
        return f"Found {n_dups:,} duplicate row(s). Use 'duplicates drop' to remove them."

    rows_before = df.height
    session.snapshot()
    session.df = df.unique(subset=subset, keep="first")
    rows_after = session.df.height
    removed = rows_before - rows_after
    return f"Dropped {removed:,} duplicate(s): {rows_before:,} -> {rows_after:,} rows. Use 'undo' to revert."
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
@command("unique", usage="unique <col>")
def cmd_unique(session: Session, args: str) -> str:
    """Show unique values of a column."""
    df = session.require_data()
    name = args.strip()
    if not name:
        return "Usage: unique <col>"
    if name not in df.columns:
        return f"Column not found: {name}"

    values = df[name].unique().sort().to_list()
    total = len(values)
    if total > 50:
        # Truncate long listings to the first 50 sorted values.
        preview = ", ".join(str(v) for v in values[:50])
        return f"{name}: {total} unique values (showing first 50):\n{preview} ..."
    return f"{name}: {total} unique values:\n{', '.join(str(v) for v in values)}"
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
@command("encode", usage="encode <col> [as <new_col>]")
def cmd_encode(session: Session, args: str) -> str:
    """Encode a string column as numeric codes (label encoding).

    Codes follow sorted order of the distinct values. The encoded column
    defaults to ``<col>_code`` unless ``as <new_col>`` is given.
    """
    df = session.require_data()
    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: encode <col> [as <new_col>]"

    source = ca.positional[0]
    if source not in df.columns:
        return f"Column not found: {source}"

    as_rest = ca.rest_after("as")
    target = as_rest.split()[0] if as_rest else source + "_code"

    # Sorted unique values get codes 0, 1, 2, ...
    levels = df[source].unique().sort().to_list()
    codes = {value: idx for idx, value in enumerate(levels)}

    session.snapshot()
    session.df = df.with_columns(
        pl.col(source).replace_strict(codes).cast(pl.Int64).alias(target)
    )

    report = [f"Encoded '{source}' -> '{target}' ({len(levels)} levels):"]
    for value, code in list(codes.items())[:20]:
        report.append(f" {code} = {value}")
    if len(codes) > 20:
        report.append(f" ... ({len(codes) - 20} more)")
    report.append("Use 'undo' to revert.")
    return "\n".join(report)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
@command("recode", usage='recode <col> "old1"=new1 "old2"=new2 ...')
def cmd_recode(session: Session, args: str) -> str:
    """Recode values in a column using mappings.

    Each token after the column name is an ``old=new`` pair; values may be
    quoted. Numeric columns require both sides to parse as numbers.
    """
    df = session.require_data()
    if "=" not in args:
        return 'Usage: recode <col> "old1"=new1 "old2"=new2 ...'

    parts = args.split(None, 1)
    if len(parts) < 2:
        return 'Usage: recode <col> "old1"=new1 "old2"=new2 ...'

    col = parts[0]
    if col not in df.columns:
        return f"Column not found: {col}"

    # Parse mapping pairs
    mapping_str = parts[1]
    mapping: dict[str, str] = {}
    import shlex
    try:
        # shlex honours quoting so values may contain spaces.
        tokens = shlex.split(mapping_str)
    except ValueError:
        # Unbalanced quotes: fall back to naive whitespace splitting.
        tokens = mapping_str.split()

    for token in tokens:
        if "=" not in token:
            return f"Invalid mapping: {token}. Use old=new format."
        old, new = token.split("=", 1)
        old = old.strip("\"'")
        new = new.strip("\"'")
        mapping[old] = new

    if not mapping:
        return "No mappings provided."

    session.snapshot()
    dtype = df[col].dtype
    is_numeric = dtype in (
        pl.Float32, pl.Float64,
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
    )

    # Build a chained when/otherwise expression. Each condition tests the
    # ORIGINAL column, so mapping order does not affect which rows match.
    expr = pl.col(col)
    for old_val, new_val in mapping.items():
        if is_numeric:
            try:
                old_v = float(old_val)
                new_v = float(new_val)
                # NOTE(review): replacement literals are floats — this may
                # promote an integer column to a float dtype; confirm intent.
                expr = pl.when(pl.col(col) == old_v).then(pl.lit(new_v)).otherwise(expr)
            except ValueError:
                # Pop the snapshot taken above; the data was never modified.
                session.undo()
                return f"Cannot convert '{old_val}' or '{new_val}' to number for numeric column '{col}'"
        else:
            expr = pl.when(pl.col(col) == old_val).then(pl.lit(new_val)).otherwise(expr)

    session.df = df.with_columns(expr.alias(col))
    return f"Recoded {len(mapping)} value(s) in '{col}'. Use 'undo' to revert."
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
@command("fillna", usage="fillna <col> <strategy>")
|
|
646
|
+
def cmd_fillna(session: Session, args: str) -> str:
|
|
647
|
+
"""Fill missing values: mean, median, mode, forward, backward, or value=N."""
|
|
648
|
+
df = session.require_data()
|
|
649
|
+
parts = args.split()
|
|
650
|
+
if len(parts) < 2:
|
|
651
|
+
return (
|
|
652
|
+
"Usage: fillna <col> <strategy>\n"
|
|
653
|
+
"Strategies: mean, median, mode, forward, backward, value=N"
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
col = parts[0]
|
|
657
|
+
strategy = parts[1]
|
|
658
|
+
if col not in df.columns:
|
|
659
|
+
return f"Column not found: {col}"
|
|
660
|
+
|
|
661
|
+
null_count = df[col].null_count()
|
|
662
|
+
if null_count == 0:
|
|
663
|
+
return f"No missing values in '{col}'."
|
|
664
|
+
|
|
665
|
+
session.snapshot()
|
|
666
|
+
|
|
667
|
+
try:
|
|
668
|
+
if strategy == "mean":
|
|
669
|
+
fill_val = df[col].mean()
|
|
670
|
+
session.df = df.with_columns(pl.col(col).fill_null(fill_val))
|
|
671
|
+
elif strategy == "median":
|
|
672
|
+
fill_val = df[col].median()
|
|
673
|
+
session.df = df.with_columns(pl.col(col).fill_null(fill_val))
|
|
674
|
+
elif strategy == "mode":
|
|
675
|
+
mode_val = df[col].drop_nulls().mode().to_list()
|
|
676
|
+
if not mode_val:
|
|
677
|
+
session.undo()
|
|
678
|
+
return "Cannot compute mode: no non-null values."
|
|
679
|
+
session.df = df.with_columns(pl.col(col).fill_null(mode_val[0]))
|
|
680
|
+
elif strategy == "forward":
|
|
681
|
+
session.df = df.with_columns(pl.col(col).forward_fill())
|
|
682
|
+
elif strategy == "backward":
|
|
683
|
+
session.df = df.with_columns(pl.col(col).backward_fill())
|
|
684
|
+
elif strategy.startswith("value="):
|
|
685
|
+
val_str = strategy.split("=", 1)[1]
|
|
686
|
+
try:
|
|
687
|
+
val = float(val_str)
|
|
688
|
+
except ValueError:
|
|
689
|
+
val = val_str
|
|
690
|
+
session.df = df.with_columns(pl.col(col).fill_null(val))
|
|
691
|
+
else:
|
|
692
|
+
session.undo()
|
|
693
|
+
return f"Unknown strategy: {strategy}. Use: mean, median, mode, forward, backward, value=N"
|
|
694
|
+
|
|
695
|
+
remaining = session.df[col].null_count()
|
|
696
|
+
filled = null_count - remaining
|
|
697
|
+
return f"Filled {filled:,} null(s) in '{col}' using {strategy}. Use 'undo' to revert."
|
|
698
|
+
except Exception as e:
|
|
699
|
+
session.undo()
|
|
700
|
+
return friendly_error(e, "Fillna error")
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
@command("cast", usage="cast <col> <type>")
|
|
704
|
+
def cmd_cast(session: Session, args: str) -> str:
|
|
705
|
+
"""Convert column type: int, float, str, bool."""
|
|
706
|
+
df = session.require_data()
|
|
707
|
+
parts = args.split()
|
|
708
|
+
if len(parts) < 2:
|
|
709
|
+
return "Usage: cast <col> <type> (types: int, float, str, bool)"
|
|
710
|
+
|
|
711
|
+
col = parts[0]
|
|
712
|
+
target_type = parts[1].lower()
|
|
713
|
+
if col not in df.columns:
|
|
714
|
+
return f"Column not found: {col}"
|
|
715
|
+
|
|
716
|
+
TYPE_MAP = {
|
|
717
|
+
"int": pl.Int64,
|
|
718
|
+
"float": pl.Float64,
|
|
719
|
+
"str": pl.Utf8,
|
|
720
|
+
"string": pl.Utf8,
|
|
721
|
+
"bool": pl.Boolean,
|
|
722
|
+
}
|
|
723
|
+
pl_type = TYPE_MAP.get(target_type)
|
|
724
|
+
if pl_type is None:
|
|
725
|
+
return f"Unknown type: {target_type}. Available: {', '.join(TYPE_MAP)}"
|
|
726
|
+
|
|
727
|
+
session.snapshot()
|
|
728
|
+
try:
|
|
729
|
+
session.df = df.with_columns(pl.col(col).cast(pl_type))
|
|
730
|
+
return f"Cast '{col}' to {target_type}. Use 'undo' to revert."
|
|
731
|
+
except Exception as e:
|
|
732
|
+
session.undo()
|
|
733
|
+
return friendly_error(e, "Cast error")
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
@command("lag", usage="lag <col> [N] [as <name>]")
|
|
737
|
+
def cmd_lag(session: Session, args: str) -> str:
|
|
738
|
+
"""Create a lagged variable (shift values down by N rows, default 1)."""
|
|
739
|
+
df = session.require_data()
|
|
740
|
+
ca = CommandArgs(args)
|
|
741
|
+
if not ca.positional:
|
|
742
|
+
return "Usage: lag <col> [N] [as <name>]"
|
|
743
|
+
|
|
744
|
+
col = ca.positional[0]
|
|
745
|
+
if col not in df.columns:
|
|
746
|
+
return f"Column not found: {col}"
|
|
747
|
+
|
|
748
|
+
n = 1
|
|
749
|
+
for p in ca.positional[1:]:
|
|
750
|
+
if p.lower() == "as":
|
|
751
|
+
break
|
|
752
|
+
try:
|
|
753
|
+
n = int(p)
|
|
754
|
+
except ValueError:
|
|
755
|
+
pass
|
|
756
|
+
|
|
757
|
+
as_rest = ca.rest_after("as")
|
|
758
|
+
new_name = as_rest.split()[0] if as_rest else f"{col}_lag{n}"
|
|
759
|
+
|
|
760
|
+
session.snapshot()
|
|
761
|
+
session.df = df.with_columns(pl.col(col).shift(n).alias(new_name))
|
|
762
|
+
return f"Created '{new_name}' (lag {n}). Use 'undo' to revert."
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
@command("lead", usage="lead <col> [N] [as <name>]")
|
|
766
|
+
def cmd_lead(session: Session, args: str) -> str:
|
|
767
|
+
"""Create a lead variable (shift values up by N rows, default 1)."""
|
|
768
|
+
df = session.require_data()
|
|
769
|
+
ca = CommandArgs(args)
|
|
770
|
+
if not ca.positional:
|
|
771
|
+
return "Usage: lead <col> [N] [as <name>]"
|
|
772
|
+
|
|
773
|
+
col = ca.positional[0]
|
|
774
|
+
if col not in df.columns:
|
|
775
|
+
return f"Column not found: {col}"
|
|
776
|
+
|
|
777
|
+
n = 1
|
|
778
|
+
for p in ca.positional[1:]:
|
|
779
|
+
if p.lower() == "as":
|
|
780
|
+
break
|
|
781
|
+
try:
|
|
782
|
+
n = int(p)
|
|
783
|
+
except ValueError:
|
|
784
|
+
pass
|
|
785
|
+
|
|
786
|
+
as_rest = ca.rest_after("as")
|
|
787
|
+
new_name = as_rest.split()[0] if as_rest else f"{col}_lead{n}"
|
|
788
|
+
|
|
789
|
+
session.snapshot()
|
|
790
|
+
session.df = df.with_columns(pl.col(col).shift(-n).alias(new_name))
|
|
791
|
+
return f"Created '{new_name}' (lead {n}). Use 'undo' to revert."
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
@command("append", usage="append using <file> [, force]")
|
|
795
|
+
def cmd_append(session: Session, args: str) -> str:
|
|
796
|
+
"""Append rows from another file to current dataset."""
|
|
797
|
+
df = session.require_data()
|
|
798
|
+
ca = CommandArgs(args)
|
|
799
|
+
|
|
800
|
+
# Parse: using <file> [, force]
|
|
801
|
+
rest = ca.rest_after("using")
|
|
802
|
+
if not rest:
|
|
803
|
+
return "Usage: append using <file> [, force]"
|
|
804
|
+
|
|
805
|
+
force = False
|
|
806
|
+
if ", force" in rest or ",force" in rest:
|
|
807
|
+
force = True
|
|
808
|
+
rest = rest.replace(", force", "").replace(",force", "").strip()
|
|
809
|
+
|
|
810
|
+
file_path = rest.strip().strip('"').strip("'")
|
|
811
|
+
if not file_path:
|
|
812
|
+
return "Usage: append using <file> [, force]"
|
|
813
|
+
|
|
814
|
+
try:
|
|
815
|
+
new_df = load_file(file_path)
|
|
816
|
+
except Exception as e:
|
|
817
|
+
return f"Error loading file: {e}"
|
|
818
|
+
|
|
819
|
+
# Check column compatibility
|
|
820
|
+
current_cols = set(df.columns)
|
|
821
|
+
new_cols = set(new_df.columns)
|
|
822
|
+
only_current = current_cols - new_cols
|
|
823
|
+
only_new = new_cols - current_cols
|
|
824
|
+
|
|
825
|
+
if (only_current or only_new) and not force:
|
|
826
|
+
lines = ["Column mismatch detected:"]
|
|
827
|
+
if only_current:
|
|
828
|
+
lines.append(f" Only in current data: {', '.join(sorted(only_current))}")
|
|
829
|
+
if only_new:
|
|
830
|
+
lines.append(f" Only in new file: {', '.join(sorted(only_new))}")
|
|
831
|
+
lines.append("Use 'append using <file>, force' to proceed (missing columns filled with null).")
|
|
832
|
+
return "\n".join(lines)
|
|
833
|
+
|
|
834
|
+
rows_before = df.height
|
|
835
|
+
session.snapshot()
|
|
836
|
+
session.df = pl.concat([df, new_df], how="diagonal")
|
|
837
|
+
rows_added = new_df.height
|
|
838
|
+
|
|
839
|
+
lines = [f"Appended {rows_added} rows (total: {rows_before + rows_added})."]
|
|
840
|
+
if only_current or only_new:
|
|
841
|
+
lines.append("Note: Missing columns filled with null.")
|
|
842
|
+
lines.append("Use 'undo' to revert.")
|
|
843
|
+
return "\n".join(lines)
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
@command("egen", usage="egen newvar = func(col) [, by(group)]")
|
|
847
|
+
def cmd_egen(session: Session, args: str) -> str:
|
|
848
|
+
"""Extended generate: create variables with group-wise or row-wise functions.
|
|
849
|
+
|
|
850
|
+
Supported functions:
|
|
851
|
+
mean, sum, min, max, median, count, rank, group,
|
|
852
|
+
rowtotal(x1 x2 ...), rowmean(x1 x2 ...)
|
|
853
|
+
"""
|
|
854
|
+
df = session.require_data()
|
|
855
|
+
|
|
856
|
+
# Parse: newvar = function(args) [, by(group)]
|
|
857
|
+
# Handle by() first
|
|
858
|
+
by_col = None
|
|
859
|
+
by_match = re.search(r',\s*by\((\w+)\)', args)
|
|
860
|
+
if by_match:
|
|
861
|
+
by_col = by_match.group(1)
|
|
862
|
+
if by_col not in df.columns:
|
|
863
|
+
return f"Column not found: {by_col}"
|
|
864
|
+
args_clean = args[:by_match.start()] + args[by_match.end():]
|
|
865
|
+
else:
|
|
866
|
+
args_clean = args
|
|
867
|
+
|
|
868
|
+
# Parse newvar = function(col_args)
|
|
869
|
+
m = re.match(r'\s*(\w+)\s*=\s*(\w+)\((.+?)\)\s*$', args_clean.strip())
|
|
870
|
+
if not m:
|
|
871
|
+
return (
|
|
872
|
+
"Usage: egen newvar = func(col) [, by(group)]\n"
|
|
873
|
+
"Functions: mean, sum, min, max, median, count, rank, group,\n"
|
|
874
|
+
" rowtotal(x1 x2 ...), rowmean(x1 x2 ...)"
|
|
875
|
+
)
|
|
876
|
+
|
|
877
|
+
new_var = m.group(1)
|
|
878
|
+
func_name = m.group(2).lower()
|
|
879
|
+
col_args = m.group(3).strip()
|
|
880
|
+
|
|
881
|
+
# Row-wise functions: rowtotal(x1 x2 x3), rowmean(x1 x2 x3)
|
|
882
|
+
if func_name in ("rowtotal", "rowmean"):
|
|
883
|
+
cols = col_args.split()
|
|
884
|
+
missing_cols = [c for c in cols if c not in df.columns]
|
|
885
|
+
if missing_cols:
|
|
886
|
+
return f"Columns not found: {', '.join(missing_cols)}"
|
|
887
|
+
|
|
888
|
+
session.snapshot()
|
|
889
|
+
if func_name == "rowtotal":
|
|
890
|
+
expr = pl.sum_horizontal([pl.col(c) for c in cols])
|
|
891
|
+
else: # rowmean
|
|
892
|
+
expr = pl.mean_horizontal([pl.col(c) for c in cols])
|
|
893
|
+
session.df = df.with_columns(expr.alias(new_var))
|
|
894
|
+
return f"Created '{new_var}' = {func_name}({' '.join(cols)}). Use 'undo' to revert."
|
|
895
|
+
|
|
896
|
+
# Group-level and group-id functions
|
|
897
|
+
col = col_args.strip()
|
|
898
|
+
|
|
899
|
+
# Special case: group() creates numeric group ID
|
|
900
|
+
if func_name == "group":
|
|
901
|
+
if col not in df.columns:
|
|
902
|
+
return f"Column not found: {col}"
|
|
903
|
+
session.snapshot()
|
|
904
|
+
# Create dense rank (group identifier)
|
|
905
|
+
session.df = df.with_columns(
|
|
906
|
+
pl.col(col).rank("dense").cast(pl.Int64).alias(new_var)
|
|
907
|
+
)
|
|
908
|
+
return f"Created '{new_var}' = group({col}). Use 'undo' to revert."
|
|
909
|
+
|
|
910
|
+
if col not in df.columns:
|
|
911
|
+
return f"Column not found: {col}"
|
|
912
|
+
|
|
913
|
+
# Map function name to polars expression
|
|
914
|
+
func_map = {
|
|
915
|
+
"mean": pl.col(col).mean(),
|
|
916
|
+
"sum": pl.col(col).sum(),
|
|
917
|
+
"min": pl.col(col).min(),
|
|
918
|
+
"max": pl.col(col).max(),
|
|
919
|
+
"median": pl.col(col).median(),
|
|
920
|
+
"count": pl.col(col).count(),
|
|
921
|
+
"rank": pl.col(col).rank("ordinal"),
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
if func_name not in func_map:
|
|
925
|
+
return (
|
|
926
|
+
f"Unknown function: {func_name}\n"
|
|
927
|
+
"Supported: mean, sum, min, max, median, count, rank, group, rowtotal, rowmean"
|
|
928
|
+
)
|
|
929
|
+
|
|
930
|
+
expr = func_map[func_name]
|
|
931
|
+
|
|
932
|
+
session.snapshot()
|
|
933
|
+
if by_col:
|
|
934
|
+
session.df = df.with_columns(expr.over(by_col).alias(new_var))
|
|
935
|
+
return f"Created '{new_var}' = {func_name}({col}), by({by_col}). Use 'undo' to revert."
|
|
936
|
+
else:
|
|
937
|
+
session.df = df.with_columns(expr.alias(new_var))
|
|
938
|
+
return f"Created '{new_var}' = {func_name}({col}). Use 'undo' to revert."
|
|
939
|
+
|
|
940
|
+
|
|
941
|
+
@command("sqlload", usage="sqlload <connection_url> [<query_or_table>]")
|
|
942
|
+
def cmd_sqlload(session: Session, args: str) -> str:
|
|
943
|
+
"""Load data from a SQL database using a connection URL.
|
|
944
|
+
|
|
945
|
+
Requires: pip install connectorx (or adbc-driver for some backends)
|
|
946
|
+
|
|
947
|
+
Examples:
|
|
948
|
+
sqlload sqlite:///mydata.db mytable
|
|
949
|
+
sqlload sqlite:///mydata.db "SELECT * FROM sales WHERE year = 2023"
|
|
950
|
+
sqlload postgresql://user:pass@localhost/mydb "SELECT * FROM employees LIMIT 1000"
|
|
951
|
+
sqlload mysql+pymysql://user:pass@localhost/mydb customers
|
|
952
|
+
"""
|
|
953
|
+
stripped = args.strip()
|
|
954
|
+
if not stripped:
|
|
955
|
+
return (
|
|
956
|
+
"Usage: sqlload <connection_url> [<query_or_table>]\n"
|
|
957
|
+
"Examples:\n"
|
|
958
|
+
" sqlload sqlite:///path.db mytable\n"
|
|
959
|
+
" sqlload sqlite:///path.db \"SELECT * FROM tbl WHERE x > 0\"\n"
|
|
960
|
+
" sqlload postgresql://user:pass@host/db \"SELECT * FROM tbl\""
|
|
961
|
+
)
|
|
962
|
+
|
|
963
|
+
# Split into URL and query/table (second token may be quoted)
|
|
964
|
+
import shlex
|
|
965
|
+
try:
|
|
966
|
+
parts = shlex.split(stripped)
|
|
967
|
+
except ValueError:
|
|
968
|
+
parts = stripped.split(None, 1)
|
|
969
|
+
|
|
970
|
+
url = parts[0]
|
|
971
|
+
query_or_table = parts[1] if len(parts) > 1 else None
|
|
972
|
+
|
|
973
|
+
# Build SQL query
|
|
974
|
+
if query_or_table is None:
|
|
975
|
+
return "Please specify a table name or SQL query after the connection URL."
|
|
976
|
+
|
|
977
|
+
sql = query_or_table if query_or_table.strip().lower().startswith("select") \
|
|
978
|
+
else f"SELECT * FROM {query_or_table}"
|
|
979
|
+
|
|
980
|
+
try:
|
|
981
|
+
df = pl.read_database_uri(query=sql, uri=url)
|
|
982
|
+
except ImportError as e:
|
|
983
|
+
return (
|
|
984
|
+
f"Missing optional dependency: {e}\n"
|
|
985
|
+
"Install with: pip install connectorx\n"
|
|
986
|
+
"Or for PostgreSQL/MySQL: pip install adbc-driver-postgresql"
|
|
987
|
+
)
|
|
988
|
+
except Exception as e:
|
|
989
|
+
return f"sqlload error: {e}"
|
|
990
|
+
|
|
991
|
+
session.snapshot()
|
|
992
|
+
session.df = df
|
|
993
|
+
session.dataset_path = url
|
|
994
|
+
session.dataset_name = query_or_table
|
|
995
|
+
session._undo_stack.clear()
|
|
996
|
+
return f"Loaded {session.shape_str} from SQL: {url}"
|