openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,996 @@
1
+ """Data manipulation commands: load, describe, head, filter, select, derive, dropna, sort, rename, count, tail, merge, pivot, melt, sample, replace, duplicates, unique, encode, recode."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ import polars as pl
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from openstat.session import Session
12
+ from openstat.config import get_config
13
+ from openstat.io.loader import load_file, save_file
14
+ from openstat.dsl.parser import parse_expression, ParseError
15
+ from openstat.commands.base import command, CommandArgs, rich_to_str, friendly_error
16
+
17
+
18
+ @command("load", usage="load <path> [sheet=<name|index|list>]")
19
+ def cmd_load(session: Session, args: str) -> str:
20
+ """Load a dataset from CSV, Parquet, Stata (.dta), or Excel (.xlsx) file.
21
+
22
+ For Excel files, specify a sheet with sheet=<name|index>.
23
+ Use sheet=list to see available sheet names.
24
+
25
+ Examples:
26
+ load data.csv
27
+ load results.xlsx
28
+ load workbook.xlsx sheet=Sheet2
29
+ load workbook.xlsx sheet=1
30
+ load workbook.xlsx sheet=list
31
+ """
32
+ ca = CommandArgs(args)
33
+ if not ca.positional:
34
+ return "Usage: load <path> [sheet=<name|index|list>]"
35
+ path = ca.positional[0]
36
+ sheet = ca.options.get("sheet")
37
+
38
+ if sheet is not None and path.lower().endswith((".xlsx", ".xls")):
39
+ from openstat.io.loader import _load_excel
40
+ from pathlib import Path as _Path
41
+ # sheet=list → just show sheet names
42
+ if sheet == "list":
43
+ try:
44
+ _load_excel(_Path(path), sheet="list")
45
+ except ValueError as exc:
46
+ return str(exc)
47
+ # numeric index
48
+ sheet_arg: str | int = int(sheet) if sheet.isdigit() else sheet
49
+ session.df = _load_excel(_Path(path), sheet=sheet_arg)
50
+ else:
51
+ session.df = load_file(path, session=session)
52
+
53
+ session.dataset_path = path
54
+ session.dataset_name = path.split("/")[-1]
55
+ session._undo_stack.clear()
56
+ return f"Loaded {session.shape_str} from {path}"
57
+
58
+
59
+ @command("labels", usage="labels [column]")
60
+ def cmd_labels(session: Session, args: str) -> str:
61
+ """Show variable/value labels from SAS, SPSS, or Stata files."""
62
+ labels = session._variable_labels
63
+ if not labels:
64
+ return "No variable labels available. Labels are loaded from .dta, .sav, or .sas7bdat files."
65
+ col = args.strip()
66
+ if col:
67
+ if col not in labels:
68
+ return f"No labels for column '{col}'. Columns with labels: {', '.join(labels.keys())}"
69
+ mapping = labels[col]
70
+ lines = [f"Labels for '{col}':"]
71
+ for val, lbl in sorted(mapping.items(), key=lambda x: str(x[0])):
72
+ lines.append(f" {val} = {lbl}")
73
+ return "\n".join(lines)
74
+
75
+ def render(console: Console) -> None:
76
+ table = Table(title="Variable Value Labels")
77
+ table.add_column("Column", style="cyan")
78
+ table.add_column("# Labels", justify="right")
79
+ table.add_column("Sample", style="dim")
80
+ for var, mapping in sorted(labels.items()):
81
+ sample = ", ".join(f"{k}={v}" for k, v in list(mapping.items())[:3])
82
+ if len(mapping) > 3:
83
+ sample += ", ..."
84
+ table.add_row(var, str(len(mapping)), sample)
85
+ console.print(table)
86
+
87
+ return rich_to_str(render)
88
+
89
+
90
+ @command("describe", usage="describe")
91
+ def cmd_describe(session: Session, args: str) -> str:
92
+ """Show dataset structure: columns, types, missing values."""
93
+ df = session.require_data()
94
+
95
+ def render(console: Console) -> None:
96
+ table = Table(title=f"Dataset: {session.dataset_name or '(unnamed)'}")
97
+ table.add_column("Column", style="cyan")
98
+ table.add_column("Type", style="green")
99
+ table.add_column("Non-null", justify="right")
100
+ table.add_column("Missing", justify="right")
101
+ table.add_column("Unique", justify="right")
102
+
103
+ for col_name in df.columns:
104
+ col = df[col_name]
105
+ nulls = col.null_count()
106
+ non_null = df.height - nulls
107
+ unique = col.n_unique()
108
+ table.add_row(col_name, str(col.dtype), str(non_null), str(nulls), str(unique))
109
+ console.print(table)
110
+ console.print(f"Shape: {df.height:,} rows x {df.width} columns")
111
+
112
+ return rich_to_str(render)
113
+
114
+
115
+ @command("head", usage="head [N]")
116
+ def cmd_head(session: Session, args: str) -> str:
117
+ """Show first N rows (default from config)."""
118
+ df = session.require_data()
119
+ n = get_config().head_default
120
+ if args.strip():
121
+ try:
122
+ n = int(args.strip())
123
+ except ValueError:
124
+ return "Usage: head [N]"
125
+
126
+ def render(console: Console) -> None:
127
+ table = Table(title=f"First {min(n, df.height)} rows")
128
+ for col_name in df.columns:
129
+ table.add_column(col_name)
130
+ for row in df.head(n).iter_rows():
131
+ table.add_row(*[str(v) for v in row])
132
+ console.print(table)
133
+
134
+ return rich_to_str(render)
135
+
136
+
137
+ @command("tail", usage="tail [N]")
138
+ def cmd_tail(session: Session, args: str) -> str:
139
+ """Show last N rows (default 10)."""
140
+ df = session.require_data()
141
+ n = 10
142
+ if args.strip():
143
+ try:
144
+ n = int(args.strip())
145
+ except ValueError:
146
+ return "Usage: tail [N]"
147
+
148
+ def render(console: Console) -> None:
149
+ table = Table(title=f"Last {min(n, df.height)} rows")
150
+ for col_name in df.columns:
151
+ table.add_column(col_name)
152
+ for row in df.tail(n).iter_rows():
153
+ table.add_row(*[str(v) for v in row])
154
+ console.print(table)
155
+
156
+ return rich_to_str(render)
157
+
158
+
159
+ @command("count", usage="count")
160
+ def cmd_count(session: Session, args: str) -> str:
161
+ """Show the number of rows and columns."""
162
+ df = session.require_data()
163
+ return f"{df.height:,} rows x {df.width} columns"
164
+
165
+
166
+ @command("filter", usage="filter <expression>")
167
+ def cmd_filter(session: Session, args: str) -> str:
168
+ """Filter rows using an expression. Use 'undo' to revert."""
169
+ df = session.require_data()
170
+ if not args.strip():
171
+ return "Usage: filter <expression> (e.g. filter age > 30)"
172
+ try:
173
+ expr = parse_expression(args)
174
+ before = df.height
175
+ session.snapshot()
176
+ session.df = df.filter(expr)
177
+ after = session.df.height
178
+ return f"Filtered: {before:,} -> {after:,} rows ({before - after:,} dropped). Use 'undo' to revert."
179
+ except ParseError as e:
180
+ return f"Parse error: {e}"
181
+ except Exception as e:
182
+ return friendly_error(e, "Filter error")
183
+
184
+
185
+ @command("select", usage="select <col1> <col2> ...")
186
+ def cmd_select(session: Session, args: str) -> str:
187
+ """Select specific columns. Use 'undo' to revert."""
188
+ df = session.require_data()
189
+ cols = args.split()
190
+ if not cols:
191
+ return "Usage: select <col1> <col2> ..."
192
+ missing = [c for c in cols if c not in df.columns]
193
+ if missing:
194
+ return f"Columns not found: {', '.join(missing)}"
195
+ session.snapshot()
196
+ session.df = df.select(cols)
197
+ return f"Selected {len(cols)} columns. Shape: {session.shape_str}"
198
+
199
+
200
+ @command("derive", usage="derive <newcol> = <expression>")
201
+ def cmd_derive(session: Session, args: str) -> str:
202
+ """Create a new column from an expression. Use 'undo' to revert."""
203
+ df = session.require_data()
204
+ if "=" not in args:
205
+ return "Usage: derive <newcol> = <expression>"
206
+ name, expr_str = args.split("=", 1)
207
+ name = name.strip()
208
+ if not name:
209
+ return "Usage: derive <newcol> = <expression>"
210
+ try:
211
+ expr = parse_expression(expr_str.strip())
212
+ session.snapshot()
213
+ session.df = df.with_columns(expr.alias(name))
214
+ return f"Created column '{name}'. Shape: {session.shape_str}"
215
+ except ParseError as e:
216
+ return f"Parse error: {e}"
217
+ except Exception as e:
218
+ return friendly_error(e, "Derive error")
219
+
220
+
221
+ @command("dropna", usage="dropna [col1 col2 ...]")
222
+ def cmd_dropna(session: Session, args: str) -> str:
223
+ """Drop rows with missing values. Use 'undo' to revert."""
224
+ df = session.require_data()
225
+ cols = args.split() if args.strip() else None
226
+ before = df.height
227
+ if cols:
228
+ missing = [c for c in cols if c not in df.columns]
229
+ if missing:
230
+ return f"Columns not found: {', '.join(missing)}"
231
+ session.snapshot()
232
+ if cols:
233
+ session.df = df.drop_nulls(subset=cols)
234
+ else:
235
+ session.df = df.drop_nulls()
236
+ after = session.df.height
237
+ return f"Dropped nulls: {before:,} -> {after:,} rows ({before - after:,} removed)"
238
+
239
+
240
+ @command("sort", usage="sort <col> [--desc]")
241
+ def cmd_sort(session: Session, args: str) -> str:
242
+ """Sort dataset by one or more columns. Use --desc for descending."""
243
+ df = session.require_data()
244
+ ca = CommandArgs(args)
245
+ descending = ca.has_flag("--desc")
246
+ cols = ca.positional
247
+ if not cols:
248
+ return "Usage: sort <col1> [col2 ...] [--desc]"
249
+ missing = [c for c in cols if c not in df.columns]
250
+ if missing:
251
+ return f"Columns not found: {', '.join(missing)}"
252
+ session.snapshot()
253
+ session.df = df.sort(cols, descending=descending)
254
+ direction = "descending" if descending else "ascending"
255
+ return f"Sorted by {', '.join(cols)} ({direction}). {session.shape_str}"
256
+
257
+
258
+ @command("rename", usage="rename <old> <new>")
259
+ def cmd_rename(session: Session, args: str) -> str:
260
+ """Rename a column."""
261
+ df = session.require_data()
262
+ parts = args.split()
263
+ if len(parts) != 2:
264
+ return "Usage: rename <old_name> <new_name>"
265
+ old, new = parts
266
+ if old not in df.columns:
267
+ return f"Column not found: {old}"
268
+ if new in df.columns:
269
+ return f"Column already exists: {new}"
270
+ session.snapshot()
271
+ session.df = df.rename({old: new})
272
+ return f"Renamed '{old}' -> '{new}'"
273
+
274
+
275
+ @command("undo", usage="undo")
276
+ def cmd_undo(session: Session, args: str) -> str:
277
+ """Undo the last data-modifying command (filter, select, derive, dropna, sort, rename)."""
278
+ if session.undo():
279
+ return f"Undone. Data restored: {session.shape_str} (undo stack: {session.undo_depth} remaining)"
280
+ return "Nothing to undo."
281
+
282
+
283
+ @command("save", usage="save <path.csv|path.parquet>")
284
+ def cmd_save(session: Session, args: str) -> str:
285
+ """Save current dataset to CSV or Parquet."""
286
+ df = session.require_data()
287
+ path = args.strip()
288
+ if not path:
289
+ return "Usage: save <path.csv|path.parquet>"
290
+ try:
291
+ out = save_file(df, path)
292
+ return f"Saved {session.shape_str} to {out}"
293
+ except Exception as e:
294
+ return f"Save error: {e}"
295
+
296
+
297
+ @command("merge", usage="merge <path> on <key> [how=left|right|inner|outer]")
298
+ def cmd_merge(session: Session, args: str) -> str:
299
+ """Merge (join) current dataset with another file on a key column."""
300
+ df = session.require_data()
301
+ ca = CommandArgs(args)
302
+ how = ca.get_option("how", "inner")
303
+ on_rest = ca.rest_after("on")
304
+ if not on_rest:
305
+ return "Usage: merge <path> on <key_col> [how=left|right|inner|outer]"
306
+
307
+ # file_path is everything before "on" keyword (with options stripped)
308
+ clean = ca.strip_flags_and_options()
309
+ before_on = re.split(r"\bon\b", clean, maxsplit=1)
310
+ file_path = before_on[0].strip()
311
+ key_col = on_rest.split()[0] if on_rest.strip() else ""
312
+ if not file_path or not key_col:
313
+ return "Usage: merge <path> on <key_col> [how=left|right|inner|outer]"
314
+
315
+ # Polars uses "full" instead of "outer"
316
+ if how == "outer":
317
+ how = "full"
318
+ valid_how = {"left", "right", "inner", "full", "cross"}
319
+ if how not in valid_how:
320
+ return f"Invalid join type: {how}. Use: {', '.join(sorted(valid_how))}"
321
+
322
+ try:
323
+ other = load_file(file_path)
324
+ except Exception as e:
325
+ return f"Cannot load merge file: {e}"
326
+
327
+ if key_col not in df.columns:
328
+ return f"Key column '{key_col}' not found in current dataset"
329
+ if key_col not in other.columns:
330
+ return f"Key column '{key_col}' not found in merge file"
331
+
332
+ session.snapshot()
333
+ before = df.height
334
+ session.df = df.join(other, on=key_col, how=how, suffix="_right")
335
+ after = session.df.height
336
+
337
+ return (
338
+ f"Merged ({how}): {before:,} + {other.height:,} -> {after:,} rows, "
339
+ f"{session.df.width} columns. Use 'undo' to revert."
340
+ )
341
+
342
+
343
+ @command("pivot", usage="pivot <value_col> by <col_col> [over <row_col>] [agg=mean|sum|count|first]")
344
+ def cmd_pivot(session: Session, args: str) -> str:
345
+ """Pivot (reshape to wide format). Optional aggregation function."""
346
+ df = session.require_data()
347
+ ca = CommandArgs(args)
348
+ agg_func = ca.get_option("agg", "first")
349
+
350
+ AGG_MAP = {
351
+ "first": "first",
352
+ "mean": "mean",
353
+ "sum": "sum",
354
+ "count": "len",
355
+ "min": "min",
356
+ "max": "max",
357
+ }
358
+ if agg_func not in AGG_MAP:
359
+ return f"Unknown aggregation: {agg_func}. Available: {', '.join(AGG_MAP)}"
360
+
361
+ # Work with cleaned args (options/flags removed), split on word-boundary "by"
362
+ clean = ca.strip_flags_and_options()
363
+ by_parts = re.split(r"\bby\b", clean, maxsplit=1)
364
+ if len(by_parts) < 2:
365
+ return "Usage: pivot <value_col> by <column_col> [over <index_col>] [agg=mean|sum|count|first]"
366
+
367
+ value_col = by_parts[0].strip()
368
+ rest = by_parts[1].strip()
369
+
370
+ # Check for "over" keyword with word boundaries
371
+ over_parts = re.split(r"\bover\b", rest, maxsplit=1)
372
+ col_col = over_parts[0].strip()
373
+ index_col = over_parts[1].strip() if len(over_parts) > 1 else None
374
+
375
+ if not value_col or not col_col:
376
+ return "Usage: pivot <value_col> by <column_col> [over <index_col>] [agg=mean|sum|count|first]"
377
+
378
+ for c in [value_col, col_col] + ([index_col] if index_col else []):
379
+ if c not in df.columns:
380
+ return f"Column not found: {c}"
381
+
382
+ session.snapshot()
383
+ try:
384
+ pivot_kwargs: dict = dict(
385
+ on=col_col, values=value_col,
386
+ aggregate_function=AGG_MAP[agg_func],
387
+ )
388
+ if index_col:
389
+ pivot_kwargs["index"] = index_col
390
+ else:
391
+ others = [c for c in df.columns if c not in (value_col, col_col)]
392
+ if not others:
393
+ return "Need at least one column to serve as row index"
394
+ pivot_kwargs["index"] = others
395
+ session.df = df.pivot(**pivot_kwargs)
396
+ return f"Pivoted to wide format: {session.shape_str}. Use 'undo' to revert."
397
+ except Exception as e:
398
+ session.undo()
399
+ return friendly_error(e, "Pivot error")
400
+
401
+
402
+ @command("melt", usage="melt <id_cols>, <value_cols> [var_name=X] [value_name=Y]")
403
+ def cmd_melt(session: Session, args: str) -> str:
404
+ """Melt (reshape to long format). Separate id and value cols with a comma."""
405
+ df = session.require_data()
406
+ ca = CommandArgs(args)
407
+ var_name = ca.get_option("var_name", "variable")
408
+ value_name = ca.get_option("value_name", "value")
409
+
410
+ clean = ca.strip_flags_and_options()
411
+ if "," not in clean:
412
+ return "Usage: melt <id_col1> <id_col2>, <val_col1> <val_col2> [var_name=X] [value_name=Y]"
413
+
414
+ parts = clean.split(",", 1)
415
+ id_cols = parts[0].split()
416
+ val_cols = parts[1].split()
417
+
418
+ if not id_cols or not val_cols:
419
+ return "Usage: melt <id_col1> <id_col2>, <val_col1> <val_col2> [var_name=X] [value_name=Y]"
420
+
421
+ for c in id_cols + val_cols:
422
+ if c not in df.columns:
423
+ return f"Column not found: {c}"
424
+
425
+ session.snapshot()
426
+ try:
427
+ session.df = df.unpivot(
428
+ on=val_cols, index=id_cols,
429
+ variable_name=var_name, value_name=value_name,
430
+ )
431
+ return f"Melted to long format: {session.shape_str}. Use 'undo' to revert."
432
+ except Exception as e:
433
+ session.undo()
434
+ return friendly_error(e, "Melt error")
435
+
436
+
437
+ @command("sample", usage="sample <N|N%>")
438
+ def cmd_sample(session: Session, args: str) -> str:
439
+ """Take a random sample: N rows or N% of data."""
440
+ df = session.require_data()
441
+ arg = args.strip()
442
+ if not arg:
443
+ return "Usage: sample <N> or sample <N%>"
444
+
445
+ try:
446
+ if arg.endswith("%"):
447
+ pct = float(arg[:-1])
448
+ if not (0 < pct <= 100):
449
+ return "Percentage must be between 0 and 100"
450
+ n = max(1, int(df.height * pct / 100))
451
+ else:
452
+ n = int(arg)
453
+ if n <= 0:
454
+ return "Sample size must be positive"
455
+ except ValueError:
456
+ return "Usage: sample <N> or sample <N%>"
457
+
458
+ n = min(n, df.height)
459
+ session.snapshot()
460
+ session.df = df.sample(n=n, shuffle=True)
461
+ return f"Sampled {n:,} rows from {df.height:,}. {session.shape_str}. Use 'undo' to revert."
462
+
463
+
464
+ @command("replace", usage="replace <col> <old_value> <new_value>")
465
+ def cmd_replace(session: Session, args: str) -> str:
466
+ """Replace values in a column."""
467
+ df = session.require_data()
468
+ parts = args.split(None, 2)
469
+ if len(parts) < 3:
470
+ return "Usage: replace <col> <old_value> <new_value>"
471
+
472
+ col, old_val, new_val = parts[0], parts[1], parts[2]
473
+ if col not in df.columns:
474
+ return f"Column not found: {col}"
475
+
476
+ # Strip quotes from values
477
+ old_val = old_val.strip("\"'")
478
+ new_val = new_val.strip("\"'")
479
+
480
+ dtype = df[col].dtype
481
+ session.snapshot()
482
+
483
+ try:
484
+ if dtype in (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
485
+ pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64):
486
+ old_v = float(old_val)
487
+ new_v = float(new_val)
488
+ count = df.filter(pl.col(col) == old_v).height
489
+ session.df = df.with_columns(
490
+ pl.when(pl.col(col) == old_v).then(pl.lit(new_v)).otherwise(pl.col(col)).alias(col)
491
+ )
492
+ else:
493
+ count = df.filter(pl.col(col) == old_val).height
494
+ session.df = df.with_columns(
495
+ pl.when(pl.col(col) == old_val).then(pl.lit(new_val)).otherwise(pl.col(col)).alias(col)
496
+ )
497
+ return f"Replaced {count:,} occurrence(s) in '{col}'. Use 'undo' to revert."
498
+ except Exception as e:
499
+ session.undo()
500
+ return friendly_error(e, "Replace error")
501
+
502
+
503
+ @command("duplicates", usage="duplicates [drop] [col1 col2 ...]")
504
+ def cmd_duplicates(session: Session, args: str) -> str:
505
+ """Find or drop duplicate rows. Use 'drop' to remove them."""
506
+ df = session.require_data()
507
+ parts = args.split()
508
+ drop = "drop" in parts
509
+ cols = [p for p in parts if p != "drop"]
510
+
511
+ if cols:
512
+ missing = [c for c in cols if c not in df.columns]
513
+ if missing:
514
+ return f"Columns not found: {', '.join(missing)}"
515
+ subset = cols
516
+ else:
517
+ subset = None
518
+
519
+ if drop:
520
+ before = df.height
521
+ session.snapshot()
522
+ session.df = df.unique(subset=subset, keep="first")
523
+ after = session.df.height
524
+ removed = before - after
525
+ return f"Dropped {removed:,} duplicate(s): {before:,} -> {after:,} rows. Use 'undo' to revert."
526
+ else:
527
+ n_dups = df.height - df.unique(subset=subset, keep="first").height
528
+ if n_dups == 0:
529
+ suffix = f" (on {', '.join(cols)})" if cols else ""
530
+ return f"No duplicates found{suffix}."
531
+ return f"Found {n_dups:,} duplicate row(s). Use 'duplicates drop' to remove them."
532
+
533
+
534
+ @command("unique", usage="unique <col>")
535
+ def cmd_unique(session: Session, args: str) -> str:
536
+ """Show unique values of a column."""
537
+ df = session.require_data()
538
+ col = args.strip()
539
+ if not col:
540
+ return "Usage: unique <col>"
541
+ if col not in df.columns:
542
+ return f"Column not found: {col}"
543
+
544
+ values = df[col].unique().sort().to_list()
545
+ n = len(values)
546
+ if n > 50:
547
+ shown = [str(v) for v in values[:50]]
548
+ return f"{col}: {n} unique values (showing first 50):\n{', '.join(shown)} ..."
549
+ return f"{col}: {n} unique values:\n{', '.join(str(v) for v in values)}"
550
+
551
+
552
+ @command("encode", usage="encode <col> [as <new_col>]")
553
+ def cmd_encode(session: Session, args: str) -> str:
554
+ """Encode a string column as numeric codes (label encoding)."""
555
+ df = session.require_data()
556
+ ca = CommandArgs(args)
557
+ if not ca.positional:
558
+ return "Usage: encode <col> [as <new_col>]"
559
+
560
+ col = ca.positional[0]
561
+ if col not in df.columns:
562
+ return f"Column not found: {col}"
563
+
564
+ as_rest = ca.rest_after("as")
565
+ new_col = as_rest.split()[0] if as_rest else col + "_code"
566
+
567
+ # Build mapping: sorted unique values → 0, 1, 2, ...
568
+ unique_vals = df[col].unique().sort().to_list()
569
+ mapping = {v: i for i, v in enumerate(unique_vals)}
570
+
571
+ session.snapshot()
572
+ session.df = df.with_columns(
573
+ pl.col(col).replace_strict(mapping).cast(pl.Int64).alias(new_col)
574
+ )
575
+
576
+ lines = [f"Encoded '{col}' -> '{new_col}' ({len(unique_vals)} levels):"]
577
+ for v, code in list(mapping.items())[:20]:
578
+ lines.append(f" {code} = {v}")
579
+ if len(mapping) > 20:
580
+ lines.append(f" ... ({len(mapping) - 20} more)")
581
+ lines.append("Use 'undo' to revert.")
582
+ return "\n".join(lines)
583
+
584
+
585
+ @command("recode", usage='recode <col> "old1"=new1 "old2"=new2 ...')
586
+ def cmd_recode(session: Session, args: str) -> str:
587
+ """Recode values in a column using mappings."""
588
+ df = session.require_data()
589
+ if "=" not in args:
590
+ return 'Usage: recode <col> "old1"=new1 "old2"=new2 ...'
591
+
592
+ parts = args.split(None, 1)
593
+ if len(parts) < 2:
594
+ return 'Usage: recode <col> "old1"=new1 "old2"=new2 ...'
595
+
596
+ col = parts[0]
597
+ if col not in df.columns:
598
+ return f"Column not found: {col}"
599
+
600
+ # Parse mapping pairs
601
+ mapping_str = parts[1]
602
+ mapping: dict[str, str] = {}
603
+ import shlex
604
+ try:
605
+ tokens = shlex.split(mapping_str)
606
+ except ValueError:
607
+ tokens = mapping_str.split()
608
+
609
+ for token in tokens:
610
+ if "=" not in token:
611
+ return f"Invalid mapping: {token}. Use old=new format."
612
+ old, new = token.split("=", 1)
613
+ old = old.strip("\"'")
614
+ new = new.strip("\"'")
615
+ mapping[old] = new
616
+
617
+ if not mapping:
618
+ return "No mappings provided."
619
+
620
+ session.snapshot()
621
+ dtype = df[col].dtype
622
+ is_numeric = dtype in (
623
+ pl.Float32, pl.Float64,
624
+ pl.Int8, pl.Int16, pl.Int32, pl.Int64,
625
+ pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
626
+ )
627
+
628
+ expr = pl.col(col)
629
+ for old_val, new_val in mapping.items():
630
+ if is_numeric:
631
+ try:
632
+ old_v = float(old_val)
633
+ new_v = float(new_val)
634
+ expr = pl.when(pl.col(col) == old_v).then(pl.lit(new_v)).otherwise(expr)
635
+ except ValueError:
636
+ session.undo()
637
+ return f"Cannot convert '{old_val}' or '{new_val}' to number for numeric column '{col}'"
638
+ else:
639
+ expr = pl.when(pl.col(col) == old_val).then(pl.lit(new_val)).otherwise(expr)
640
+
641
+ session.df = df.with_columns(expr.alias(col))
642
+ return f"Recoded {len(mapping)} value(s) in '{col}'. Use 'undo' to revert."
643
+
644
+
645
@command("fillna", usage="fillna <col> <strategy>")
def cmd_fillna(session: Session, args: str) -> str:
    """Fill missing values: mean, median, mode, forward, backward, or value=N.

    Args:
        session: Active session holding the current dataframe.
        args: "<col> <strategy>" where strategy is one of mean, median,
            mode, forward, backward, or value=N (N is numeric if parseable,
            otherwise used as a literal string).

    Returns:
        A status message, or a usage/error message on bad input.
    """
    df = session.require_data()
    parts = args.split()
    if len(parts) < 2:
        return (
            "Usage: fillna <col> <strategy>\n"
            "Strategies: mean, median, mode, forward, backward, value=N"
        )

    col = parts[0]
    strategy = parts[1]
    if col not in df.columns:
        return f"Column not found: {col}"

    null_count = df[col].null_count()
    if null_count == 0:
        return f"No missing values in '{col}'."

    # Build (and validate) the fill expression BEFORE snapshotting, so
    # rejected strategies never touch the undo stack.  The original code
    # snapshotted first and had to undo() on every validation failure.
    try:
        if strategy == "mean":
            expr = pl.col(col).fill_null(df[col].mean())
        elif strategy == "median":
            expr = pl.col(col).fill_null(df[col].median())
        elif strategy == "mode":
            mode_val = df[col].drop_nulls().mode().to_list()
            if not mode_val:
                return "Cannot compute mode: no non-null values."
            expr = pl.col(col).fill_null(mode_val[0])
        elif strategy == "forward":
            expr = pl.col(col).forward_fill()
        elif strategy == "backward":
            expr = pl.col(col).backward_fill()
        elif strategy.startswith("value="):
            val_str = strategy.split("=", 1)[1]
            try:
                # Prefer a numeric fill value; fall back to the raw string.
                val = float(val_str)
            except ValueError:
                val = val_str
            expr = pl.col(col).fill_null(val)
        else:
            return f"Unknown strategy: {strategy}. Use: mean, median, mode, forward, backward, value=N"
    except Exception as e:
        # Eager statistics (mean/median/mode) can fail on non-numeric
        # columns; nothing has been snapshotted yet, so no undo needed.
        return friendly_error(e, "Fillna error")

    session.snapshot()
    try:
        session.df = df.with_columns(expr)
        remaining = session.df[col].null_count()
        filled = null_count - remaining
        return f"Filled {filled:,} null(s) in '{col}' using {strategy}. Use 'undo' to revert."
    except Exception as e:
        session.undo()
        return friendly_error(e, "Fillna error")
701
+
702
+
703
@command("cast", usage="cast <col> <type>")
def cmd_cast(session: Session, args: str) -> str:
    """Convert column type: int, float, str, bool."""
    df = session.require_data()
    tokens = args.split()
    if len(tokens) < 2:
        return "Usage: cast <col> <type> (types: int, float, str, bool)"

    col, target_type = tokens[0], tokens[1].lower()
    if col not in df.columns:
        return f"Column not found: {col}"

    # Supported conversions; insertion order drives the error message.
    type_map = {
        "int": pl.Int64,
        "float": pl.Float64,
        "str": pl.Utf8,
        "string": pl.Utf8,
        "bool": pl.Boolean,
    }
    if target_type not in type_map:
        return f"Unknown type: {target_type}. Available: {', '.join(type_map)}"

    session.snapshot()
    try:
        session.df = df.with_columns(pl.col(col).cast(type_map[target_type]))
    except Exception as e:
        # Cast failed (e.g. non-numeric text to int): restore the snapshot.
        session.undo()
        return friendly_error(e, "Cast error")
    return f"Cast '{col}' to {target_type}. Use 'undo' to revert."
734
+
735
+
736
@command("lag", usage="lag <col> [N] [as <name>]")
def cmd_lag(session: Session, args: str) -> str:
    """Create a lagged variable (shift values down by N rows, default 1)."""
    df = session.require_data()
    parsed = CommandArgs(args)
    if not parsed.positional:
        return "Usage: lag <col> [N] [as <name>]"

    col = parsed.positional[0]
    if col not in df.columns:
        return f"Column not found: {col}"

    # Scan tokens up to an optional "as" keyword; the last parseable
    # integer wins, non-integers are silently skipped.
    shift_by = 1
    for token in parsed.positional[1:]:
        if token.lower() == "as":
            break
        try:
            shift_by = int(token)
        except ValueError:
            continue

    alias_text = parsed.rest_after("as")
    new_name = alias_text.split()[0] if alias_text else f"{col}_lag{shift_by}"

    session.snapshot()
    session.df = df.with_columns(pl.col(col).shift(shift_by).alias(new_name))
    return f"Created '{new_name}' (lag {shift_by}). Use 'undo' to revert."
763
+
764
+
765
@command("lead", usage="lead <col> [N] [as <name>]")
def cmd_lead(session: Session, args: str) -> str:
    """Create a lead variable (shift values up by N rows, default 1)."""
    df = session.require_data()
    parsed = CommandArgs(args)
    if not parsed.positional:
        return "Usage: lead <col> [N] [as <name>]"

    col = parsed.positional[0]
    if col not in df.columns:
        return f"Column not found: {col}"

    # Scan tokens up to an optional "as" keyword; the last parseable
    # integer wins, non-integers are silently skipped.
    shift_by = 1
    for token in parsed.positional[1:]:
        if token.lower() == "as":
            break
        try:
            shift_by = int(token)
        except ValueError:
            continue

    alias_text = parsed.rest_after("as")
    new_name = alias_text.split()[0] if alias_text else f"{col}_lead{shift_by}"

    session.snapshot()
    # A negative shift moves values upward, i.e. a lead.
    session.df = df.with_columns(pl.col(col).shift(-shift_by).alias(new_name))
    return f"Created '{new_name}' (lead {shift_by}). Use 'undo' to revert."
792
+
793
+
794
@command("append", usage="append using <file> [, force]")
def cmd_append(session: Session, args: str) -> str:
    """Append rows from another file to current dataset.

    Args:
        session: Active session holding the current dataframe.
        args: "using <file> [, force]".  With "force", column mismatches
            are allowed and missing columns are filled with null.

    Returns:
        A status message, or a usage/error/mismatch report.
    """
    df = session.require_data()
    ca = CommandArgs(args)

    # Parse: using <file> [, force]
    rest = ca.rest_after("using")
    if not rest:
        return "Usage: append using <file> [, force]"

    # Detect a trailing ", force" option by splitting at the LAST comma.
    # The previous implementation used blanket str.replace(", force", ...)
    # which corrupted file paths that happen to contain that substring
    # and matched the option anywhere in the argument.
    force = False
    path_part, sep, option = rest.rpartition(",")
    if sep and option.strip() == "force":
        force = True
        rest = path_part.strip()

    file_path = rest.strip().strip('"').strip("'")
    if not file_path:
        return "Usage: append using <file> [, force]"

    try:
        new_df = load_file(file_path)
    except Exception as e:
        return f"Error loading file: {e}"

    # Check column compatibility before mutating anything.
    current_cols = set(df.columns)
    new_cols = set(new_df.columns)
    only_current = current_cols - new_cols
    only_new = new_cols - current_cols

    if (only_current or only_new) and not force:
        lines = ["Column mismatch detected:"]
        if only_current:
            lines.append(f"  Only in current data: {', '.join(sorted(only_current))}")
        if only_new:
            lines.append(f"  Only in new file: {', '.join(sorted(only_new))}")
        lines.append("Use 'append using <file>, force' to proceed (missing columns filled with null).")
        return "\n".join(lines)

    rows_before = df.height
    session.snapshot()
    # "diagonal" concat unions the schemas, null-filling absent columns.
    session.df = pl.concat([df, new_df], how="diagonal")
    rows_added = new_df.height

    lines = [f"Appended {rows_added} rows (total: {rows_before + rows_added})."]
    if only_current or only_new:
        lines.append("Note: Missing columns filled with null.")
    lines.append("Use 'undo' to revert.")
    return "\n".join(lines)
844
+
845
+
846
@command("egen", usage="egen newvar = func(col) [, by(group)]")
def cmd_egen(session: Session, args: str) -> str:
    """Extended generate: create variables with group-wise or row-wise functions.

    Supported functions:
        mean, sum, min, max, median, count, rank, group,
        rowtotal(x1 x2 ...), rowmean(x1 x2 ...)

    Args:
        session: Active session holding the current dataframe.
        args: Command tail of the form "newvar = func(col) [, by(group)]".

    Returns:
        A status message, or a usage/error message on bad input.
    """
    df = session.require_data()

    # Parse: newvar = function(args) [, by(group)]
    # Handle by() first: extract the group column, then strip the clause
    # so the main regex below only sees "newvar = func(col_args)".
    by_col = None
    by_match = re.search(r',\s*by\((\w+)\)', args)
    if by_match:
        by_col = by_match.group(1)
        if by_col not in df.columns:
            return f"Column not found: {by_col}"
        args_clean = args[:by_match.start()] + args[by_match.end():]
    else:
        args_clean = args

    # Parse newvar = function(col_args)
    m = re.match(r'\s*(\w+)\s*=\s*(\w+)\((.+?)\)\s*$', args_clean.strip())
    if not m:
        return (
            "Usage: egen newvar = func(col) [, by(group)]\n"
            "Functions: mean, sum, min, max, median, count, rank, group,\n"
            "           rowtotal(x1 x2 ...), rowmean(x1 x2 ...)"
        )

    new_var = m.group(1)
    func_name = m.group(2).lower()
    col_args = m.group(3).strip()

    # Row-wise functions: rowtotal(x1 x2 x3), rowmean(x1 x2 x3)
    # col_args is a whitespace-separated column list here.
    if func_name in ("rowtotal", "rowmean"):
        cols = col_args.split()
        missing_cols = [c for c in cols if c not in df.columns]
        if missing_cols:
            return f"Columns not found: {', '.join(missing_cols)}"

        session.snapshot()
        if func_name == "rowtotal":
            expr = pl.sum_horizontal([pl.col(c) for c in cols])
        else:  # rowmean
            expr = pl.mean_horizontal([pl.col(c) for c in cols])
        session.df = df.with_columns(expr.alias(new_var))
        return f"Created '{new_var}' = {func_name}({' '.join(cols)}). Use 'undo' to revert."

    # Group-level and group-id functions take a single column name.
    col = col_args.strip()

    # Special case: group() creates numeric group ID
    if func_name == "group":
        if col not in df.columns:
            return f"Column not found: {col}"
        session.snapshot()
        # Create dense rank (group identifier): equal values share an ID,
        # IDs are consecutive integers starting at 1.
        session.df = df.with_columns(
            pl.col(col).rank("dense").cast(pl.Int64).alias(new_var)
        )
        return f"Created '{new_var}' = group({col}). Use 'undo' to revert."

    if col not in df.columns:
        return f"Column not found: {col}"

    # Map function name to polars expression
    func_map = {
        "mean": pl.col(col).mean(),
        "sum": pl.col(col).sum(),
        "min": pl.col(col).min(),
        "max": pl.col(col).max(),
        "median": pl.col(col).median(),
        "count": pl.col(col).count(),
        "rank": pl.col(col).rank("ordinal"),
    }

    if func_name not in func_map:
        return (
            f"Unknown function: {func_name}\n"
            "Supported: mean, sum, min, max, median, count, rank, group, rowtotal, rowmean"
        )

    expr = func_map[func_name]

    session.snapshot()
    if by_col:
        # over() evaluates the aggregate per group and broadcasts the
        # result back to every row of that group.
        session.df = df.with_columns(expr.over(by_col).alias(new_var))
        return f"Created '{new_var}' = {func_name}({col}), by({by_col}). Use 'undo' to revert."
    else:
        session.df = df.with_columns(expr.alias(new_var))
        return f"Created '{new_var}' = {func_name}({col}). Use 'undo' to revert."
939
+
940
+
941
@command("sqlload", usage="sqlload <connection_url> [<query_or_table>]")
def cmd_sqlload(session: Session, args: str) -> str:
    """Load data from a SQL database using a connection URL.

    Requires: pip install connectorx (or adbc-driver for some backends)

    Examples:
        sqlload sqlite:///mydata.db mytable
        sqlload sqlite:///mydata.db "SELECT * FROM sales WHERE year = 2023"
        sqlload postgresql://user:pass@localhost/mydb "SELECT * FROM employees LIMIT 1000"
        sqlload mysql+pymysql://user:pass@localhost/mydb customers
    """
    stripped = args.strip()
    if not stripped:
        return (
            "Usage: sqlload <connection_url> [<query_or_table>]\n"
            "Examples:\n"
            "  sqlload sqlite:///path.db mytable\n"
            "  sqlload sqlite:///path.db \"SELECT * FROM tbl WHERE x > 0\"\n"
            "  sqlload postgresql://user:pass@host/db \"SELECT * FROM tbl\""
        )

    # Split into URL and query/table (second token may be quoted)
    import shlex
    try:
        parts = shlex.split(stripped)
    except ValueError:
        # Unbalanced quotes: fall back to a plain two-way split.
        parts = stripped.split(None, 1)

    url = parts[0]
    query_or_table = parts[1] if len(parts) > 1 else None

    if query_or_table is None:
        return "Please specify a table name or SQL query after the connection URL."

    # Treat the argument as raw SQL when it begins with a query keyword.
    # Recognizing only SELECT (as before) wrapped CTE queries into
    # "SELECT * FROM WITH ..." -- invalid SQL -- so WITH is accepted too.
    first_word = query_or_table.strip().split(None, 1)[0].lower()
    if first_word in ("select", "with"):
        sql = query_or_table
    else:
        # NOTE(review): the table name is interpolated directly into the
        # query. Acceptable for a local CLI where the user supplies their
        # own credentials, but never safe for untrusted input.
        sql = f"SELECT * FROM {query_or_table}"

    try:
        df = pl.read_database_uri(query=sql, uri=url)
    except ImportError as e:
        return (
            f"Missing optional dependency: {e}\n"
            "Install with: pip install connectorx\n"
            "Or for PostgreSQL/MySQL: pip install adbc-driver-postgresql"
        )
    except Exception as e:
        return f"sqlload error: {e}"

    session.snapshot()
    session.df = df
    session.dataset_path = url
    session.dataset_name = query_or_table
    # Loading a fresh dataset resets undo history.
    # NOTE(review): this also discards the snapshot taken just above,
    # making it apparently dead -- confirm snapshot() has no other side
    # effects before removing that call.
    session._undo_stack.clear()
    return f"Loaded {session.shape_str} from SQL: {url}"