openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
"""Extra import commands: URL, clipboard, SPSS syntax translation, REST webhook."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
13
|
+
from openstat.session import Session
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# import url
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
@command("import url", usage="import url <url> [--format=csv|json|parquet] [--sep=,]")
|
|
21
|
+
def cmd_import_url(session: Session, args: str) -> str:
|
|
22
|
+
"""Load data from an HTTP/HTTPS URL.
|
|
23
|
+
|
|
24
|
+
Auto-detects format from the URL extension if --format is not supplied.
|
|
25
|
+
Supported formats: csv, json, parquet.
|
|
26
|
+
|
|
27
|
+
Example: import url https://example.com/data.csv
|
|
28
|
+
Example: import url https://api.example.com/data.json --format=json
|
|
29
|
+
"""
|
|
30
|
+
import polars as pl
|
|
31
|
+
from openstat.io.loader import load_file
|
|
32
|
+
|
|
33
|
+
ca = CommandArgs(args)
|
|
34
|
+
if not ca.positional:
|
|
35
|
+
return "Usage: import url <url> [--format=csv|json|parquet] [--sep=,]"
|
|
36
|
+
|
|
37
|
+
url = ca.positional[0]
|
|
38
|
+
fmt = ca.options.get("format", "").lower()
|
|
39
|
+
sep = ca.options.get("sep", ",")
|
|
40
|
+
|
|
41
|
+
# Auto-detect format from extension
|
|
42
|
+
if not fmt:
|
|
43
|
+
url_path = url.split("?")[0].lower()
|
|
44
|
+
if url_path.endswith(".parquet"):
|
|
45
|
+
fmt = "parquet"
|
|
46
|
+
elif url_path.endswith(".json") or url_path.endswith(".jsonl"):
|
|
47
|
+
fmt = "json"
|
|
48
|
+
else:
|
|
49
|
+
fmt = "csv"
|
|
50
|
+
|
|
51
|
+
# Download to a temporary file
|
|
52
|
+
suffix = f".{fmt}"
|
|
53
|
+
try:
|
|
54
|
+
try:
|
|
55
|
+
import requests
|
|
56
|
+
resp = requests.get(url, timeout=60)
|
|
57
|
+
resp.raise_for_status()
|
|
58
|
+
data_bytes = resp.content
|
|
59
|
+
except ImportError:
|
|
60
|
+
import urllib.request as _urllib
|
|
61
|
+
with _urllib.urlopen(url, timeout=60) as resp: # noqa: S310
|
|
62
|
+
data_bytes = resp.read()
|
|
63
|
+
except Exception as exc:
|
|
64
|
+
return f"Failed to download '{url}': {exc}"
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
|
68
|
+
tmp.write(data_bytes)
|
|
69
|
+
tmp_path = tmp.name
|
|
70
|
+
|
|
71
|
+
if fmt == "parquet":
|
|
72
|
+
df = pl.read_parquet(tmp_path)
|
|
73
|
+
elif fmt == "json":
|
|
74
|
+
df = pl.read_json(tmp_path)
|
|
75
|
+
else:
|
|
76
|
+
actual_sep = "\t" if sep in ("\\t", "\t") else sep
|
|
77
|
+
df = pl.read_csv(tmp_path, separator=actual_sep)
|
|
78
|
+
except Exception as exc:
|
|
79
|
+
return friendly_error(exc, "import url")
|
|
80
|
+
finally:
|
|
81
|
+
try:
|
|
82
|
+
os.unlink(tmp_path)
|
|
83
|
+
except Exception:
|
|
84
|
+
pass
|
|
85
|
+
|
|
86
|
+
session.snapshot()
|
|
87
|
+
session.df = df
|
|
88
|
+
session.dataset_path = url
|
|
89
|
+
session.dataset_name = url.split("/")[-1].split("?")[0] or "url_import"
|
|
90
|
+
session._undo_stack.clear()
|
|
91
|
+
r, c = df.shape
|
|
92
|
+
return f"Loaded {r:,} rows x {c} columns from {url}"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
# import clipboard
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
@command("import clipboard", usage="import clipboard [--sep=\\t|,]")
|
|
100
|
+
def cmd_import_clipboard(session: Session, args: str) -> str:
|
|
101
|
+
"""Load tabular data pasted from a spreadsheet (Excel, Google Sheets, etc.).
|
|
102
|
+
|
|
103
|
+
The default separator is a tab character, which is what Excel/Sheets
|
|
104
|
+
copies to the clipboard. Use --sep=, for comma-separated text.
|
|
105
|
+
|
|
106
|
+
Example: import clipboard
|
|
107
|
+
Example: import clipboard --sep=,
|
|
108
|
+
"""
|
|
109
|
+
import polars as pl
|
|
110
|
+
|
|
111
|
+
ca = CommandArgs(args)
|
|
112
|
+
sep_raw = ca.options.get("sep", "\\t")
|
|
113
|
+
sep = "\t" if sep_raw in ("\\t", "\t") else sep_raw
|
|
114
|
+
|
|
115
|
+
# Retrieve clipboard contents
|
|
116
|
+
text: str | None = None
|
|
117
|
+
|
|
118
|
+
try:
|
|
119
|
+
import pyperclip
|
|
120
|
+
text = pyperclip.paste()
|
|
121
|
+
except ImportError:
|
|
122
|
+
pass
|
|
123
|
+
|
|
124
|
+
if text is None:
|
|
125
|
+
# Fallback: platform-specific subprocess
|
|
126
|
+
try:
|
|
127
|
+
import subprocess
|
|
128
|
+
platform = sys.platform
|
|
129
|
+
if platform == "darwin":
|
|
130
|
+
result = subprocess.run(["pbpaste"], capture_output=True, text=True, timeout=5)
|
|
131
|
+
text = result.stdout
|
|
132
|
+
elif platform.startswith("linux"):
|
|
133
|
+
result = subprocess.run(
|
|
134
|
+
["xclip", "-selection", "clipboard", "-o"],
|
|
135
|
+
capture_output=True, text=True, timeout=5,
|
|
136
|
+
)
|
|
137
|
+
if result.returncode != 0:
|
|
138
|
+
result = subprocess.run(
|
|
139
|
+
["xsel", "--clipboard", "--output"],
|
|
140
|
+
capture_output=True, text=True, timeout=5,
|
|
141
|
+
)
|
|
142
|
+
text = result.stdout
|
|
143
|
+
elif platform == "win32":
|
|
144
|
+
result = subprocess.run(
|
|
145
|
+
["powershell", "-command", "Get-Clipboard"],
|
|
146
|
+
capture_output=True, text=True, timeout=5,
|
|
147
|
+
)
|
|
148
|
+
text = result.stdout
|
|
149
|
+
except Exception as exc:
|
|
150
|
+
return (
|
|
151
|
+
f"Could not read clipboard: {exc}\n"
|
|
152
|
+
"Install pyperclip for reliable clipboard support: pip install pyperclip"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
if not text or not text.strip():
|
|
156
|
+
return "Clipboard is empty or contains no text."
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
df = pl.read_csv(
|
|
160
|
+
io.StringIO(text),
|
|
161
|
+
separator=sep,
|
|
162
|
+
infer_schema_length=1000,
|
|
163
|
+
)
|
|
164
|
+
except Exception as exc:
|
|
165
|
+
return friendly_error(exc, "import clipboard")
|
|
166
|
+
|
|
167
|
+
session.snapshot()
|
|
168
|
+
session.df = df
|
|
169
|
+
session.dataset_path = "clipboard"
|
|
170
|
+
session.dataset_name = "clipboard"
|
|
171
|
+
session._undo_stack.clear()
|
|
172
|
+
r, c = df.shape
|
|
173
|
+
sep_display = "tab" if sep == "\t" else repr(sep)
|
|
174
|
+
return f"Loaded {r:,} rows x {c} columns from clipboard (sep={sep_display})."
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# import spss
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
# Mapping of SPSS commands to OpenStat equivalents.
|
|
182
|
+
# Each entry: (regex_pattern, replacement_template | None)
|
|
183
|
+
# None means we emit an [untranslated] comment.
|
|
184
|
+
|
|
185
|
+
# NOTE(review): this table is not referenced anywhere in this module —
# _translate_spss_line() below re-implements the same patterns inline.
# Confirm whether it is consumed elsewhere before removing.
_SPSS_RULES: list[tuple[re.Pattern, str | None]] = [
    # GET FILE = 'path/to/data.sav'.
    (
        re.compile(r"^GET\s+FILE\s*=\s*['\"]?([^'\".\s]+\S*?)['\"]?\s*\.?\s*$", re.IGNORECASE),
        r"load \1",
    ),
    # SAVE OUTFILE = 'path'.
    (
        re.compile(r"^SAVE\s+OUTFILE\s*=\s*['\"]?([^'\".\s]+\S*?)['\"]?\s*\.?\s*$", re.IGNORECASE),
        r"save \1",
    ),
    # REGRESSION VARIABLES = y x1 x2 / DEPENDENT y ...
    (
        re.compile(
            r"^REGRESSION\s+.*VARIABLES\s*=\s*([\w\s]+?)(?:\s*/.*)?\.?\s*$",
            re.IGNORECASE | re.DOTALL,
        ),
        None,  # complex to auto-translate; emit comment
    ),
    # FREQUENCIES VARIABLES = col1 col2.
    (
        re.compile(r"^FREQUENCIES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        None,  # -> tabulate (needs single col); emit comment
    ),
    # DESCRIPTIVES VARIABLES = col1 col2.
    (
        re.compile(r"^DESCRIPTIVES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        None,
    ),
    # CORRELATIONS VARIABLES = col1 col2.
    (
        re.compile(r"^CORRELATIONS\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        None,
    ),
    # COMPUTE newvar = expression.
    (
        re.compile(r"^COMPUTE\s+(\w+)\s*=\s*(.+?)\.?\s*$", re.IGNORECASE),
        r"generate \1 = \2",
    ),
    # SELECT IF (condition).
    (
        re.compile(r"^SELECT\s+IF\s+\((.+?)\)\.?\s*$", re.IGNORECASE),
        None,
    ),
    # RECODE var (old=new) ...
    (
        re.compile(r"^RECODE\s+.+$", re.IGNORECASE),
        None,
    ),
]
|
|
235
|
+
|
|
236
|
+
# Simple direct substitution rules (SPSS keyword -> OpenStat command prefix)
|
|
237
|
+
# NOTE(review): like _SPSS_RULES above, this table is not referenced in this
# module; _translate_spss_line() duplicates these mappings inline. Confirm
# external usage before removing.
_SPSS_SIMPLE: list[tuple[re.Pattern, str]] = [
    (
        re.compile(r"^FREQUENCIES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        "tabulate",
    ),
    (
        re.compile(r"^DESCRIPTIVES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        "summarize",
    ),
    (
        re.compile(r"^CORRELATIONS\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", re.IGNORECASE),
        "correlate",
    ),
]
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _translate_spss_line(line: str) -> str:
|
|
254
|
+
"""Translate a single SPSS syntax line to an OpenStat line."""
|
|
255
|
+
stripped = line.strip()
|
|
256
|
+
if not stripped or stripped.startswith("*") or stripped.startswith("/*"):
|
|
257
|
+
# SPSS comment
|
|
258
|
+
return f"# {stripped.lstrip('*').strip()}" if stripped else ""
|
|
259
|
+
|
|
260
|
+
# GET FILE
|
|
261
|
+
m = re.match(r"^GET\s+FILE\s*=\s*['\"]?([^'\".\s]+\S*?)['\"]?\s*\.?\s*$", stripped, re.IGNORECASE)
|
|
262
|
+
if m:
|
|
263
|
+
return f"load {m.group(1)}"
|
|
264
|
+
|
|
265
|
+
# SAVE OUTFILE
|
|
266
|
+
m = re.match(r"^SAVE\s+OUTFILE\s*=\s*['\"]?([^'\".\s]+\S*?)['\"]?\s*\.?\s*$", stripped, re.IGNORECASE)
|
|
267
|
+
if m:
|
|
268
|
+
return f"save {m.group(1)}"
|
|
269
|
+
|
|
270
|
+
# COMPUTE newvar = expr.
|
|
271
|
+
m = re.match(r"^COMPUTE\s+(\w+)\s*=\s*(.+?)\.?\s*$", stripped, re.IGNORECASE)
|
|
272
|
+
if m:
|
|
273
|
+
return f"generate {m.group(1)} = {m.group(2)}"
|
|
274
|
+
|
|
275
|
+
# FREQUENCIES VARIABLES = ...
|
|
276
|
+
m = re.match(r"^FREQUENCIES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", stripped, re.IGNORECASE)
|
|
277
|
+
if m:
|
|
278
|
+
first_col = m.group(1).split()[0]
|
|
279
|
+
rest = m.group(1).split()[1:]
|
|
280
|
+
extra = " ".join(rest)
|
|
281
|
+
note = f" # [note] original cols: {extra}" if extra else ""
|
|
282
|
+
return f"tabulate {first_col}{note}"
|
|
283
|
+
|
|
284
|
+
# DESCRIPTIVES VARIABLES = ...
|
|
285
|
+
m = re.match(r"^DESCRIPTIVES\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", stripped, re.IGNORECASE)
|
|
286
|
+
if m:
|
|
287
|
+
cols = m.group(1).strip()
|
|
288
|
+
return f"summarize {cols}"
|
|
289
|
+
|
|
290
|
+
# CORRELATIONS VARIABLES = ...
|
|
291
|
+
m = re.match(r"^CORRELATIONS\s+VARIABLES\s*=\s*([\w\s]+?)\.?\s*$", stripped, re.IGNORECASE)
|
|
292
|
+
if m:
|
|
293
|
+
cols = m.group(1).strip()
|
|
294
|
+
return f"correlate {cols}"
|
|
295
|
+
|
|
296
|
+
# REGRESSION — complex, emit annotated comment + best-effort ols
|
|
297
|
+
m = re.match(
|
|
298
|
+
r"^REGRESSION\s+.*?DEPENDENT\s*=?\s*(\w+)\s+.*?ENTER\s+([\w\s]+?)\.?\s*$",
|
|
299
|
+
stripped, re.IGNORECASE | re.DOTALL,
|
|
300
|
+
)
|
|
301
|
+
if m:
|
|
302
|
+
dep = m.group(1)
|
|
303
|
+
indeps = m.group(2).strip()
|
|
304
|
+
return f"ols {dep} {indeps} # [translated from REGRESSION]"
|
|
305
|
+
|
|
306
|
+
# SELECT IF
|
|
307
|
+
m = re.match(r"^SELECT\s+IF\s+\((.+?)\)\.?\s*$", stripped, re.IGNORECASE)
|
|
308
|
+
if m:
|
|
309
|
+
cond = m.group(1)
|
|
310
|
+
return (
|
|
311
|
+
f"# [untranslated] SELECT IF ({cond})\n"
|
|
312
|
+
f"# Manual equivalent: filter {cond}"
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
# RECODE
|
|
316
|
+
if re.match(r"^RECODE\b", stripped, re.IGNORECASE):
|
|
317
|
+
return (
|
|
318
|
+
f"# [untranslated] {stripped}\n"
|
|
319
|
+
"# Manual equivalent: recode <col> old=new ..."
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
# Anything else
|
|
323
|
+
return f"# [untranslated] {stripped}"
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@command("import spss", usage="import spss <syntax.sps> [--out=<script.ost>] [--run]")
|
|
327
|
+
def cmd_import_spss(session: Session, args: str) -> str:
|
|
328
|
+
"""Translate an SPSS syntax file (.sps) into an OpenStat script (.ost).
|
|
329
|
+
|
|
330
|
+
Translated commands:
|
|
331
|
+
GET FILE -> load
|
|
332
|
+
SAVE OUTFILE -> save
|
|
333
|
+
COMPUTE -> generate
|
|
334
|
+
FREQUENCIES -> tabulate
|
|
335
|
+
DESCRIPTIVES -> summarize
|
|
336
|
+
CORRELATIONS -> correlate
|
|
337
|
+
REGRESSION -> ols (best-effort)
|
|
338
|
+
SELECT IF -> filter (emitted as comment with hint)
|
|
339
|
+
RECODE -> replace (emitted as comment with hint)
|
|
340
|
+
All other lines are kept as [untranslated] comments.
|
|
341
|
+
|
|
342
|
+
Use --run to immediately execute the translated script.
|
|
343
|
+
|
|
344
|
+
Example: import spss analysis.sps --out=analysis.ost
|
|
345
|
+
"""
|
|
346
|
+
ca = CommandArgs(args)
|
|
347
|
+
if not ca.positional:
|
|
348
|
+
return "Usage: import spss <syntax.sps> [--out=<script.ost>] [--run]"
|
|
349
|
+
|
|
350
|
+
sps_path = ca.positional[0]
|
|
351
|
+
out_path = ca.options.get("out")
|
|
352
|
+
do_run = ca.has_flag("--run")
|
|
353
|
+
|
|
354
|
+
try:
|
|
355
|
+
raw = Path(sps_path).read_text(encoding="utf-8", errors="replace")
|
|
356
|
+
except FileNotFoundError:
|
|
357
|
+
return f"File not found: {sps_path}"
|
|
358
|
+
except Exception as exc:
|
|
359
|
+
return f"Cannot read '{sps_path}': {exc}"
|
|
360
|
+
|
|
361
|
+
# SPSS statements can span multiple lines ending with '.'; we join continuation lines.
|
|
362
|
+
# Simple heuristic: lines not ending with '.' that are not blank are joined to the next.
|
|
363
|
+
joined_lines: list[str] = []
|
|
364
|
+
buffer = ""
|
|
365
|
+
for raw_line in raw.splitlines():
|
|
366
|
+
stripped = raw_line.strip()
|
|
367
|
+
if not stripped:
|
|
368
|
+
if buffer:
|
|
369
|
+
joined_lines.append(buffer)
|
|
370
|
+
buffer = ""
|
|
371
|
+
joined_lines.append("")
|
|
372
|
+
continue
|
|
373
|
+
buffer = (buffer + " " + stripped).strip() if buffer else stripped
|
|
374
|
+
if buffer.endswith("."):
|
|
375
|
+
joined_lines.append(buffer.rstrip("."))
|
|
376
|
+
buffer = ""
|
|
377
|
+
if buffer:
|
|
378
|
+
joined_lines.append(buffer)
|
|
379
|
+
|
|
380
|
+
ost_lines: list[str] = [
|
|
381
|
+
f"# OpenStat script translated from SPSS: {sps_path}",
|
|
382
|
+
"#",
|
|
383
|
+
]
|
|
384
|
+
untranslated_count = 0
|
|
385
|
+
translated_count = 0
|
|
386
|
+
|
|
387
|
+
for line in joined_lines:
|
|
388
|
+
if not line.strip():
|
|
389
|
+
ost_lines.append("")
|
|
390
|
+
continue
|
|
391
|
+
translated = _translate_spss_line(line)
|
|
392
|
+
ost_lines.append(translated)
|
|
393
|
+
if "[untranslated]" in translated:
|
|
394
|
+
untranslated_count += 1
|
|
395
|
+
else:
|
|
396
|
+
translated_count += 1
|
|
397
|
+
|
|
398
|
+
ost_text = "\n".join(ost_lines)
|
|
399
|
+
|
|
400
|
+
if out_path:
|
|
401
|
+
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
|
|
402
|
+
Path(out_path).write_text(ost_text, encoding="utf-8")
|
|
403
|
+
result_msg = (
|
|
404
|
+
f"Translated '{sps_path}' -> '{out_path}'\n"
|
|
405
|
+
f" Translated: {translated_count} lines, "
|
|
406
|
+
f"untranslated (kept as comments): {untranslated_count} lines."
|
|
407
|
+
)
|
|
408
|
+
else:
|
|
409
|
+
result_msg = (
|
|
410
|
+
f"# --- Translated output (not saved) ---\n"
|
|
411
|
+
f"{ost_text}\n"
|
|
412
|
+
f"# --- Translated: {translated_count}, untranslated: {untranslated_count} ---"
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
if do_run and out_path:
|
|
416
|
+
try:
|
|
417
|
+
from openstat.script_runner import run_script
|
|
418
|
+
run_script(session, Path(out_path))
|
|
419
|
+
result_msg += "\nScript executed."
|
|
420
|
+
except Exception as exc:
|
|
421
|
+
result_msg += f"\nScript execution failed: {exc}"
|
|
422
|
+
|
|
423
|
+
return result_msg
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
# ---------------------------------------------------------------------------
|
|
427
|
+
# webhook
|
|
428
|
+
# ---------------------------------------------------------------------------
|
|
429
|
+
|
|
430
|
+
def _extract_json_path(data: object, dotpath: str) -> object:
|
|
431
|
+
"""Traverse a dotted path like 'data.records' through nested dicts/lists."""
|
|
432
|
+
parts = dotpath.strip().split(".")
|
|
433
|
+
current = data
|
|
434
|
+
for part in parts:
|
|
435
|
+
if isinstance(current, dict):
|
|
436
|
+
current = current.get(part)
|
|
437
|
+
elif isinstance(current, list) and part.isdigit():
|
|
438
|
+
current = current[int(part)]
|
|
439
|
+
else:
|
|
440
|
+
return None
|
|
441
|
+
if current is None:
|
|
442
|
+
return None
|
|
443
|
+
return current
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
@command("webhook", usage="webhook <url> [--method=GET|POST] [--params=key:val,key:val]")
|
|
447
|
+
def cmd_webhook(session: Session, args: str) -> str:
|
|
448
|
+
"""Fetch data from a REST API endpoint and load as a DataFrame.
|
|
449
|
+
|
|
450
|
+
Options:
|
|
451
|
+
--method=GET|POST HTTP method (default GET).
|
|
452
|
+
--params=key:val,key:val Request parameters (GET query string or POST body).
|
|
453
|
+
--token=<bearer_token> Authorization: Bearer <token> header.
|
|
454
|
+
--json_path=<dotpath> Navigate into nested JSON (e.g. data.records).
|
|
455
|
+
|
|
456
|
+
The response must be JSON. Arrays of objects are loaded directly;
|
|
457
|
+
nested structures are extracted with --json_path.
|
|
458
|
+
|
|
459
|
+
Example: webhook https://api.example.com/records
|
|
460
|
+
Example: webhook https://api.example.com/search --method=POST --params=q:hello,limit:100
|
|
461
|
+
Example: webhook https://api.example.com/v2/items --token=abc123 --json_path=data.items
|
|
462
|
+
"""
|
|
463
|
+
import polars as pl
|
|
464
|
+
|
|
465
|
+
ca = CommandArgs(args)
|
|
466
|
+
if not ca.positional:
|
|
467
|
+
return "Usage: webhook <url> [--method=GET|POST] [--params=key:val,key:val]"
|
|
468
|
+
|
|
469
|
+
url = ca.positional[0]
|
|
470
|
+
method = ca.options.get("method", "GET").upper()
|
|
471
|
+
token = ca.options.get("token")
|
|
472
|
+
json_path = ca.options.get("json_path")
|
|
473
|
+
params_raw = ca.options.get("params", "")
|
|
474
|
+
|
|
475
|
+
if method not in ("GET", "POST"):
|
|
476
|
+
return f"Unsupported method '{method}'. Use GET or POST."
|
|
477
|
+
|
|
478
|
+
# Parse params: key:val,key:val
|
|
479
|
+
params: dict[str, str] = {}
|
|
480
|
+
if params_raw:
|
|
481
|
+
for pair in params_raw.split(","):
|
|
482
|
+
pair = pair.strip()
|
|
483
|
+
if ":" not in pair:
|
|
484
|
+
return f"Invalid param '{pair}'. Use key:val format."
|
|
485
|
+
k, v = pair.split(":", 1)
|
|
486
|
+
params[k.strip()] = v.strip()
|
|
487
|
+
|
|
488
|
+
# Build headers
|
|
489
|
+
headers: dict[str, str] = {"Accept": "application/json"}
|
|
490
|
+
if token:
|
|
491
|
+
headers["Authorization"] = f"Bearer {token}"
|
|
492
|
+
|
|
493
|
+
try:
|
|
494
|
+
try:
|
|
495
|
+
import requests
|
|
496
|
+
if method == "GET":
|
|
497
|
+
resp = requests.get(url, params=params or None, headers=headers, timeout=60)
|
|
498
|
+
else:
|
|
499
|
+
resp = requests.post(url, json=params or None, headers=headers, timeout=60)
|
|
500
|
+
resp.raise_for_status()
|
|
501
|
+
payload = resp.json()
|
|
502
|
+
except ImportError:
|
|
503
|
+
# Fall back to urllib for GET without params easily serialisable
|
|
504
|
+
import urllib.request as _urllib
|
|
505
|
+
import urllib.parse as _urlparse
|
|
506
|
+
import json as _json
|
|
507
|
+
|
|
508
|
+
full_url = url
|
|
509
|
+
if method == "GET" and params:
|
|
510
|
+
full_url = url + "?" + _urlparse.urlencode(params)
|
|
511
|
+
|
|
512
|
+
req = _urllib.Request(full_url, headers=headers) # noqa: S310
|
|
513
|
+
if method == "POST" and params:
|
|
514
|
+
import json as _json2
|
|
515
|
+
body = _json2.dumps(params).encode()
|
|
516
|
+
req = _urllib.Request(full_url, data=body, headers={**headers, "Content-Type": "application/json"}) # noqa: S310
|
|
517
|
+
|
|
518
|
+
with _urllib.urlopen(req, timeout=60) as resp: # noqa: S310
|
|
519
|
+
payload = _json.loads(resp.read().decode("utf-8"))
|
|
520
|
+
|
|
521
|
+
except Exception as exc:
|
|
522
|
+
return f"Request to '{url}' failed: {exc}"
|
|
523
|
+
|
|
524
|
+
# Navigate to nested path if requested
|
|
525
|
+
if json_path:
|
|
526
|
+
payload = _extract_json_path(payload, json_path)
|
|
527
|
+
if payload is None:
|
|
528
|
+
return f"json_path '{json_path}' not found in response."
|
|
529
|
+
|
|
530
|
+
# Convert to DataFrame
|
|
531
|
+
try:
|
|
532
|
+
if isinstance(payload, list):
|
|
533
|
+
if not payload:
|
|
534
|
+
return "API returned an empty array."
|
|
535
|
+
df = pl.DataFrame(payload)
|
|
536
|
+
elif isinstance(payload, dict):
|
|
537
|
+
# Try to find an array field automatically
|
|
538
|
+
array_fields = [k for k, v in payload.items() if isinstance(v, list)]
|
|
539
|
+
if not array_fields:
|
|
540
|
+
# Treat the dict as a single-row DataFrame
|
|
541
|
+
df = pl.DataFrame([payload])
|
|
542
|
+
elif len(array_fields) == 1:
|
|
543
|
+
df = pl.DataFrame(payload[array_fields[0]])
|
|
544
|
+
else:
|
|
545
|
+
# Multiple arrays — ask user to specify
|
|
546
|
+
return (
|
|
547
|
+
f"Response contains multiple array fields: {', '.join(array_fields)}.\n"
|
|
548
|
+
f"Use --json_path=<field> to select one. Example: --json_path={array_fields[0]}"
|
|
549
|
+
)
|
|
550
|
+
else:
|
|
551
|
+
return f"Unexpected JSON type: {type(payload).__name__}. Expected list or object."
|
|
552
|
+
except Exception as exc:
|
|
553
|
+
return friendly_error(exc, "webhook")
|
|
554
|
+
|
|
555
|
+
session.snapshot()
|
|
556
|
+
session.df = df
|
|
557
|
+
session.dataset_path = url
|
|
558
|
+
session.dataset_name = url.split("/")[-1].split("?")[0] or "webhook"
|
|
559
|
+
session._undo_stack.clear()
|
|
560
|
+
r, c = df.shape
|
|
561
|
+
return f"Loaded {r:,} rows x {c} columns from {url} [{method}]."
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Influence and diagnostics commands: dfbeta, leverage, cooksd, outlier, avplot, coefplot."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
12
|
+
opts: dict[str, str] = {}
|
|
13
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
14
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
15
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
16
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
17
|
+
return positional, opts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_last_ols(session: Session):
|
|
21
|
+
m = session._last_model
|
|
22
|
+
if m is None:
|
|
23
|
+
return None, None, None
|
|
24
|
+
if hasattr(m, "model") and hasattr(m, "dep"):
|
|
25
|
+
return m, m.dep, m.indeps
|
|
26
|
+
if isinstance(m, dict) and "dep" in m and "indeps" in m:
|
|
27
|
+
return m, m["dep"], m["indeps"]
|
|
28
|
+
return None, None, None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@command("dfbeta", usage="dfbeta [dep indeps]")
|
|
32
|
+
def cmd_dfbeta(session: Session, args: str) -> str:
|
|
33
|
+
"""Compute DFBETAs for last OLS or specified variables."""
|
|
34
|
+
from openstat.stats.influence import compute_influence
|
|
35
|
+
df = session.require_data()
|
|
36
|
+
positional, opts = _stata_opts(args)
|
|
37
|
+
if len(positional) >= 2:
|
|
38
|
+
dep, indeps = positional[0], positional[1:]
|
|
39
|
+
else:
|
|
40
|
+
_, dep, indeps = _get_last_ols(session)
|
|
41
|
+
if dep is None:
|
|
42
|
+
return "Specify dep and indeps, or fit a model first."
|
|
43
|
+
indeps = [c for c in indeps if c in df.columns]
|
|
44
|
+
if not indeps or dep not in df.columns:
|
|
45
|
+
return "Invalid variables."
|
|
46
|
+
try:
|
|
47
|
+
r = compute_influence(df, dep, indeps)
|
|
48
|
+
lines = ["\nDFBETA Statistics", "-" * 50]
|
|
49
|
+
for name, vals in r["dfbetas"].items():
|
|
50
|
+
import numpy as np
|
|
51
|
+
arr = np.array(vals)
|
|
52
|
+
lines.append(f" {name:<20} max|DFBETA| = {np.abs(arr).max():.4f}")
|
|
53
|
+
lines.append(f"\n Threshold (2/sqrt(n)): {2/r['n_obs']**0.5:.4f}")
|
|
54
|
+
return "\n".join(lines)
|
|
55
|
+
except Exception as exc:
|
|
56
|
+
return f"dfbeta error: {exc}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@command("leverage", usage="leverage [dep indeps]")
|
|
60
|
+
def cmd_leverage(session: Session, args: str) -> str:
|
|
61
|
+
"""Show leverage statistics for OLS regression."""
|
|
62
|
+
from openstat.stats.influence import compute_influence
|
|
63
|
+
df = session.require_data()
|
|
64
|
+
positional, opts = _stata_opts(args)
|
|
65
|
+
if len(positional) >= 2:
|
|
66
|
+
dep, indeps = positional[0], positional[1:]
|
|
67
|
+
else:
|
|
68
|
+
_, dep, indeps = _get_last_ols(session)
|
|
69
|
+
if dep is None:
|
|
70
|
+
return "Specify dep and indeps, or fit a model first."
|
|
71
|
+
indeps = [c for c in indeps if c in df.columns]
|
|
72
|
+
try:
|
|
73
|
+
r = compute_influence(df, dep, indeps)
|
|
74
|
+
lines = ["\nLeverage Statistics", "-" * 50]
|
|
75
|
+
lines.append(f" {'High leverage threshold':<35} {r['high_leverage_threshold']:.4f}")
|
|
76
|
+
lines.append(f" {'Observations with high leverage':<35} {r['n_high_leverage']}")
|
|
77
|
+
import numpy as np
|
|
78
|
+
lev = np.array(r["leverage"])
|
|
79
|
+
lines.append(f" {'Mean leverage':<35} {lev.mean():.4f}")
|
|
80
|
+
lines.append(f" {'Max leverage':<35} {lev.max():.4f}")
|
|
81
|
+
return "\n".join(lines)
|
|
82
|
+
except Exception as exc:
|
|
83
|
+
return f"leverage error: {exc}"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@command("cooksd", usage="cooksd [dep indeps]")
|
|
87
|
+
def cmd_cooksd(session: Session, args: str) -> str:
|
|
88
|
+
"""Compute Cook's distance for influence detection."""
|
|
89
|
+
from openstat.stats.influence import compute_influence
|
|
90
|
+
df = session.require_data()
|
|
91
|
+
positional, opts = _stata_opts(args)
|
|
92
|
+
if len(positional) >= 2:
|
|
93
|
+
dep, indeps = positional[0], positional[1:]
|
|
94
|
+
else:
|
|
95
|
+
_, dep, indeps = _get_last_ols(session)
|
|
96
|
+
if dep is None:
|
|
97
|
+
return "Specify dep and indeps, or fit a model first."
|
|
98
|
+
indeps = [c for c in indeps if c in df.columns]
|
|
99
|
+
try:
|
|
100
|
+
r = compute_influence(df, dep, indeps)
|
|
101
|
+
lines = ["\nCook's Distance", "-" * 50]
|
|
102
|
+
lines.append(f" {'Threshold (4/n)':<35} {r['high_cooks_threshold']:.4f}")
|
|
103
|
+
lines.append(f" {'Influential observations':<35} {r['n_high_cooks']}")
|
|
104
|
+
import numpy as np
|
|
105
|
+
cd = np.array(r["cooks_d"])
|
|
106
|
+
lines.append(f" {'Max Cook''s D':<35} {cd.max():.4f}")
|
|
107
|
+
if r["n_high_cooks"] > 0:
|
|
108
|
+
top = np.argsort(cd)[::-1][:5]
|
|
109
|
+
lines.append(f" Top influential obs (index): {top.tolist()}")
|
|
110
|
+
return "\n".join(lines)
|
|
111
|
+
except Exception as exc:
|
|
112
|
+
return f"cooksd error: {exc}"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@command("outlier", usage="outlier dep indeps [threshold(3.0)]")
|
|
116
|
+
def cmd_outlier(session: Session, args: str) -> str:
|
|
117
|
+
"""Detect outliers by studentized residuals."""
|
|
118
|
+
from openstat.stats.influence import detect_outliers
|
|
119
|
+
df = session.require_data()
|
|
120
|
+
positional, opts = _stata_opts(args)
|
|
121
|
+
if len(positional) < 2:
|
|
122
|
+
return "Usage: outlier dep indeps [threshold(3.0)]"
|
|
123
|
+
dep = positional[0]
|
|
124
|
+
indeps = [c for c in positional[1:] if c in df.columns]
|
|
125
|
+
threshold = float(opts.get("threshold", 3.0))
|
|
126
|
+
try:
|
|
127
|
+
r = detect_outliers(df, dep, indeps, threshold=threshold)
|
|
128
|
+
lines = [f"\nOutlier Detection (|studentized resid| > {threshold})", "-" * 50]
|
|
129
|
+
lines.append(f" Outliers found: {r['n_outliers']}")
|
|
130
|
+
if r["outlier_indices"]:
|
|
131
|
+
lines.append(f" Outlier indices: {r['outlier_indices'][:20]}")
|
|
132
|
+
return "\n".join(lines)
|
|
133
|
+
except Exception as exc:
|
|
134
|
+
return f"outlier error: {exc}"
|