openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Stata .do file importer: convert Stata syntax to OpenStat .ost script."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
9
|
+
from openstat.session import Session
|
|
10
|
+
|
|
11
|
+
# ── Stata → OpenStat translation rules ──────────────────────────────────────
|
|
12
|
+
# Each rule is (pattern, replacement_or_callable)
|
|
13
|
+
# Applied in order; first match wins for each line.
|
|
14
|
+
|
|
15
|
+
_RULES: list[tuple[re.Pattern, object]] = []


def _rule(pattern: str, repl):
    """Register one translation rule.

    ``pattern`` is compiled case-insensitively and appended to ``_RULES``
    together with ``repl``, which is either a fixed replacement string or a
    callable that receives the ``re.Match`` and returns the translated line.
    """
    _RULES.append((re.compile(pattern, re.IGNORECASE), repl))


def _abbrev(word: str, min_len: int) -> str:
    """Return a regex fragment matching any prefix of *word* of >= *min_len* chars.

    Stata commands may be abbreviated (``su``, ``sum``, ``summ`` ... all mean
    ``summarize``), so the fragment is the mandatory head followed by nested
    optional groups for each remaining letter.
    """
    fragment = ""
    for ch in reversed(word[min_len:]):
        fragment = f"(?:{re.escape(ch)}{fragment})?"
    return re.escape(word[:min_len]) + fragment


def _loop_header(keyword: str):
    """Build a replacement callable for ``forvalues`` / ``foreach`` headers.

    Stata requires the opening ``{`` on the header line, so it is stripped
    from the captured text before exactly one ``{`` is appended.
    """
    def build(m: re.Match) -> str:
        spec = m.group(1).strip().rstrip("{").rstrip()
        return f"{keyword} {spec} {{"
    return build


def _unwrap_prefix(m: re.Match) -> str:
    """Drop a ``quietly`` / ``capture`` prefix and translate the wrapped command."""
    inner = m.group(1).strip()
    if inner == "{":
        # Block form (``quietly {``): keep the brace so it still pairs with
        # the closing ``}`` emitted by the brace rule.
        return inner
    return _translate_line(inner)


# Comments
_rule(r"^\s*\*.*$", lambda m: "# " + m.group(0).lstrip("* \t"))
_rule(r"^(\s*)//(.*)$", lambda m: m.group(1) + "# " + m.group(2))

# use → load
_rule(r"^\s*use\s+[\"']?(.+?)[\"']?\s*(?:,\s*clear)?\s*$",
      lambda m: f"load {m.group(1).strip()}")

# save → save
_rule(r"^\s*save\s+[\"']?(.+?)[\"']?\s*(?:,\s*replace)?\s*$",
      lambda m: f"save {m.group(1).strip()}")

# summarize → summarize (the word boundary stops e.g. "sureg" from matching)
_rule(r"^\s*" + _abbrev("summarize", 2) + r"\b\s*(.*)",
      lambda m: f"summarize {m.group(1).strip()}".rstrip())

# describe → describe (arguments are dropped)
_rule(r"^\s*" + _abbrev("describe", 4) + r"\b.*", "describe")

# regress → ols
_rule(r"^\s*" + _abbrev("regress", 3) + r"\s+(\S+)\s+(.*)",
      lambda m: f"ols {m.group(1)} {m.group(2).strip()}")

# logit → logit (same)
_rule(r"^\s*logit\s+(\S+)\s+(.*)",
      lambda m: f"logit {m.group(1)} {m.group(2).strip()}")

# probit → probit (same)
_rule(r"^\s*probit\s+(\S+)\s+(.*)",
      lambda m: f"probit {m.group(1)} {m.group(2).strip()}")

# poisson → poisson (same)
_rule(r"^\s*poisson\s+(\S+)\s+(.*)",
      lambda m: f"poisson {m.group(1)} {m.group(2).strip()}")

# tab → tabulate
_rule(r"^\s*" + _abbrev("tabulate", 3) + r"\s+(.*)",
      lambda m: f"tabulate {m.group(1).strip()}")

# cor → correlate
_rule(r"^\s*" + _abbrev("correlate", 3) + r"\s+(.*)",
      lambda m: f"correlate {m.group(1).strip()}")

# drop if → filter hint. Dropping rows that match a condition is the same as
# keeping rows that do NOT match it, so the hint shows the negated condition.
# The line stays a "[manual]" comment: the exact filter syntax must be
# confirmed by the user.
_rule(r"^\s*drop\s+if\s+(.*)",
      lambda m: f"# [manual] drop if {m.group(1)} → filter not ({_stata_cond(m.group(1))})")

# keep if → filter hint
_rule(r"^\s*keep\s+if\s+(.*)",
      lambda m: f"# [manual] keep if {m.group(1)} → filter {_stata_cond(m.group(1))}")

# drop varlist → drop (``if\b`` so variables merely starting with "if" still match)
_rule(r"^\s*drop\s+(?!if\b)(.*)", lambda m: f"drop {m.group(1).strip()}")

# keep varlist → keep
_rule(r"^\s*keep\s+(?!if\b)(.*)", lambda m: f"keep {m.group(1).strip()}")

# gen/generate → generate. The name part is non-greedy so the split happens
# at the FIRST "=" and later "==" comparisons stay inside the expression.
_rule(r"^\s*" + _abbrev("generate", 3) + r"\s+(.+?)=(.+)",
      lambda m: f"generate {m.group(1).strip()} = {m.group(2).strip()}")

# replace → replace (same first-"=" split as generate)
_rule(r"^\s*replace\s+(.+?)=(.+)",
      lambda m: f"replace {m.group(1).strip()} = {m.group(2).strip()}")

# rename → rename
_rule(r"^\s*rename\s+(\S+)\s+(\S+)", lambda m: f"rename {m.group(1)} {m.group(2)}")

# sort → sort
_rule(r"^\s*sort\s+(.*)", lambda m: f"sort {m.group(1).strip()}")

# set seed → set seed
_rule(r"^\s*set\s+seed\s+(\d+)", lambda m: f"set seed {m.group(1)}")

# xtset → xtset (panel)
_rule(r"^\s*xtset\s+(.*)", lambda m: f"xtset {m.group(1).strip()}")

# tsset → tsset
_rule(r"^\s*tsset\s+(.*)", lambda m: f"tsset {m.group(1).strip()}")

# stset → stset
_rule(r"^\s*stset\s+(.*)", lambda m: f"stset {m.group(1).strip()}")

# display → kept as a comment (no direct equivalent)
_rule(r"^\s*" + _abbrev("display", 2) + r"\s+(.*)",
      lambda m: f"# display {m.group(1).strip()}")

# local macro → define. Accepts both "local x 5" and "local x = 5" without
# duplicating the "=" in the output.
_rule(r"^\s*local\s+(\w+)(?:\s*=\s*|\s+)(.+)",
      lambda m: f"define {m.group(1)} = {m.group(2).strip()}")

# forvalues
_rule(r"^\s*forvalues\s+(.*)", _loop_header("forvalues"))

# foreach
_rule(r"^\s*foreach\s+(.*)", _loop_header("foreach"))

# closing brace of a block - pass through
_rule(r"^\s*\}\s*$", lambda m: "}")

# quietly → strip the prefix and translate the wrapped command
_rule(r"^\s*" + _abbrev("quietly", 3) + r"\b\s*:?\s*(.+)", _unwrap_prefix)

# capture → strip the prefix and translate the wrapped command
_rule(r"^\s*" + _abbrev("capture", 3) + r"\b\s*:?\s*(.+)", _unwrap_prefix)


def _stata_cond(cond: str) -> str:
    """Return the text used for *cond* in a ``filter`` hint.

    The operators ``==``, ``!=``, ``&`` and ``|`` are passed through
    unchanged, so this is currently an identity mapping; it is kept as the
    single place to add condition rewrites.
    """
    return cond


def _translate_line(line: str) -> str:
    """Translate a single Stata .do line to OpenStat syntax.

    Blank lines become ``""``. Otherwise the first rule in ``_RULES`` whose
    pattern matches at the start of the line decides the result; lines that
    match no rule are returned as ``# [untranslated] <line>`` comments.
    """
    stripped = line.rstrip()
    if not stripped.strip():
        return ""

    for pat, repl in _RULES:
        m = pat.match(stripped)
        if m:
            return repl(m) if callable(repl) else repl
    # No match — keep as comment with note
    return f"# [untranslated] {stripped}"


def _join_continuations(lines):
    """Yield logical lines, merging Stata ``///`` line continuations.

    A ``///`` preceded by whitespace (or at the start of an already continued
    line) means the command continues on the next physical line; anything
    after the marker is a comment and is dropped. Lines that are themselves
    comments (``*`` or ``//`` at the start) are passed through untouched.
    """
    pending: list[str] = []
    for raw in lines:
        text = raw.rstrip()
        starts_comment = not pending and text.lstrip().startswith(("*", "//"))
        head, marker, _ = text.partition("///")
        if marker and not starts_comment and (not head or head[-1].isspace()):
            pending.append(head.strip())
            continue
        if pending:
            pending.append(text.strip())
            yield " ".join(part for part in pending if part)
            pending = []
        else:
            yield raw
    if pending:
        # File ended in the middle of a continued command.
        yield " ".join(part for part in pending if part)


def convert_do_file(do_text: str) -> str:
    """Convert Stata .do file content to OpenStat .ost script.

    The result starts with a one-line header comment and a blank line,
    followed by one output line per logical input line (physical lines joined
    by ``///`` count as one). Lines are separated by ``"\\n"``.
    """
    out_lines = ["# OpenStat script — converted from Stata .do file", ""]
    out_lines.extend(
        _translate_line(line) for line in _join_continuations(do_text.splitlines())
    )
    return "\n".join(out_lines)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@command("import do", usage="import do <file.do> [--out=<file.ost>]")
def cmd_import_do(session: Session, args: str) -> str:
    """Convert a Stata .do file to an OpenStat .ost script.

    Translates common Stata commands (use, regress, gen, etc.) to their
    OpenStat equivalents. Untranslated lines are kept as comments.

    Options:
      --out=<path>   output .ost file path (default: same name, .ost extension)
      --run          also execute the converted script immediately

    Examples:
      import do analysis.do
      import do stata_script.do --out=my_analysis.ost --run

    Returns a human-readable status message: the conversion summary
    (optionally followed by the outcome of running the script), or an
    error/usage message.
    """
    ca = CommandArgs(args)
    # Ignore any "--flag" tokens so that "import do --run file.do" does not
    # treat the flag as the file name.
    positional = [p for p in ca.positional if not p.startswith("--")]
    if not positional:
        return "Usage: import do <file.do> [--out=<path>] [--run]"

    do_path = Path(positional[0])
    if not do_path.exists():
        return f"File not found: {do_path}"

    out_path = Path(ca.options.get("out", str(do_path.with_suffix(".ost"))))
    # Whole-token match: a file name that merely contains "--run" must not
    # trigger execution.
    run_after = "--run" in args.split()

    try:
        if out_path.resolve() == do_path.resolve():
            # Writing the output over the input would destroy the .do file.
            return f"Output path must differ from the input file: {do_path}"

        do_text = do_path.read_text(encoding="utf-8", errors="replace")
        ost_text = convert_do_file(do_text)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(ost_text, encoding="utf-8")

        ost_lines = ost_text.splitlines()
        n_lines = len(do_text.splitlines())
        # Both counters look at line prefixes, so marker text that merely
        # appears inside some other line is not miscounted.
        n_untranslated = sum(
            1 for line in ost_lines if line.startswith("# [untranslated]")
        )
        n_translated = sum(
            1 for line in ost_lines
            if line.strip()
            and not line.startswith(("# [untranslated]", "# OpenStat"))
        )

        result = (
            f"Converted: {do_path} → {out_path}\n"
            f"  Lines: {n_lines} | Translated: {n_translated} | "
            f"Untranslated (kept as comments): {n_untranslated}"
        )

        if run_after:
            from openstat.script_runner import run_script_advanced
            try:
                run_script_advanced(str(out_path), session)
                result += f"\nScript executed: {out_path}"
            except Exception as exc:
                # A failing script should not hide the successful conversion.
                result += f"\nScript error: {exc}"

        return result

    except Exception as e:
        return friendly_error(e, "import do")
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""String and date manipulation commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from openstat.commands.base import command
|
|
10
|
+
from openstat.session import Session
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
    """Split a Stata-style argument string into positional tokens and options.

    Every ``name(value)`` occurrence becomes an option keyed by the
    lower-cased name (a later duplicate overrides an earlier one). What
    remains is split on whitespace; commas at either end of a token are
    removed and tokens that end up empty are discarded.

    Returns ``(positional, opts)``.
    """
    opts: dict[str, str] = {
        name.lower(): value
        for name, value in re.findall(r'(\w+)\(([^)]*)\)', raw)
    }
    remainder = re.sub(r'\w+\([^)]*\)', '', raw)
    positional: list[str] = []
    for token in remainder.split():
        bare = token.strip(',')
        if bare:
            positional.append(bare)
    return positional, opts
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@command("split", usage="split varname [, sep(,) gen(newvar)]")
def cmd_split(session: Session, args: str) -> str:
    """Split a string column into multiple columns.

    The first positional token names the column to split. ``sep(...)`` gives
    the separator (default: a single space) and ``gen(...)`` the prefix of
    the new columns (default: the source column name); the new columns are
    named ``<prefix>1``, ``<prefix>2``, ... Rows with fewer parts than the
    widest row get nulls in the missing positions.

    Returns a status message, or an error/usage message.
    """
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if not positional:
        return "Usage: split varname [sep(,) gen(prefix)]"
    var = positional[0]
    if var not in df.columns:
        return f"Column '{var}' not found."
    sep = opts.get("sep", " ")
    prefix = opts.get("gen", var)
    session.snapshot()
    try:
        parts = df[var].str.split(sep)
        # Count parts inside polars: null cells are skipped by max() instead
        # of raising on len(None), and the column is not copied to Python.
        max_parts = parts.list.len().max()
        if not max_parts:
            # Empty column or only nulls: there is nothing to split.
            return f"split error: column '{var}' has no values to split"
        col_names = [f"{prefix}{i + 1}" for i in range(max_parts)]
        session.df = df.with_columns(
            [
                parts.list.get(i, null_on_oob=True).alias(name)
                for i, name in enumerate(col_names)
            ]
        )
        return f"Split '{var}' into {max_parts} columns: {col_names}"
    except Exception as exc:
        return f"split error: {exc}"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@command("strtrim", usage="strtrim varname [gen(newvar)]")
def cmd_strtrim(session: Session, args: str) -> str:
    """Trim whitespace from a string column.

    The first positional token names the source column. ``gen(newvar)``
    names the destination column; without it the source is overwritten.
    Returns a status message, or an error/usage message.
    """
    frame = session.require_data()
    names, options = _stata_opts(args)
    if len(names) == 0:
        return "Usage: strtrim varname [gen(newvar)]"

    source = names[0]
    if source not in frame.columns:
        return f"Column '{source}' not found."

    target = options.get("gen", source)
    session.snapshot()
    try:
        trimmed = pl.col(source).str.strip_chars().alias(target)
        session.df = frame.with_columns(trimmed)
    except Exception as exc:
        return f"strtrim error: {exc}"
    return f"Trimmed '{source}' → '{target}'"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@command("strupper", usage="strupper varname [gen(newvar)]")
def cmd_strupper(session: Session, args: str) -> str:
    """Convert string column to uppercase.

    The first positional token names the source column. ``gen(newvar)``
    names the destination column; without it the source is overwritten.
    Returns a status message, or an error/usage message.
    """
    frame = session.require_data()
    names, options = _stata_opts(args)
    if len(names) == 0:
        return "Usage: strupper varname [gen(newvar)]"

    source = names[0]
    if source not in frame.columns:
        return f"Column '{source}' not found."

    target = options.get("gen", source)
    session.snapshot()
    try:
        uppercased = pl.col(source).str.to_uppercase().alias(target)
        session.df = frame.with_columns(uppercased)
    except Exception as exc:
        return f"strupper error: {exc}"
    return f"Uppercased '{source}' → '{target}'"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@command("strlower", usage="strlower varname [gen(newvar)]")
def cmd_strlower(session: Session, args: str) -> str:
    """Convert string column to lowercase.

    The first positional token names the source column. ``gen(newvar)``
    names the destination column; without it the source is overwritten.
    Returns a status message, or an error/usage message.
    """
    frame = session.require_data()
    names, options = _stata_opts(args)
    if len(names) == 0:
        return "Usage: strlower varname [gen(newvar)]"

    source = names[0]
    if source not in frame.columns:
        return f"Column '{source}' not found."

    target = options.get("gen", source)
    session.snapshot()
    try:
        lowercased = pl.col(source).str.to_lowercase().alias(target)
        session.df = frame.with_columns(lowercased)
    except Exception as exc:
        return f"strlower error: {exc}"
    return f"Lowercased '{source}' → '{target}'"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@command("strreplace", usage="strreplace varname old new [gen(newvar)]")
def cmd_strreplace(session: Session, args: str) -> str:
    """Replace substring in a string column.

    Positional tokens are the column name, the text to find and its
    replacement. ``gen(newvar)`` names the destination column; without it
    the source column is overwritten. Every occurrence of ``old`` is
    replaced.

    Returns a status message, or an error/usage message.
    """
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 3:
        return "Usage: strreplace varname old new [gen(newvar)]"
    var, old_str, new_str = positional[0], positional[1], positional[2]
    if var not in df.columns:
        return f"Column '{var}' not found."
    new_var = opts.get("gen", var)
    session.snapshot()
    try:
        # literal=True: treat `old` as plain text. Without it polars reads
        # the pattern as a regular expression, so "." or "$" in either
        # argument would not behave as a substring replacement.
        session.df = df.with_columns(
            pl.col(var).str.replace_all(old_str, new_str, literal=True).alias(new_var)
        )
        return f"Replaced '{old_str}' with '{new_str}' in '{var}' → '{new_var}'"
    except Exception as exc:
        return f"strreplace error: {exc}"
|
|
124
|
+
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Survival analysis commands: stset, stcox, sts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.session import Session, ModelResult
|
|
8
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@command("stset", usage="stset <time_var>, failure(<event_var>)")
def cmd_stset(session: Session, args: str) -> str:
    """Declare survival time and failure event variables.

    Parses ``<time_var>, failure(<event_var>)``, checks that both columns
    exist, and stores them on the session as ``_surv_time_var`` and
    ``_surv_event_var``. The session is only updated once the event count
    could be computed, so a failed call leaves earlier settings intact.

    Returns a summary with the number of observations, events and censored
    rows, or an error/usage message.
    """
    df = session.require_data()

    m = re.search(r'failure\((\w+)\)', args)
    if not m:
        return "Usage: stset <time_var>, failure(<event_var>)"

    event_var = m.group(1)
    # Time var is everything before the comma
    time_part = args[:args.index(',')] if ',' in args else args[:m.start()]
    time_var = time_part.strip()

    if not time_var:
        return "Usage: stset <time_var>, failure(<event_var>)"
    if time_var not in df.columns:
        return f"Column not found: {time_var}"
    if event_var not in df.columns:
        return f"Column not found: {event_var}"

    n = df.height
    try:
        events = df[event_var].sum()
        if events is None:
            # No numeric total is available (e.g. the column holds no
            # summable values), so events cannot be counted.
            return f"Failure variable has no countable events: {event_var}"
        censored = n - events
    except Exception as e:
        return friendly_error(e, "stset")

    session._surv_time_var = time_var
    session._surv_event_var = event_var

    return (
        f"Survival time: {time_var}\n"
        f"Failure event: {event_var}\n"
        f"Observations: {n}, Events: {events}, Censored: {censored}"
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@command("stcox", usage="stcox x1 x2 [--robust]")
def cmd_stcox(session: Session, args: str) -> str:
    """Fit a Cox Proportional Hazards model.

    Requires the survival variables stored on the session by ``stset``. The
    positional arguments are the covariates; ``--robust`` is forwarded to
    ``fit_cox_ph``. On success the fitted model is recorded on the session
    (last-model fields and ``session.results``) and the summary table is
    returned, followed by any warnings. Otherwise an error/usage message is
    returned.
    """
    data = session.require_data()

    time_var = session._surv_time_var
    event_var = session._surv_event_var
    if time_var is None or event_var is None:
        return "Survival structure not set. Use: stset <time_var>, failure(<event_var>)"

    parsed = CommandArgs(args)
    use_robust = parsed.has_flag("--robust")
    covariates = []
    for token in parsed.positional:
        if not token.startswith("--"):
            covariates.append(token)
    if len(covariates) == 0:
        return "Usage: stcox x1 x2 [--robust]"

    unknown = [name for name in covariates if name not in data.columns]
    if unknown:
        return f"Columns not found: {', '.join(unknown)}"

    try:
        from openstat.stats.survival import fit_cox_ph

        fit, raw_model = fit_cox_ph(
            data, session._surv_time_var, session._surv_event_var,
            covariates, robust=use_robust,
        )

        session._last_model = raw_model
        session._last_model_vars = (session._surv_time_var, covariates)
        session._last_fit_result = fit
        session._last_fit_kwargs = {"survival": True}

        if hasattr(fit, "to_markdown"):
            table_md = fit.to_markdown()
        else:
            table_md = ""
        details = {
            "n_obs": fit.n_obs,
            "params": dict(fit.params),
            "log_likelihood": fit.log_likelihood,
        }
        session.results.append(
            ModelResult(name="Cox PH", formula=fit.formula, table=table_md, details=details)
        )

        text = fit.summary_table()
        if fit.warnings:
            text += "\n" + "\n".join(fit.warnings)
        return text
    except ImportError as err:
        return str(err)
    except Exception as err:
        return friendly_error(err, "stcox")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@command("sts", usage="sts graph [by=group] | sts test <group_var>")
def cmd_sts(session: Session, args: str) -> str:
    """Kaplan-Meier survival curves and log-rank test.

    Requires the survival variables stored on the session by ``stset``.

    ``sts graph [by=group]`` computes the Kaplan-Meier summary and then
    tries to save a plot; plotting is best-effort, so a plotting failure is
    reported in the returned text instead of aborting the command.

    ``sts test <group_var>`` returns the result of ``log_rank_test``.

    Any other input returns the usage message.
    """
    df = session.require_data()

    if session._surv_time_var is None or session._surv_event_var is None:
        return "Survival structure not set. Use: stset <time_var>, failure(<event_var>)"

    ca = CommandArgs(args)
    subcmd = ca.positional[0].lower() if ca.positional else ""

    if subcmd == "graph":
        group_var = ca.get_option("by")
        # Same check as "sts test": fail early with a clear message.
        if group_var and group_var not in df.columns:
            return f"Column not found: {group_var}"
        try:
            from openstat.stats.survival import kaplan_meier
            summary, kmf = kaplan_meier(
                df, session._surv_time_var, session._surv_event_var, group_var,
            )
        except ImportError as e:
            return str(e)
        except Exception as e:
            return friendly_error(e, "sts graph")

        # Plot: optional extra, must never discard the computed summary.
        try:
            from openstat.plots.surv_plots import plot_km
            path = plot_km(kmf, session.output_dir, group_var)
            session.plot_paths.append(str(path))
            summary += f"\nPlot saved: {path}"
        except Exception as exc:
            # Tell the user why no plot appeared instead of failing silently.
            summary += f"\nPlot not saved: {exc}"

        return summary

    if subcmd == "test":
        group_var = ca.positional[1] if len(ca.positional) > 1 else None
        if not group_var:
            return "Usage: sts test <group_var>"
        if group_var not in df.columns:
            return f"Column not found: {group_var}"
        try:
            from openstat.stats.survival import log_rank_test
            return log_rank_test(
                df, session._surv_time_var, session._surv_event_var, group_var,
            )
        except ImportError as e:
            return str(e)
        except Exception as e:
            return friendly_error(e, "sts test")

    return "Usage: sts graph [by=group] | sts test <group_var>"
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Survey weighting commands: svyset, svy: prefix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.session import Session, ModelResult
|
|
8
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
9
|
+
from openstat.dsl.parser import parse_formula, ParseError
|
|
10
|
+
from openstat.types import NUMERIC_DTYPES
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@command("svyset", usage="svyset <psu> [pw=<weight>], strata(<strata>)")
def cmd_svyset(session: Session, args: str) -> str:
    """Declare survey design: PSU, sampling weights, and strata.

    Recognised pieces of *args*:
      * ``[pw=<weight>]`` (also ``[pweight=<weight>]``, spaces allowed
        around ``=``) - sampling weight column
      * ``strata(<strata>)`` - strata column
      * the first remaining token - PSU column

    Every named column must exist in the data. On success the three names
    are stored on the session (``None`` for parts not given) and a summary
    is returned. If nothing is specified the usage text is returned and the
    session is left unchanged.
    """
    df = session.require_data()

    # Parse weight: [pw=weight_var]
    m_pw = re.search(r'\[\s*pw(?:eight)?\s*=\s*(\w+)\s*\]', args)
    weight_var = m_pw.group(1) if m_pw else None

    # Parse strata: strata(strata_var)
    m_strata = re.search(r'strata\(\s*(\w+)\s*\)', args)
    strata_var = m_strata.group(1) if m_strata else None

    # PSU is the first positional argument
    clean = args
    if m_pw:
        clean = clean.replace(m_pw.group(0), "")
    if m_strata:
        clean = clean.replace(m_strata.group(0), "")
    tokens = clean.replace(",", "").split()
    psu_var = tokens[0] if tokens else None

    if psu_var is None and weight_var is None and strata_var is None:
        # Nothing to declare: do not report an empty design as "set".
        return "Usage: svyset <psu> [pw=<weight>], strata(<strata>)"

    # Validate columns exist
    for var, label in [(psu_var, "PSU"), (weight_var, "weight"), (strata_var, "strata")]:
        if var and var not in df.columns:
            return f"{label} column not found: {var}"

    session._svy_psu_var = psu_var
    session._svy_weight_var = weight_var
    session._svy_strata_var = strata_var

    lines = ["Survey design set:"]
    if psu_var:
        lines.append(f"  PSU: {psu_var} ({df[psu_var].n_unique()} clusters)")
    if weight_var:
        lines.append(f"  Weight: {weight_var}")
    if strata_var:
        lines.append(f"  Strata: {strata_var} ({df[strata_var].n_unique()} strata)")
    return "\n".join(lines)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@command("svy:", usage="svy: summarize|ols|logit ...")
def cmd_svy(session: Session, args: str) -> str:
    """Run survey-weighted analysis. Requires svyset first.

    The first word of *args* (case-insensitive) selects the analysis -
    ``summarize``, ``ols`` or ``logit`` - and the remainder is handed to the
    matching helper. Returns that helper's output, or a usage/error message
    when no weight variable is set or the sub-command is unknown.
    """
    data = session.require_data()

    if session._svy_weight_var is None:
        return "Survey design not set. Use: svyset <psu> [pw=<weight>], strata(<strata>)"

    pieces = args.strip().split(None, 1)
    subcmd = pieces[0].lower() if len(pieces) >= 1 else ""
    remainder = pieces[1] if len(pieces) >= 2 else ""

    handlers = {
        "summarize": _svy_summarize,
        "ols": _svy_ols,
        "logit": _svy_logit,
    }
    handler = handlers.get(subcmd)
    if handler is None:
        return "Usage: svy: summarize [cols] | svy: ols y ~ x1 + x2 | svy: logit y ~ x1 + x2"
    return handler(session, data, remainder)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _svy_summarize(session: Session, df, args: str) -> str:
    """Weighted summary statistics.

    *args* is a whitespace-separated list of column names; when it is blank,
    every column whose dtype is in ``NUMERIC_DTYPES`` is used. Requested
    columns are checked against the data before ``weighted_summary`` is
    called with the session's survey weight variable.

    Returns the summary text, or an error message.
    """
    cols = args.split() if args.strip() else [c for c in df.columns if df[c].dtype in NUMERIC_DTYPES]
    if not cols:
        return "No numeric columns to summarize."

    # Report unknown names directly rather than letting them fail inside the
    # statistics helper.
    missing = [c for c in cols if c not in df.columns]
    if missing:
        return f"Columns not found: {', '.join(missing)}"

    try:
        from openstat.stats.survey import weighted_summary
        return weighted_summary(df, cols, session._svy_weight_var)
    except Exception as e:
        # Same error reporting as the svy: ols / svy: logit helpers.
        return friendly_error(e, "svy: summarize")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _svy_ols(session: Session, df, args: str) -> str:
    """Weighted OLS regression.

    Parses *args* as a formula, fits it with ``fit_weighted_ols`` using the
    session's survey weight, strata and PSU variables, records the fit on
    the session (last-model fields and ``session.results``) and returns the
    summary table followed by any warnings. Formula and fitting problems are
    returned as error messages.
    """
    try:
        response, predictors = parse_formula(args)
    except ParseError as err:
        return f"Formula error: {err}"

    try:
        from openstat.stats.survey import fit_weighted_ols

        fit, raw_model = fit_weighted_ols(
            df, response, predictors, session._svy_weight_var,
            session._svy_strata_var, session._svy_psu_var,
        )

        session._last_model = raw_model
        session._last_model_vars = (response, predictors)
        session._last_fit_result = fit
        session._last_fit_kwargs = {"survey": True}

        if hasattr(fit, "to_markdown"):
            table_md = fit.to_markdown()
        else:
            table_md = ""
        details = {
            "n_obs": fit.n_obs,
            "params": dict(fit.params),
            "r_squared": fit.r_squared,
        }
        session.results.append(
            ModelResult(name="Svy: OLS", formula=fit.formula, table=table_md, details=details)
        )

        text = fit.summary_table()
        if fit.warnings:
            text += "\n" + "\n".join(fit.warnings)
        return text
    except Exception as err:
        return friendly_error(err, "svy: ols")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _svy_logit(session: Session, df, args: str) -> str:
    """Weighted logistic regression.

    Parses *args* as a formula, fits it with ``fit_weighted_logit`` using the
    session's survey weight variable, records the fit on the session
    (last-model fields and ``session.results``) and returns the summary
    table followed by any warnings. Formula and fitting problems are
    returned as error messages.
    """
    try:
        response, predictors = parse_formula(args)
    except ParseError as err:
        return f"Formula error: {err}"

    try:
        from openstat.stats.survey import fit_weighted_logit

        fit, raw_model = fit_weighted_logit(
            df, response, predictors, session._svy_weight_var
        )

        session._last_model = raw_model
        session._last_model_vars = (response, predictors)
        session._last_fit_result = fit
        session._last_fit_kwargs = {"survey": True}

        if hasattr(fit, "to_markdown"):
            table_md = fit.to_markdown()
        else:
            table_md = ""
        details = {
            "n_obs": fit.n_obs,
            "params": dict(fit.params),
        }
        session.results.append(
            ModelResult(name="Svy: Logit", formula=fit.formula, table=table_md, details=details)
        )

        text = fit.summary_table()
        if fit.warnings:
            text += "\n" + "\n".join(fit.warnings)
        return text
    except Exception as err:
        return friendly_error(err, "svy: logit")
|