stata-cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stata_cli/__init__.py +1 -0
- stata_cli/__main__.py +5 -0
- stata_cli/daemon.py +367 -0
- stata_cli/engine.py +461 -0
- stata_cli/graph_artifacts.py +95 -0
- stata_cli/main.py +343 -0
- stata_cli/output_filter.py +239 -0
- stata_cli/smcl_parser.py +93 -0
- stata_cli/utils.py +85 -0
- stata_cli-0.2.0.dist-info/METADATA +338 -0
- stata_cli-0.2.0.dist-info/RECORD +14 -0
- stata_cli-0.2.0.dist-info/WHEEL +5 -0
- stata_cli-0.2.0.dist-info/entry_points.txt +2 -0
- stata_cli-0.2.0.dist-info/top_level.txt +1 -0
stata_cli/main.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""Stata CLI - run Stata commands from the terminal."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .engine import StataEngine, Result
|
|
11
|
+
from .output_filter import apply_compact_filter, check_token_limit, clean_log_wrapper
|
|
12
|
+
from .utils import detect_stata_path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Process exit codes used by the commands in this module.
(
    EXIT_OK,            # 0: success
    EXIT_STATA_ERROR,   # 1: a result was unsuccessful / no help found
    EXIT_USAGE_ERROR,   # 2: bad invocation (e.g. empty code, daemon not running)
    EXIT_INIT_FAILURE,  # 3: Stata not found, engine or daemon failed to start
) = range(4)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _exit(code: int) -> None:
    """Terminate the process immediately with exit status *code*.

    ``os._exit`` is used so that atexit hooks are bypassed — PyStata registers
    one that resets the exit code to 0.  Because ``os._exit`` does not flush
    Python-level buffers, stdout and stderr are flushed by hand first.

    Args:
        code: Process exit status to report.
    """
    for stream in (sys.stdout, sys.stderr):
        if stream is None:
            continue
        try:
            stream.flush()
        except (OSError, ValueError):
            # A closed or broken pipe (e.g. ``stata-cli ... | head``) must not
            # stop us from exiting with the requested status.
            pass
    os._exit(code)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@click.group()
@click.version_option(__version__, prog_name="stata-cli")
@click.option("--stata-path", envvar="STATA_PATH", default=None, help="Path to Stata installation directory.")
@click.option("--edition", type=click.Choice(["mp", "se", "be"], case_sensitive=False), default="mp", help="Stata edition.")
@click.option("--compact", is_flag=True, default=False, help="Apply compact output filter (strip verbose noise).")
@click.option("--json", "use_json", is_flag=True, default=False, help="Output results as JSON (for agent consumption).")
@click.option("--timeout", type=float, default=600.0, help="Execution timeout in seconds.")
@click.option("--max-tokens", type=int, default=0, help="Max output tokens (0=unlimited). Saves full output to file when exceeded.")
@click.option("--no-daemon", is_flag=True, default=False, help="Force direct execution, skip daemon.")
@click.option("--graphs-dir", envvar="STATA_CLI_GRAPHS_DIR", default=None, help="Graph export directory.")
@click.pass_context
def cli(ctx, stata_path, edition, compact, use_json, timeout, max_tokens, no_daemon, graphs_dir):
    """Command-line interface for Stata."""
    # Stash every global option on the click context so subcommands can read
    # them back through ``ctx.obj[...]``.
    ctx.ensure_object(dict)
    ctx.obj.update(
        stata_path=stata_path,
        edition=edition,
        compact=compact,
        json=use_json,
        timeout=timeout,
        max_tokens=max_tokens,
        no_daemon=no_daemon,
        graphs_dir=graphs_dir,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_engine(ctx) -> StataEngine:
    """Create a ``StataEngine`` for direct execution, or exit the process.

    The installation path comes from the ``stata_path`` context option, with
    ``detect_stata_path()`` as the fallback.  If no path is available, or the
    engine constructor raises, an error is printed to stderr and the process
    exits with ``EXIT_INIT_FAILURE``.
    """
    install_dir = ctx.obj["stata_path"] or detect_stata_path()
    if not install_dir:
        for message in (
            "Error: Stata installation not found.",
            "Set --stata-path or the STATA_PATH environment variable.",
        ):
            click.echo(message, err=True)
        _exit(EXIT_INIT_FAILURE)

    try:
        return StataEngine(install_dir, ctx.obj["edition"], graphs_dir=ctx.obj.get("graphs_dir"))
    except Exception as exc:
        click.echo(f"Error initializing Stata: {exc}", err=True)
        _exit(EXIT_INIT_FAILURE)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _try_daemon(ctx, cmd_type: str, payload: dict) -> Result | None:
    """Try to route a request through the daemon.

    Args:
        ctx: Click context; ``ctx.obj["no_daemon"]`` disables the daemon path.
        cmd_type: Daemon command name passed to ``DaemonClient.send``.
        payload: Command arguments passed to ``DaemonClient.send``.

    Returns:
        A ``Result`` built from the daemon response, or ``None`` when the
        daemon is disabled, not running, unreachable, or any error occurs
        (the caller then falls back to direct execution).
    """
    if ctx.obj.get("no_daemon"):
        return None
    try:
        from .daemon import DaemonClient

        client = DaemonClient()
        if not client.is_running() or not client.connect():
            return None
        try:
            resp = client.send(cmd_type, payload)
        finally:
            # Release the connection even when send() raises.
            client.close()
        return Result(
            # Fall back to the "status" field when "success" is absent.
            success=resp.get("success", resp.get("status") == "success"),
            output=resp.get("output", ""),
            error=resp.get("error", ""),
            execution_time=resp.get("execution_time", 0.0),
            return_code=resp.get("return_code", 0),
            extra=resp.get("extra", {}),
        )
    except Exception:
        # Deliberate best-effort: any daemon problem means "not available".
        return None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _print_result(result, compact: bool, use_json: bool = False, max_tokens: int = 0, filter_echo: bool = False) -> None:
    """Post-process a result's output and print it; exit non-zero on failure.

    Non-empty output is passed through ``clean_log_wrapper``, then the compact
    filter (when *compact*), then the token limit (when *max_tokens* > 0), and
    written back to ``result.output``.  With *use_json* the result is printed
    via ``result.to_json()``; otherwise the output text, one ``[graph]`` line
    per graph in ``result.extra``, and — for an unsuccessful result — the error
    text (on stderr) are printed.  An unsuccessful result ends the process
    with ``EXIT_STATA_ERROR``.
    """
    text = result.output
    if text:
        text = clean_log_wrapper(text)
        if compact:
            text = apply_compact_filter(text, filter_command_echo=filter_echo)
        if max_tokens > 0:
            text = check_token_limit(text, max_tokens)[0]
        result.output = text

    extra = result.extra
    graphs = extra.get("graphs", []) if extra else []

    if use_json:
        click.echo(result.to_json())
        if not result.success:
            _exit(EXIT_STATA_ERROR)
        return

    if text and text.strip():
        click.echo(text)
    for graph in graphs or ():
        click.echo(f"[graph] {graph.get('name', 'graph')}: {graph.get('path', '')}")
    if result.success:
        return
    if result.error:
        click.echo(result.error, err=True)
    _exit(EXIT_STATA_ERROR)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# ── Commands ─────────────────────────────────────────────────────────────
|
|
122
|
+
|
|
123
|
+
@cli.command()
@click.argument("code")
@click.pass_context
def run(ctx, code):
    """Execute a Stata code string.

    Use '-' to read code from stdin (for piping).

    \b
    Examples:
      stata-cli run "sysuse auto, clear"
      stata-cli run "display 1+1"
      echo "summarize price" | stata-cli run -
    """
    # A lone "-" means: take the program text from standard input.
    if code == "-":
        code = sys.stdin.read()
    if not code.strip():
        click.echo("Error: empty code.", err=True)
        _exit(EXIT_USAGE_ERROR)

    opts = ctx.obj
    timeout = opts["timeout"]
    # Prefer the daemon; fall back to an in-process engine.
    result = _try_daemon(ctx, "execute", {"code": code, "timeout": timeout})
    if result is None:
        result = _get_engine(ctx).run(code, timeout=timeout)
    _print_result(result, opts["compact"], use_json=opts["json"], max_tokens=opts["max_tokens"])
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@cli.command("do")
@click.argument("path", type=click.Path(exists=True))
@click.pass_context
def do_file(ctx, path):
    """Execute a Stata .do file.

    \b
    Examples:
      stata-cli do analysis.do
      stata-cli --compact do long_script.do
    """
    opts = ctx.obj
    timeout = opts["timeout"]
    # The daemon is sent an absolute path; the direct engine gets the path as given.
    request = {"path": os.path.abspath(path), "timeout": timeout}
    result = _try_daemon(ctx, "execute_file", request)
    if result is None:
        result = _get_engine(ctx).run_file(path, timeout=timeout)
    _print_result(
        result,
        opts["compact"],
        use_json=opts["json"],
        max_tokens=opts["max_tokens"],
        filter_echo=True,
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@cli.command()
@click.pass_context
def detect(ctx):
    """Print the auto-detected Stata installation path."""
    # An explicit --stata-path / STATA_PATH value wins over auto-detection.
    found = ctx.obj["stata_path"] or detect_stata_path()
    if found:
        click.echo(found)
        return
    click.echo("Stata installation not found.", err=True)
    _exit(EXIT_INIT_FAILURE)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@cli.command("data")
@click.option("--if", "if_condition", default=None, help="Stata if condition for filtering.")
@click.option("--rows", type=int, default=10000, help="Maximum rows to return.")
@click.pass_context
def data_cmd(ctx, if_condition, rows):
    """View the current dataset as JSON.

    \b
    Examples:
      stata-cli data
      stata-cli data --if "price>5000" --rows 50
    """
    # Try the daemon first unless --no-daemon was given.
    if not ctx.obj.get("no_daemon"):
        try:
            from .daemon import DaemonClient

            client = DaemonClient()
            if client.is_running() and client.connect():
                try:
                    resp = client.send("get_data", {"if_condition": if_condition, "max_rows": rows})
                finally:
                    # Release the connection even when send() raises.
                    client.close()
                click.echo(json.dumps(resp, ensure_ascii=False, indent=2))
                return
        except Exception:
            # Deliberate best-effort: any daemon problem falls through to
            # direct execution below.
            pass

    engine = _get_engine(ctx)
    resp = engine.get_data(if_condition=if_condition, max_rows=rows)
    click.echo(json.dumps(resp, ensure_ascii=False, indent=2))
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@cli.command("help")
@click.argument("topic")
@click.pass_context
def help_cmd(ctx, topic):
    """Display Stata help for a topic.

    \b
    Examples:
      stata-cli help regress
      stata-cli help summarize
    """
    result = _try_daemon(ctx, "help", {"topic": topic})
    if result is None:
        engine = _get_engine(ctx)
        result = engine.help(topic)

    if ctx.obj["json"]:
        click.echo(result.to_json())
        # Report failure through the exit status too, as _print_result does
        # in its JSON branch.
        if not result.success:
            _exit(EXIT_STATA_ERROR)
    elif result.output and result.output.strip():
        click.echo(result.output)
    else:
        click.echo(f"No help found for: {topic}", err=True)
        _exit(EXIT_STATA_ERROR)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@cli.command("stop")
@click.pass_context
def stop_cmd(ctx):
    """Interrupt a running Stata command (daemon mode)."""
    try:
        from .daemon import DaemonClient

        client = DaemonClient()
        if client.is_running() and client.connect():
            try:
                resp = client.send("stop")
            finally:
                # Release the connection even when send() raises.
                client.close()
            click.echo(f"Stop signal: {resp.get('status', 'unknown')}")
            return
    except Exception:
        # Deliberate best-effort: any failure is reported as "not running".
        pass
    click.echo("Daemon not running.", err=True)
    _exit(EXIT_USAGE_ERROR)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# ── Daemon subcommands ───────────────────────────────────────────────────
|
|
252
|
+
|
|
253
|
+
# Click sub-group; the daemon start/stop/status/restart commands are registered on it.
@cli.group()
def daemon():
    """Manage the Stata daemon process."""
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@daemon.command("start")
@click.option("--idle-timeout", type=int, default=3600, help="Auto-shutdown after N seconds idle.")
@click.pass_context
def daemon_start(ctx, idle_timeout):
    """Start the Stata daemon (keeps PyStata alive for fast execution)."""
    opts = ctx.obj
    install_dir = opts["stata_path"] or detect_stata_path()
    if not install_dir:
        click.echo("Error: Stata installation not found.", err=True)
        _exit(EXIT_INIT_FAILURE)

    from .daemon import start_daemon, DaemonClient

    # Nothing to do when a daemon is already up.
    if DaemonClient().is_running():
        click.echo("Daemon already running.")
        return

    click.echo("Starting daemon...")
    started = start_daemon(
        install_dir,
        opts["edition"],
        graphs_dir=opts.get("graphs_dir"),
        idle_timeout=idle_timeout,
    )
    if started:
        click.echo("Daemon started.")
        return
    click.echo("Failed to start daemon.", err=True)
    _exit(EXIT_INIT_FAILURE)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@daemon.command("stop")
def daemon_stop():
    """Stop the Stata daemon."""
    from .daemon import stop_daemon, DaemonClient

    if DaemonClient().is_running():
        click.echo("Stopping daemon...")
        stop_daemon()
        click.echo("Daemon stopped.")
    else:
        click.echo("Daemon not running.")
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
@daemon.command("status")
def daemon_status_cmd():
    """Show daemon status."""
    from .daemon import daemon_status

    info = daemon_status()
    if info:
        # Missing fields fall back to 0 / "?" rather than failing.
        uptime, idle = info.get("uptime", 0), info.get("idle", 0)
        click.echo(f"Daemon running (PID {info.get('pid', '?')})")
        click.echo(f" Stata: {info.get('stata_path', '?')} ({info.get('edition', '?')})")
        click.echo(f" Uptime: {int(uptime)}s")
        click.echo(f" Idle: {int(idle)}s")
    else:
        click.echo("Daemon not running.")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
@daemon.command("restart")
@click.option("--idle-timeout", type=int, default=3600, help="Auto-shutdown after N seconds idle.")
@click.pass_context
def daemon_restart(ctx, idle_timeout):
    """Restart the Stata daemon."""
    # Resolve the installation BEFORE stopping anything: if Stata cannot be
    # located the restart cannot succeed, and the running daemon must be left
    # alone rather than killed with no replacement.
    stata_path = ctx.obj["stata_path"] or detect_stata_path()
    if not stata_path:
        click.echo("Error: Stata installation not found.", err=True)
        _exit(EXIT_INIT_FAILURE)

    from .daemon import stop_daemon, start_daemon, DaemonClient

    client = DaemonClient()
    if client.is_running():
        click.echo("Stopping daemon...")
        stop_daemon()

    click.echo("Starting daemon...")
    ok = start_daemon(stata_path, ctx.obj["edition"], graphs_dir=ctx.obj.get("graphs_dir"), idle_timeout=idle_timeout)
    if ok:
        click.echo("Daemon restarted.")
    else:
        click.echo("Failed to restart daemon.", err=True)
        _exit(EXIT_INIT_FAILURE)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# Allow running as `python -m stata_cli`
|
|
338
|
+
def main():
    """Invoke the top-level click command group."""
    cli()


if __name__ == "__main__":
    main()
|
|
stata_cli/output_filter.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Output filtering for Stata CLI.
|
|
2
|
+
|
|
3
|
+
Provides compact-mode filtering (strips verbose/redundant output) and
|
|
4
|
+
cleanup of the log-file wrapper lines injected by the engine.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import time
|
|
10
|
+
import tempfile
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# A line made only of 40+ dashes: treated as the end of the startup banner.
_BANNER_END_RE = re.compile(r"^-{40,}$")
# Lines matching any of these (after stripping) are dropped as log scaffolding.
_LOG_SCAFFOLD_PATTERNS = [
    re.compile(r"^\s*\.?\s*capture\s+log\s+close", re.IGNORECASE),
    re.compile(r"^\s*\.?\s*log\s+using\s+", re.IGNORECASE),
    re.compile(r"^\s*(name|log|log type|opened on|closed on):", re.IGNORECASE),
    # Continuation of a long log-using path that wraps to the next line
    re.compile(r"^>\s.*\.log"),
]


def clean_log_wrapper(output: str) -> str:
    """Remove the Stata banner and ``log using`` / ``log close`` scaffolding.

    Everything up to and including the first line of 40+ dashes is discarded
    (nothing is discarded if there is no such line), lines matching
    ``_LOG_SCAFFOLD_PATTERNS`` are dropped, and leading/trailing blank lines
    are trimmed.  Falsy input is returned unchanged.
    """
    if not output:
        return output

    rows = output.split("\n")

    # Index of the first banner separator, or -1 so the slice starts at 0.
    banner_end = next(
        (idx for idx, row in enumerate(rows) if _BANNER_END_RE.match(row.strip())),
        -1,
    )

    body = [
        row
        for row in rows[banner_end + 1:]
        if not any(pattern.match(row.strip()) for pattern in _LOG_SCAFFOLD_PATTERNS)
    ]

    # Trim blank lines from both ends by narrowing a [first, last) window.
    first, last = 0, len(body)
    while first < last and not body[first].strip():
        first += 1
    while last > first and not body[last - 1].strip():
        last -= 1
    return "\n".join(body[first:last])
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def apply_compact_filter(output: str, filter_command_echo: bool = False) -> str:
|
|
51
|
+
"""Strip verbose/redundant output to reduce noise.
|
|
52
|
+
|
|
53
|
+
Always filters:
|
|
54
|
+
- Program definition blocks
|
|
55
|
+
- Mata blocks
|
|
56
|
+
- Loop code echoes (keeps actual output)
|
|
57
|
+
- SMCL formatting tags
|
|
58
|
+
- Verbose messages like "(N real changes made)"
|
|
59
|
+
|
|
60
|
+
When *filter_command_echo* is True (e.g. for ``do`` files):
|
|
61
|
+
- Command echo lines (``". "`` prefix)
|
|
62
|
+
- Line continuations (``"> "``)
|
|
63
|
+
"""
|
|
64
|
+
if not output:
|
|
65
|
+
return output
|
|
66
|
+
|
|
67
|
+
output = output.replace("\r\n", "\n").replace("\r", "\n")
|
|
68
|
+
lines = output.split("\n")
|
|
69
|
+
filtered: list[str] = []
|
|
70
|
+
|
|
71
|
+
command_echo_pat = re.compile(r"^\.\s*$|^\.\s+\S")
|
|
72
|
+
numbered_line_pat = re.compile(r"^\s*\d+\.\s")
|
|
73
|
+
continuation_pat = re.compile(r"^>\s")
|
|
74
|
+
|
|
75
|
+
program_drop_pat = re.compile(
|
|
76
|
+
r"^\s*\.?\s*(capture\s+program\s+drop|cap\s+program\s+drop|cap\s+prog\s+drop)\s+\w+",
|
|
77
|
+
re.IGNORECASE,
|
|
78
|
+
)
|
|
79
|
+
program_define_pat = re.compile(
|
|
80
|
+
r"^\s*\.?\s*program\s+(define\s+)?(?!version|dir|drop|list|describe)\w+",
|
|
81
|
+
re.IGNORECASE,
|
|
82
|
+
)
|
|
83
|
+
mata_start_pat = re.compile(
|
|
84
|
+
r"^\s*(\d+\.)?\s*\.?\s*mata\s*:?\s*$|^-+\s*mata\s*\(",
|
|
85
|
+
re.IGNORECASE,
|
|
86
|
+
)
|
|
87
|
+
end_pat = re.compile(r"^\s*(\d+\.)?\s*[.:]*\s*end\s*$", re.IGNORECASE)
|
|
88
|
+
mata_sep_pat = re.compile(r"^-{20,}$")
|
|
89
|
+
|
|
90
|
+
loop_start_pat = re.compile(
|
|
91
|
+
r"^(\s*\d+\.)?\s*\.?\s*(foreach|forvalues|while)\s+.*\{\s*$",
|
|
92
|
+
re.IGNORECASE,
|
|
93
|
+
)
|
|
94
|
+
loop_end_pat = re.compile(r"^\s*\d+\.\s*\}\s*$")
|
|
95
|
+
|
|
96
|
+
real_changes_pat = re.compile(r"^\s*\([\d,]+\s+real\s+changes?\s+made\)\s*$", re.IGNORECASE)
|
|
97
|
+
missing_values_pat = re.compile(r"^\s*\([\d,]+\s+missing\s+values?\s+generated\)\s*$", re.IGNORECASE)
|
|
98
|
+
smcl_pat = re.compile(
|
|
99
|
+
r"\{(txt|res|err|inp|com|bf|it|sf|hline|c\s+\||\-+|break|col\s+\d+|right|center|ul|/ul)\}"
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
in_program = False
|
|
103
|
+
in_mata = False
|
|
104
|
+
in_loop = False
|
|
105
|
+
program_end_depth = 0
|
|
106
|
+
loop_brace_depth = 0
|
|
107
|
+
|
|
108
|
+
i = 0
|
|
109
|
+
while i < len(lines):
|
|
110
|
+
line = lines[i]
|
|
111
|
+
|
|
112
|
+
if in_program:
|
|
113
|
+
if mata_start_pat.match(line):
|
|
114
|
+
program_end_depth += 1
|
|
115
|
+
if end_pat.match(line):
|
|
116
|
+
if program_end_depth > 0:
|
|
117
|
+
program_end_depth -= 1
|
|
118
|
+
else:
|
|
119
|
+
in_program = False
|
|
120
|
+
i += 1
|
|
121
|
+
continue
|
|
122
|
+
|
|
123
|
+
if in_mata:
|
|
124
|
+
if end_pat.match(line):
|
|
125
|
+
in_mata = False
|
|
126
|
+
if i + 1 < len(lines) and mata_sep_pat.match(lines[i + 1]):
|
|
127
|
+
i += 1
|
|
128
|
+
i += 1
|
|
129
|
+
continue
|
|
130
|
+
|
|
131
|
+
if in_loop:
|
|
132
|
+
if loop_start_pat.match(line):
|
|
133
|
+
loop_brace_depth += 1
|
|
134
|
+
i += 1
|
|
135
|
+
continue
|
|
136
|
+
if loop_end_pat.match(line):
|
|
137
|
+
if loop_brace_depth > 0:
|
|
138
|
+
loop_brace_depth -= 1
|
|
139
|
+
else:
|
|
140
|
+
in_loop = False
|
|
141
|
+
i += 1
|
|
142
|
+
continue
|
|
143
|
+
if command_echo_pat.match(line) or numbered_line_pat.match(line) or continuation_pat.match(line):
|
|
144
|
+
i += 1
|
|
145
|
+
continue
|
|
146
|
+
if real_changes_pat.match(line) or missing_values_pat.match(line):
|
|
147
|
+
i += 1
|
|
148
|
+
continue
|
|
149
|
+
line = smcl_pat.sub("", line)
|
|
150
|
+
if line.strip():
|
|
151
|
+
filtered.append(line)
|
|
152
|
+
i += 1
|
|
153
|
+
continue
|
|
154
|
+
|
|
155
|
+
if loop_start_pat.match(line):
|
|
156
|
+
in_loop = True
|
|
157
|
+
loop_brace_depth = 0
|
|
158
|
+
i += 1
|
|
159
|
+
continue
|
|
160
|
+
|
|
161
|
+
if program_drop_pat.match(line):
|
|
162
|
+
i += 1
|
|
163
|
+
continue
|
|
164
|
+
if program_define_pat.match(line):
|
|
165
|
+
in_program = True
|
|
166
|
+
program_end_depth = 0
|
|
167
|
+
i += 1
|
|
168
|
+
continue
|
|
169
|
+
if mata_start_pat.match(line):
|
|
170
|
+
in_mata = True
|
|
171
|
+
i += 1
|
|
172
|
+
continue
|
|
173
|
+
|
|
174
|
+
if real_changes_pat.match(line) or missing_values_pat.match(line):
|
|
175
|
+
i += 1
|
|
176
|
+
continue
|
|
177
|
+
|
|
178
|
+
if filter_command_echo:
|
|
179
|
+
if command_echo_pat.match(line) or numbered_line_pat.match(line) or continuation_pat.match(line):
|
|
180
|
+
i += 1
|
|
181
|
+
continue
|
|
182
|
+
|
|
183
|
+
line = smcl_pat.sub("", line)
|
|
184
|
+
filtered.append(line)
|
|
185
|
+
i += 1
|
|
186
|
+
|
|
187
|
+
# Collapse consecutive blank lines
|
|
188
|
+
result: list[str] = []
|
|
189
|
+
prev_blank = False
|
|
190
|
+
for line in filtered:
|
|
191
|
+
is_blank = not line.strip()
|
|
192
|
+
if is_blank:
|
|
193
|
+
if not prev_blank:
|
|
194
|
+
result.append(line)
|
|
195
|
+
prev_blank = True
|
|
196
|
+
else:
|
|
197
|
+
result.append(line)
|
|
198
|
+
prev_blank = False
|
|
199
|
+
|
|
200
|
+
while result and not result[-1].strip():
|
|
201
|
+
result.pop()
|
|
202
|
+
|
|
203
|
+
return "\n".join(result)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def check_token_limit(output: str, max_tokens: int) -> tuple[str, bool]:
    """Truncate output exceeding *max_tokens* (~4 chars/token).

    Args:
        output: Text to check.
        max_tokens: Token budget; values <= 0 disable the check.

    Returns:
        ``(output, was_truncated)``.  When the estimate exceeds the budget the
        full output is saved to a uniquely named file under
        ``<tempdir>/stata_cli_logs`` and a summary (file path plus a preview
        of the first 1000 characters) is returned.  If the file cannot be
        created or written, the output is cut to ``max_tokens * 4`` characters
        with a truncation marker instead.
    """
    if max_tokens <= 0 or not output:
        return output, False

    estimated_tokens = len(output) / 4
    if estimated_tokens <= max_tokens:
        return output, False

    try:
        # Directory creation is inside the try block so that an unusable temp
        # location degrades to plain truncation instead of raising.
        logs_dir = os.path.join(tempfile.gettempdir(), "stata_cli_logs")
        os.makedirs(logs_dir, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        # mkstemp adds a random component: the timestamp has one-second
        # resolution, so without it two calls in the same second would
        # overwrite each other's log.
        fd, log_path = tempfile.mkstemp(
            prefix=f"stata_output_{timestamp}_", suffix=".log", dir=logs_dir
        )
        with open(fd, "w", encoding="utf-8") as fh:
            fh.write(output)
    except OSError:
        max_chars = max_tokens * 4
        return output[:max_chars] + f"\n\n... [Output truncated at {max_tokens} tokens]", True

    preview = output[:1000]
    if len(output) > 1000:
        preview += "\n... [truncated]"
    msg = (
        f"Output exceeded token limit ({int(estimated_tokens)} tokens > {max_tokens} max).\n"
        f"Full output saved to: {log_path}\n\n"
        f"--- Preview ---\n{preview}"
    )
    return msg, True
|
stata_cli/smcl_parser.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Simplified SMCL-to-plain-text converter for Stata help files."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# Named codes accepted by the SMCL ``{c CODE}`` directive, mapped to the
# character they are rendered as.
_CHAR_CODES = {
    "S|": "$", "'g": "`", "-(": "{", ")-": "}",
    "-": "─", "|": "│", "+": "┼",
    "TT": "┬", "BT": "┴", "LT": "├", "RT": "┤",
    "TLC": "┌", "TRC": "┐", "BRC": "┘", "BLC": "└",
    "a'": "á", "e'": "é", "i'": "í", "o'": "ó", "u'": "ú",
    "n~": "ñ", "ss": "ß", "c,": "ç",
}

# Private-use placeholders standing in for literal braces produced by
# ``{c -(}`` / ``{c )-}`` while the remaining tag regexes run, so those braces
# are never mistaken for tag delimiters.  Swapped back at the very end.
_LBRACE_MARK = "\ue000"
_RBRACE_MARK = "\ue001"
_BRACE_HIDE = str.maketrans({"{": _LBRACE_MARK, "}": _RBRACE_MARK})
_BRACE_SHOW = str.maketrans({_LBRACE_MARK: "{", _RBRACE_MARK: "}"})


def _resolve_char(code: str) -> str:
    """Translate the argument of a ``{c CODE}`` directive to a character.

    Lookup order: named code from ``_CHAR_CODES``, then ``0x``-prefixed hex
    code point, then decimal code point in ``1..0x10FFFF``.  Anything that
    cannot be resolved is returned unchanged (stripped).
    """
    code = code.strip()
    if code in _CHAR_CODES:
        return _CHAR_CODES[code]
    if code.startswith(("0x", "0X")):
        try:
            return chr(int(code[2:], 16))
        except (ValueError, OverflowError):
            return code
    try:
        n = int(code)
        if 1 <= n <= 0x10FFFF:
            return chr(n)
    except (ValueError, OverflowError):
        pass
    return code


# Tags that are simply stripped (content kept)
_STRIP_TAGS = re.compile(
    r"\{/?(?:txt|res|err|inp|com|bf|it|sf|ul|smcl|s6hlp|"
    r"p_end|pstd|phang|pmore|pin|p2colset[^}]*|p2col[^}]*|"
    r"marker[^}]*|dlgtab[^}]*|synoptset[^}]*|syntab[^}]*|"
    r"synopt[^}]*|synopthdr[^}]*|"
    r"col\s+\d+|right|center|break|reset|"
    r"bind\s+[^}]*)\}"
)

# {hline} or {hline N} -> dashes
_HLINE_RE = re.compile(r"\{hline(?:\s+(\d+))?\}")

# {help topic}, {help topic:text}, {manhelp topic section}
_HELP_RE = re.compile(r"\{(?:help|manhelp)\s+([^}:]+?)(?::([^}]+))?\}")

# {browse "url":text} or {browse "url"}
_BROWSE_RE = re.compile(r'\{browse\s+"([^"]*)"(?::([^}]+))?\}')

# {cmd:text}, {opt:text}, {hi:text}, {title:text}, {it:text}, {bf:text}
_STYLED_RE = re.compile(r"\{(?:cmd|opt|hi|title|input|stata)\s*:\s*([^}]*)\}")
_STYLED2_RE = re.compile(r"\{(?:it|bf|ul)\s*:\s*([^}]*)\}")

# {c CODE}
_CHAR_RE = re.compile(r"\{c\s+([^}]+)\}")

# {space N}
_SPACE_RE = re.compile(r"\{space\s+(\d+)\}")

# Catch-all: any remaining {tag ...} or {tag:...}
_CATCHALL_RE = re.compile(r"\{[a-zA-Z_][^}]*\}")

# SMCL header line
_SMCL_HEADER_RE = re.compile(r"^\{smcl\}\s*$", re.MULTILINE)

# Star-bang lines in starbang output
_STARBANG_RE = re.compile(r"^\*!\s?", re.MULTILINE)

# INCLUDE directives (Stata-internal cross-references)
_INCLUDE_RE = re.compile(r"^INCLUDE\s+help\s+\S+.*$", re.MULTILINE)


def smcl_to_text(raw: str) -> str:
    """Convert SMCL markup to readable plain text.

    Removes the ``{smcl}`` header, ``*!`` prefixes and ``INCLUDE help`` lines,
    expands ``{hline}``, ``{space N}`` and ``{c CODE}``, keeps the visible
    text of help/browse/styled directives, and deletes every other tag.  Runs
    of three or more newlines collapse to one blank line; the result is
    stripped and ends with exactly one newline.

    Literal braces written as ``{c -(}`` / ``{c )-}`` survive the conversion:
    they are held as placeholders until all tag stripping is done, so text
    such as ``{c -(}abc{c )-}`` yields ``{abc}`` rather than being removed by
    the catch-all tag pattern.
    """
    text = _SMCL_HEADER_RE.sub("", raw)
    text = _STARBANG_RE.sub("", text)
    text = _INCLUDE_RE.sub("", text)

    text = _HLINE_RE.sub(lambda m: "-" * int(m.group(1) or 78), text)
    text = _HELP_RE.sub(lambda m: m.group(2) or m.group(1), text)
    text = _BROWSE_RE.sub(lambda m: m.group(2) or m.group(1), text)
    text = _STYLED_RE.sub(r"\1", text)
    text = _STYLED2_RE.sub(r"\1", text)
    # Hide any brace a {c ...} directive resolves to until the tags are gone.
    text = _CHAR_RE.sub(lambda m: _resolve_char(m.group(1)).translate(_BRACE_HIDE), text)
    text = _SPACE_RE.sub(lambda m: " " * int(m.group(1)), text)
    text = _STRIP_TAGS.sub("", text)
    text = _CATCHALL_RE.sub("", text)
    text = text.translate(_BRACE_SHOW)

    # Collapse runs of >2 blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip() + "\n"
|