mcp-stata 1.20.0__cp311-abi3-macosx_11_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcp-stata might be problematic.

@@ -0,0 +1,3699 @@
1
+ from __future__ import annotations
2
+ import asyncio
3
+ import io
4
+ import inspect
5
+ import json
6
+ import logging
7
+ import os
8
+ import platform
9
+ import re
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import threading
14
+ import time
15
+ import uuid
16
+ from contextlib import contextmanager, redirect_stdout, redirect_stderr
17
+ from importlib.metadata import PackageNotFoundError, version
18
+ from io import StringIO
19
+ from typing import Any, Awaitable, Callable, Dict, Generator, List, Optional, Tuple
20
+
21
+ import anyio
22
+ from anyio import get_cancelled_exc_class
23
+
24
+ from .discovery import find_stata_candidates
25
+ from .config import MAX_LIMIT
26
+ from .models import (
27
+ CommandResponse,
28
+ ErrorEnvelope,
29
+ GraphExport,
30
+ GraphExportResponse,
31
+ GraphInfo,
32
+ GraphListResponse,
33
+ VariableInfo,
34
+ VariablesResponse,
35
+ )
36
+ from .smcl.smcl2html import smcl_to_markdown
37
+ from .streaming_io import FileTeeIO, TailBuffer
38
+ from .graph_detector import StreamingGraphCache
39
+ from .native_ops import fast_scan_log, compute_filter_indices
40
+
41
+ logger = logging.getLogger("mcp_stata")
42
+
43
+ _POLARS_AVAILABLE: Optional[bool] = None
44
+
45
+ def _check_polars_available() -> bool:
46
+ """
47
+ Check if Polars can be safely imported.
48
+ Must detect problematic platforms BEFORE attempting import,
49
+ since the crash is a fatal signal, not a catchable exception.
50
+ """
51
+ if sys.platform == "win32" and platform.machine().lower() in ("arm64", "aarch64"):
52
+ return False
53
+
54
+ try:
55
+ import polars # noqa: F401
56
+ return True
57
+ except ImportError:
58
+ return False
59
+
60
+
61
+ def _get_polars_available() -> bool:
62
+ global _POLARS_AVAILABLE
63
+ if _POLARS_AVAILABLE is None:
64
+ _POLARS_AVAILABLE = _check_polars_available()
65
+ return _POLARS_AVAILABLE
66
+
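A minimal sketch of how a caller might use this gate (the loader function and CSV fallback are illustrative, not part of this module):

def load_rows(path: str):
    # The cached platform check must run before the import: on Windows/ARM64,
    # importing polars can abort the process rather than raise.
    if _get_polars_available():
        import polars as pl
        return pl.read_csv(path).to_dicts()
    import csv
    with open(path, newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))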
67
+ # ============================================================================
68
+ # MODULE-LEVEL DISCOVERY CACHE
69
+ # ============================================================================
70
+ # This cache ensures Stata discovery runs exactly once per process lifetime
71
+ _discovery_lock = threading.Lock()
72
+ _discovery_result: Optional[Tuple[str, str]] = None # (path, edition)
73
+ _discovery_candidates: Optional[List[Tuple[str, str]]] = None
74
+ _discovery_attempted = False
75
+ _discovery_error: Optional[Exception] = None
76
+
77
+
78
+ def _get_discovery_candidates() -> List[Tuple[str, str]]:
79
+ """
80
+ Get ordered discovery candidates, running discovery only once.
81
+
82
+ Returns:
83
+ List of (stata_executable_path, edition) ordered by preference.
84
+
85
+ Raises:
86
+ RuntimeError: If Stata discovery fails
87
+ """
88
+ global _discovery_result, _discovery_candidates, _discovery_attempted, _discovery_error
89
+
90
+ with _discovery_lock:
91
+ # If we've already successfully discovered Stata, return cached result
92
+ if _discovery_result is not None:
93
+ return _discovery_candidates or [_discovery_result]
94
+
95
+ if _discovery_candidates is not None:
96
+ return _discovery_candidates
97
+
98
+ # If we've already attempted and failed, re-raise the cached error
99
+ if _discovery_attempted and _discovery_error is not None:
100
+ raise RuntimeError(f"Stata binary not found: {_discovery_error}") from _discovery_error
101
+
102
+ # This is the first attempt - run discovery
103
+ _discovery_attempted = True
104
+
105
+ try:
106
+ # Log environment state once at first discovery
107
+ env_path = os.getenv("STATA_PATH")
108
+ if env_path:
109
+ logger.info("STATA_PATH env provided (raw): %s", env_path)
110
+ else:
111
+ logger.info("STATA_PATH env not set; attempting auto-discovery")
112
+
113
+ # Run discovery
114
+ candidates = find_stata_candidates()
115
+
116
+ # Cache the successful result
117
+ _discovery_candidates = candidates
118
+ if candidates:
119
+ _discovery_result = candidates[0]
120
+ logger.info("Discovery found Stata at: %s (%s)", _discovery_result[0], _discovery_result[1])
121
+ else:
122
+ raise FileNotFoundError("No Stata candidates discovered")
123
+
124
+ return candidates
125
+
126
+ except FileNotFoundError as e:
127
+ _discovery_error = e
128
+ raise RuntimeError(f"Stata binary not found: {e}") from e
129
+ except PermissionError as e:
130
+ _discovery_error = e
131
+ raise RuntimeError(
132
+ f"Stata binary is not executable: {e}. "
133
+ "Point STATA_PATH directly to the Stata binary (e.g., .../Contents/MacOS/stata-mp)."
134
+ ) from e
135
+
136
+
137
+ def _get_discovered_stata() -> Tuple[str, str]:
138
+ """
139
+ Preserve existing API: return the highest-priority discovered Stata candidate.
140
+ """
141
+ candidates = _get_discovery_candidates()
142
+ if not candidates:
143
+ raise RuntimeError("Stata binary not found: no candidates discovered")
144
+ return candidates[0]
145
+
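As a usage note, discovery can be short-circuited with an environment override (the path below is illustrative):

# export STATA_PATH=/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp
path, edition = _get_discovered_stata()  # e.g., ("/Applications/.../stata-mp", "mp")
# Subsequent calls return the cached result without re-probing the filesystem.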
146
+
147
+ class StataClient:
148
+ _initialized = False
149
+ _exec_lock: threading.Lock
150
+ _cache_init_lock = threading.Lock() # Class-level lock for cache initialization
151
+ _is_executing = False # Flag to prevent recursive Stata calls
152
+ MAX_DATA_ROWS = MAX_LIMIT
153
+ MAX_GRAPH_BYTES = 50 * 1024 * 1024 # Maximum size of a single graph export (50MB)
154
+ MAX_CACHE_SIZE = 100 # Maximum number of graphs to cache
155
+ MAX_CACHE_BYTES = 500 * 1024 * 1024 # Maximum cache size in bytes (~500MB)
156
+ LIST_GRAPHS_TTL = 0.075 # TTL for list_graphs cache (75ms)
157
+
158
+ def __init__(self):
159
+ self._exec_lock = threading.RLock()
160
+ self._is_executing = False
161
+ self._command_idx = 0 # Counter for user-initiated commands
162
+ self._initialized = False
163
+ from .graph_detector import GraphCreationDetector
164
+ self._graph_detector = GraphCreationDetector(self)
165
+
166
+ def __new__(cls):
167
+ inst = super(StataClient, cls).__new__(cls)
168
+ inst._exec_lock = threading.RLock()
169
+ inst._is_executing = False
170
+ inst._command_idx = 0
171
+ from .graph_detector import GraphCreationDetector
172
+ inst._graph_detector = GraphCreationDetector(inst)
173
+ return inst
174
+
175
+ def _increment_command_idx(self) -> int:
176
+ """Increment and return the command counter."""
177
+ self._command_idx += 1
178
+ return self._command_idx
179
+
180
+ @contextmanager
181
+ def _redirect_io(self, out_buf, err_buf):
182
+ """Safely redirect stdout/stderr for the duration of a Stata call."""
183
+ backup_stdout, backup_stderr = sys.stdout, sys.stderr
184
+ sys.stdout, sys.stderr = out_buf, err_buf
185
+ try:
186
+ yield
187
+ finally:
188
+ sys.stdout, sys.stderr = backup_stdout, backup_stderr
189
+
190
+
191
+ @staticmethod
192
+ def _stata_quote(value: str) -> str:
193
+ """Return a Stata double-quoted string literal for value."""
194
+ # Stata uses doubled quotes to represent a quote character inside a string.
195
+ v = (value or "")
196
+ v = v.replace('"', '""')
197
+ # Use compound double quotes to avoid tokenization issues with spaces and
198
+ # punctuation in contexts like graph names.
199
+ return f'`"{v}"\''
200
+
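A standalone sketch of the quoting rule (sample strings invented): inner double quotes are doubled, then the value is wrapped in compound quotes so spaces and punctuation survive Stata's tokenizer.

def stata_quote(value: str) -> str:
    v = (value or "").replace('"', '""')
    return f'`"{v}"\''

assert stata_quote('My "Scatter" Plot') == '`"My ""Scatter"" Plot"\''
assert stata_quote("") == '`""\''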
201
+ @contextmanager
202
+ def _redirect_io_streaming(self, out_stream, err_stream):
203
+ backup_stdout, backup_stderr = sys.stdout, sys.stderr
204
+ sys.stdout, sys.stderr = out_stream, err_stream
205
+ try:
206
+ yield
207
+ finally:
208
+ sys.stdout, sys.stderr = backup_stdout, backup_stderr
209
+
210
+ @staticmethod
211
+ def _safe_unlink(path: str) -> None:
212
+ if not path:
213
+ return
214
+ try:
215
+ if os.path.exists(path):
216
+ os.unlink(path)
217
+ except Exception:
218
+ pass
219
+
220
+ def _create_smcl_log_path(
221
+ self,
222
+ *,
223
+ prefix: str = "mcp_smcl_",
224
+ max_hex: Optional[int] = None,
225
+ base_dir: Optional[str] = None,
226
+ ) -> str:
227
+ hex_id = uuid.uuid4().hex if max_hex is None else uuid.uuid4().hex[:max_hex]
228
+ base = os.path.realpath(tempfile.gettempdir())
229
+ smcl_path = os.path.join(base, f"{prefix}{hex_id}.smcl")
230
+ self._safe_unlink(smcl_path)
231
+ return smcl_path
232
+
233
+ @staticmethod
234
+ def _make_smcl_log_name() -> str:
235
+ return f"_mcp_smcl_{uuid.uuid4().hex[:8]}"
236
+
237
+ def _open_smcl_log(self, smcl_path: str, log_name: str, *, quiet: bool = False) -> bool:
238
+ path_for_stata = smcl_path.replace("\\", "/")
239
+ base_cmd = f"log using \"{path_for_stata}\", replace smcl name({log_name})"
240
+ unnamed_cmd = f"log using \"{path_for_stata}\", replace smcl"
241
+ for attempt in range(4):
242
+ try:
243
+ logger.debug(
244
+ "_open_smcl_log attempt=%s log_name=%s path=%s",
245
+ attempt + 1,
246
+ log_name,
247
+ smcl_path,
248
+ )
249
+ logger.warning(
250
+ "SMCL open attempt %s cwd=%s path=%s",
251
+ attempt + 1,
252
+ os.getcwd(),
253
+ smcl_path,
254
+ )
255
+ logger.debug(
256
+ "SMCL open attempt=%s cwd=%s path=%s cmd=%s",
257
+ attempt + 1,
258
+ os.getcwd(),
259
+ smcl_path,
260
+ base_cmd,
261
+ )
262
+ try:
263
+ close_ret = self.stata.run("capture log close _all", echo=False)
264
+ if close_ret:
265
+ logger.warning("SMCL close_all output: %s", close_ret)
266
+ except Exception:
267
+ pass
268
+ cmd = f"{'quietly ' if quiet else ''}{base_cmd}"
269
+ try:
270
+ output_buf = StringIO()
271
+ with redirect_stdout(output_buf), redirect_stderr(output_buf):
272
+ self.stata.run(cmd, echo=False)
273
+ ret = output_buf.getvalue().strip()
274
+ if ret:
275
+ logger.warning("SMCL log open output: %s", ret)
276
+ except Exception as e:
277
+ logger.warning("SMCL log open failed (attempt %s): %s", attempt + 1, e)
278
+ logger.warning("SMCL log open failed: %r", e)
279
+ try:
280
+ retry_buf = StringIO()
281
+ with redirect_stdout(retry_buf), redirect_stderr(retry_buf):
282
+ self.stata.run(base_cmd, echo=False)
283
+ ret = retry_buf.getvalue().strip()
284
+ if ret:
285
+ logger.warning("SMCL log open output (no quiet): %s", ret)
286
+ except Exception as inner:
287
+ logger.warning("SMCL log open retry failed: %s", inner)
288
+ query_buf = StringIO()
289
+ try:
290
+ with redirect_stdout(query_buf), redirect_stderr(query_buf):
291
+ self.stata.run("log query", echo=False)
292
+ except Exception as query_err:
293
+ query_buf.write(f"log query failed: {query_err!r}")
294
+ query_ret = query_buf.getvalue().strip()
295
+ logger.warning("SMCL log query output: %s", query_ret)
296
+
297
+ if query_ret:
298
+ query_lower = query_ret.lower()
299
+ log_confirmed = "log:" in query_lower and "smcl" in query_lower and " on" in query_lower
300
+ if log_confirmed:
301
+ self._last_smcl_log_named = True
302
+ logger.info("SMCL log confirmed: %s", path_for_stata)
303
+ return True
304
+ logger.warning("SMCL log not confirmed after open; query_ret=%s", query_ret)
305
+ try:
306
+ unnamed_output = StringIO()
307
+ with redirect_stdout(unnamed_output), redirect_stderr(unnamed_output):
308
+ self.stata.run(unnamed_cmd, echo=False)
309
+ unnamed_ret = unnamed_output.getvalue().strip()
310
+ if unnamed_ret:
311
+ logger.warning("SMCL log open output (unnamed): %s", unnamed_ret)
312
+ except Exception as e:
313
+ logger.warning("SMCL log open failed (unnamed, attempt %s): %s", attempt + 1, e)
314
+ unnamed_query_buf = StringIO()
315
+ try:
316
+ with redirect_stdout(unnamed_query_buf), redirect_stderr(unnamed_query_buf):
317
+ self.stata.run("log query", echo=False)
318
+ except Exception as query_err:
319
+ unnamed_query_buf.write(f"log query failed: {query_err!r}")
320
+ unnamed_query = unnamed_query_buf.getvalue().strip()
321
+ if unnamed_query:
322
+ unnamed_lower = unnamed_query.lower()
323
+ unnamed_confirmed = "log:" in unnamed_lower and "smcl" in unnamed_lower and " on" in unnamed_lower
324
+ if unnamed_confirmed:
325
+ self._last_smcl_log_named = False
326
+ logger.info("SMCL log confirmed (unnamed): %s", path_for_stata)
327
+ return True
328
+ except Exception as e:
329
+ logger.warning("Failed to open SMCL log (attempt %s): %s", attempt + 1, e)
330
+ if attempt < 3:
331
+ time.sleep(0.1)
332
+ logger.warning("Failed to open SMCL log with cmd: %s", cmd)
333
+ return False
334
+
335
+ def _close_smcl_log(self, log_name: str) -> None:
336
+ try:
337
+ use_named = getattr(self, "_last_smcl_log_named", None)
338
+ if use_named is False:
339
+ self.stata.run("capture log close", echo=False)
340
+ else:
341
+ self.stata.run(f"capture log close {log_name}", echo=False)
342
+ except Exception:
343
+ pass
344
+
345
+ def _restore_results_from_hold(self, hold_attr: str) -> None:
346
+ if not hasattr(self, hold_attr):
347
+ return
348
+ hold_name = getattr(self, hold_attr)
349
+ try:
350
+ self.stata.run(f"capture _return restore {hold_name}", echo=False)
351
+ self._last_results = self.get_stored_results(force_fresh=True)
352
+ except Exception:
353
+ pass
354
+ finally:
355
+ try:
356
+ delattr(self, hold_attr)
357
+ except Exception:
358
+ pass
359
+
360
+ def _create_streaming_log(self, *, trace: bool) -> tuple[tempfile.NamedTemporaryFile, str, TailBuffer, FileTeeIO]:
361
+ log_file = tempfile.NamedTemporaryFile(
362
+ prefix="mcp_stata_",
363
+ suffix=".log",
364
+ delete=False,
365
+ mode="w",
366
+ encoding="utf-8",
367
+ errors="replace",
368
+ buffering=1,
369
+ )
370
+ log_path = log_file.name
371
+ tail = TailBuffer(max_chars=200000 if trace else 20000)
372
+ tee = FileTeeIO(log_file, tail)
373
+ return log_file, log_path, tail, tee
374
+
375
+ def _init_streaming_graph_cache(
376
+ self,
377
+ auto_cache_graphs: bool,
378
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]],
379
+ notify_log: Callable[[str], Awaitable[None]],
380
+ ) -> Optional[StreamingGraphCache]:
381
+ if not auto_cache_graphs:
382
+ return None
383
+ graph_cache = StreamingGraphCache(self, auto_cache=True)
384
+ graph_cache_callback = self._create_graph_cache_callback(on_graph_cached, notify_log)
385
+ graph_cache.add_cache_callback(graph_cache_callback)
386
+ return graph_cache
387
+
388
+ def _capture_graph_state(
389
+ self,
390
+ graph_cache: Optional[StreamingGraphCache],
391
+ emit_graph_ready: bool,
392
+ ) -> Optional[dict[str, str]]:
393
+ # Capture initial graph state BEFORE execution starts
394
+ if graph_cache:
395
+ # Clear detection state for the new command (detected/removed sets)
396
+ # but preserve _last_graph_state signatures for modification detection.
397
+ graph_cache.detector.clear_detection_state()
398
+ try:
399
+ graph_cache._initial_graphs = set(self.list_graphs(force_refresh=True))
400
+ logger.debug(f"Initial graph state captured: {graph_cache._initial_graphs}")
401
+ except Exception as e:
402
+ logger.debug(f"Failed to capture initial graph state: {e}")
403
+ graph_cache._initial_graphs = set()
404
+
405
+ graph_ready_initial = None
406
+ if emit_graph_ready:
407
+ try:
408
+ graph_ready_initial = {}
409
+ for graph_name in self.list_graphs(force_refresh=True):
410
+ graph_ready_initial[graph_name] = self._get_graph_signature(graph_name)
411
+ logger.debug("Graph-ready initial state captured: %s", set(graph_ready_initial))
412
+ except Exception as e:
413
+ logger.debug("Failed to capture graph-ready state: %s", e)
414
+ graph_ready_initial = {}
415
+ return graph_ready_initial
416
+
417
+ async def _cache_new_graphs(
418
+ self,
419
+ graph_cache: Optional[StreamingGraphCache],
420
+ *,
421
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]],
422
+ total_lines: int,
423
+ completed_label: str,
424
+ ) -> None:
425
+ if not graph_cache or not graph_cache.auto_cache:
426
+ return
427
+ try:
428
+ cached_graphs = []
429
+ # Use detector to find new OR modified graphs
430
+ pystata_detected = await anyio.to_thread.run_sync(graph_cache.detector._detect_graphs_via_pystata)
431
+
432
+ # Combine with any pending graphs in queue
433
+ with graph_cache._lock:
434
+ to_process = set(pystata_detected) | set(graph_cache._graphs_to_cache)
435
+ graph_cache._graphs_to_cache.clear()
436
+
437
+ if to_process:
438
+ logger.info(f"Detected {len(to_process)} new or modified graph(s): {sorted(to_process)}")
439
+
440
+ for graph_name in to_process:
441
+ if graph_name in graph_cache._cached_graphs:
442
+ continue
443
+
444
+ try:
445
+ cache_result = await anyio.to_thread.run_sync(
446
+ self.cache_graph_on_creation,
447
+ graph_name,
448
+ )
449
+ if cache_result:
450
+ cached_graphs.append(graph_name)
451
+ graph_cache._cached_graphs.add(graph_name)
452
+
453
+ for callback in graph_cache._cache_callbacks:
454
+ try:
455
+ result = callback(graph_name, cache_result)
456
+ if inspect.isawaitable(result):
457
+ await result
458
+ except Exception:
459
+ pass
460
+ except Exception as e:
461
+ logger.error(f"Error caching graph {graph_name}: {e}")
462
+
463
+ if cached_graphs and notify_progress:
464
+ await notify_progress(
465
+ float(total_lines) if total_lines > 0 else 1,
466
+ float(total_lines) if total_lines > 0 else 1,
467
+ f"{completed_label} completed. Cached {len(cached_graphs)} graph(s): {', '.join(cached_graphs)}",
468
+ )
469
+ except Exception as e:
470
+ logger.error(f"Post-execution graph detection failed: {e}")
471
+
472
+ def _emit_graph_ready_task(
473
+ self,
474
+ *,
475
+ emit_graph_ready: bool,
476
+ graph_ready_initial: Optional[dict[str, str]],
477
+ notify_log: Callable[[str], Awaitable[None]],
478
+ graph_ready_task_id: Optional[str],
479
+ graph_ready_format: str,
480
+ ) -> None:
481
+ if emit_graph_ready and graph_ready_initial is not None:
482
+ try:
483
+ asyncio.create_task(
484
+ self._emit_graph_ready_events(
485
+ graph_ready_initial,
486
+ notify_log,
487
+ graph_ready_task_id,
488
+ graph_ready_format,
489
+ )
490
+ )
491
+ except Exception as e:
492
+ logger.warning("graph_ready emission failed to start: %s", e)
493
+
494
+ async def _stream_smcl_log(
495
+ self,
496
+ *,
497
+ smcl_path: str,
498
+ notify_log: Callable[[str], Awaitable[None]],
499
+ done: anyio.Event,
500
+ on_chunk: Optional[Callable[[str], Awaitable[None]]] = None,
501
+ ) -> None:
502
+ last_pos = 0
503
+ emitted_debug_chunks = 0
504
+ # Wait for Stata to create the SMCL file
505
+ while not done.is_set() and not os.path.exists(smcl_path):
506
+ await anyio.sleep(0.05)
507
+
508
+ try:
509
+ def _read_content() -> str:
510
+ try:
511
+ with open(smcl_path, "r", encoding="utf-8", errors="replace") as f:
512
+ f.seek(last_pos)
513
+ return f.read()
514
+ except PermissionError:
515
+ if os.name == "nt":
516
+ try:
517
+ res = subprocess.run(f'type "{smcl_path}"', shell=True, capture_output=True)
518
+ full_content = res.stdout.decode("utf-8", errors="replace")
519
+ if len(full_content) > last_pos:
520
+ return full_content[last_pos:]
521
+ return ""
522
+ except Exception:
523
+ return ""
524
+ return ""
525
+ except FileNotFoundError:
526
+ return ""
527
+
528
+ while not done.is_set():
529
+ chunk = await anyio.to_thread.run_sync(_read_content)
530
+ if chunk:
531
+ last_pos += len(chunk)
532
+ try:
533
+ await notify_log(chunk)
534
+ except Exception as exc:
535
+ logger.debug("notify_log failed: %s", exc)
536
+ if on_chunk is not None:
537
+ try:
538
+ await on_chunk(chunk)
539
+ except Exception as exc:
540
+ logger.debug("on_chunk callback failed: %s", exc)
541
+ await anyio.sleep(0.05)
542
+
543
+ chunk = await anyio.to_thread.run_sync(_read_content)
544
+ if on_chunk is not None:
545
+ # Final check even if last chunk is empty, to ensure
546
+ # graphs created at the very end are detected.
547
+ try:
548
+ await on_chunk(chunk or "")
549
+ except Exception as exc:
550
+ logger.debug("final on_chunk check failed: %s", exc)
551
+
552
+ if chunk:
553
+ last_pos += len(chunk)
554
+ try:
555
+ await notify_log(chunk)
556
+ except Exception as exc:
557
+ logger.debug("notify_log failed: %s", exc)
558
+
559
+ except Exception as e:
560
+ logger.warning(f"Log streaming failed: {e}")
561
+
562
+ def _run_streaming_blocking(
563
+ self,
564
+ *,
565
+ command: str,
566
+ tee: FileTeeIO,
567
+ cwd: Optional[str],
568
+ trace: bool,
569
+ echo: bool,
570
+ smcl_path: str,
571
+ smcl_log_name: str,
572
+ hold_attr: str,
573
+ require_smcl_log: bool = False,
574
+ ) -> tuple[int, Optional[Exception]]:
575
+ rc = -1
576
+ exc: Optional[Exception] = None
577
+ with self._exec_lock:
578
+ self._is_executing = True
579
+ try:
580
+ from sfi import Scalar, SFIToolkit # Import SFI tools
581
+ with self._temp_cwd(cwd):
582
+ logger.debug(
583
+ "opening SMCL log name=%s path=%s cwd=%s",
584
+ smcl_log_name,
585
+ smcl_path,
586
+ os.getcwd(),
587
+ )
588
+ try:
589
+ log_opened = self._open_smcl_log(smcl_path, smcl_log_name, quiet=True)
590
+ except Exception as e:
591
+ log_opened = False
592
+ logger.warning("_open_smcl_log raised: %r", e)
593
+ logger.info("SMCL log_opened=%s path=%s", log_opened, smcl_path)
594
+ if require_smcl_log and not log_opened:
595
+ exc = RuntimeError("Failed to open SMCL log")
596
+ logger.error("SMCL log open failed for %s", smcl_path)
597
+ rc = 1
598
+ if exc is None:
599
+ try:
600
+ with self._redirect_io_streaming(tee, tee):
601
+ try:
602
+ if trace:
603
+ self.stata.run("set trace on")
604
+ logger.debug("running Stata command echo=%s: %s", echo, command)
605
+ ret = self.stata.run(command, echo=echo)
606
+ if ret:
607
+ logger.debug("stata.run output: %s", ret)
608
+
609
+ setattr(self, hold_attr, f"mcp_hold_{uuid.uuid4().hex[:8]}")
610
+ self.stata.run(
611
+ f"capture _return hold {getattr(self, hold_attr)}",
612
+ echo=False,
613
+ )
614
+
615
+ if isinstance(ret, str) and ret:
616
+ try:
617
+ tee.write(ret)
618
+ except Exception:
619
+ pass
620
+ try:
621
+ rc = self._get_rc_from_scalar(Scalar)
622
+ except Exception:
623
+ pass
624
+ except Exception as e:
625
+ exc = e
626
+ logger.error("stata.run failed: %r", e)
627
+ if rc in (-1, 0):
628
+ rc = 1
629
+ finally:
630
+ if trace:
631
+ try:
632
+ self.stata.run("set trace off")
633
+ except Exception:
634
+ pass
635
+ finally:
636
+ self._close_smcl_log(smcl_log_name)
637
+ self._restore_results_from_hold(hold_attr)
638
+ return rc, exc
639
+ # If we get here, SMCL log failed and we're required to stop.
640
+ return rc, exc
641
+ finally:
642
+ self._is_executing = False
643
+ return rc, exc
644
+
645
+ def _resolve_do_file_path(
646
+ self,
647
+ path: str,
648
+ cwd: Optional[str],
649
+ ) -> tuple[Optional[str], Optional[str], Optional[CommandResponse]]:
650
+ if cwd is not None and not os.path.isdir(cwd):
651
+ return None, None, CommandResponse(
652
+ command=f'do "{path}"',
653
+ rc=601,
654
+ stdout="",
655
+ stderr=None,
656
+ success=False,
657
+ error=ErrorEnvelope(
658
+ message=f"cwd not found: {cwd}",
659
+ rc=601,
660
+ command=path,
661
+ ),
662
+ )
663
+
664
+ effective_path = path
665
+ if cwd is not None and not os.path.isabs(path):
666
+ effective_path = os.path.abspath(os.path.join(cwd, path))
667
+
668
+ if not os.path.exists(effective_path):
669
+ return None, None, CommandResponse(
670
+ command=f'do "{effective_path}"',
671
+ rc=601,
672
+ stdout="",
673
+ stderr=None,
674
+ success=False,
675
+ error=ErrorEnvelope(
676
+ message=f"Do-file not found: {effective_path}",
677
+ rc=601,
678
+ command=effective_path,
679
+ ),
680
+ )
681
+
682
+ path_for_stata = effective_path.replace("\\", "/")
683
+ command = f'do "{path_for_stata}"'
684
+ return effective_path, command, None
685
+
686
+ @contextmanager
687
+ def _smcl_log_capture(self) -> "Generator[Tuple[str, str], None, None]":
688
+ """
689
+ Context manager that wraps command execution in a named SMCL log.
690
+
691
+ This runs alongside any user logs (named logs can coexist).
692
+ Yields (log_name, log_path) tuple for use within the context.
693
+ The SMCL file is NOT deleted automatically - caller should clean up.
694
+
695
+ Usage:
696
+ with self._smcl_log_capture() as (log_name, smcl_path):
697
+ self.stata.run(cmd)
698
+ # After context, read smcl_path for raw SMCL output
699
+ """
700
+ # Use a unique path but DO NOT pre-create the file with mkstemp, to avoid stale file locks.
701
+ # Stata will create the file.
702
+ smcl_path = self._create_smcl_log_path()
703
+ # Unique log name to avoid collisions with user logs
704
+ log_name = self._make_smcl_log_name()
705
+
706
+ try:
707
+ # Open named SMCL log (quietly to avoid polluting output)
708
+ log_opened = self._open_smcl_log(smcl_path, log_name, quiet=True)
709
+ if not log_opened:
710
+ # Still yield, consumer might see empty file or handle error,
711
+ # but we can't do much if Stata refuses to log.
712
+ pass
713
+
714
+ yield log_name, smcl_path
715
+ finally:
716
+ # Always close our named log
717
+ self._close_smcl_log(log_name)
718
+
719
+ def _read_smcl_file(self, path: str) -> str:
720
+ """Read SMCL file contents, handling encoding issues and Windows file locks."""
721
+ try:
722
+ with open(path, 'r', encoding='utf-8', errors='replace') as f:
723
+ return f.read()
724
+ except PermissionError:
725
+ if os.name == "nt":
726
+ # Windows Fallback: Try to use 'type' command to bypass exclusive lock
727
+ try:
728
+ res = subprocess.run(f'type "{path}"', shell=True, capture_output=True)
729
+ if res.returncode == 0:
730
+ return res.stdout.decode('utf-8', errors='replace')
731
+ except Exception as e:
732
+ logger.debug(f"Combined fallback read failed: {e}")
733
+ logger.warning(f"Failed to read SMCL file {path} due to lock")
734
+ return ""
735
+ except Exception as e:
736
+ logger.warning(f"Failed to read SMCL file {path}: {e}")
737
+ return ""
738
+
739
+ def _extract_error_from_smcl(self, smcl_content: str, rc: int) -> Tuple[str, str]:
740
+ """
741
+ Extract error message and context from raw SMCL output.
742
+
743
+ Uses {err} tags as the authoritative source for error detection.
744
+
745
+ Returns:
746
+ Tuple of (error_message, context_string)
747
+ """
748
+ if not smcl_content:
749
+ return f"Stata error r({rc})", ""
750
+
751
+ # Try Rust optimization
752
+ native_res = fast_scan_log(smcl_content, rc)
753
+ if native_res:
754
+ error_msg, context, _ = native_res
755
+ return error_msg, context
756
+
757
+ lines = smcl_content.splitlines()
758
+
759
+ # Search backwards for {err} tags - they indicate error lines
760
+ error_lines = []
761
+ error_start_idx = -1
762
+
763
+ for i in range(len(lines) - 1, -1, -1):
764
+ line = lines[i]
765
+ if '{err}' in line:
766
+ if error_start_idx == -1:
767
+ error_start_idx = i
768
+ # Walk backwards to find consecutive {err} lines
769
+ j = i
770
+ while j >= 0 and '{err}' in lines[j]:
771
+ error_lines.insert(0, lines[j])
772
+ j -= 1
773
+ break
774
+
775
+ if error_lines:
776
+ # Clean SMCL tags from error message
777
+ clean_lines = []
778
+ for line in error_lines:
779
+ # Remove SMCL tags but keep the text content
780
+ cleaned = re.sub(r'\{[^}]*\}', '', line).strip()
781
+ if cleaned:
782
+ clean_lines.append(cleaned)
783
+
784
+ error_msg = " ".join(clean_lines) or f"Stata error r({rc})"
785
+
786
+ # Context is everything from error start to end
787
+ context_start = max(0, error_start_idx - 5) # Include 5 lines before error
788
+ context = "\n".join(lines[context_start:])
789
+
790
+ return error_msg, context
791
+
792
+ # Fallback: no {err} found, return last 30 lines as context
793
+ context_start = max(0, len(lines) - 30)
794
+ context = "\n".join(lines[context_start:])
795
+
796
+ return f"Stata error r({rc})", context
797
+
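To see what the tag-stripping step does, a tiny sketch with an invented {err} line:

import re

line = "{err}variable mpgg not found"
print(re.sub(r"\{[^}]*\}", "", line).strip())  # -> variable mpgg not found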
798
+ def _parse_rc_from_smcl(self, smcl_content: str) -> Optional[int]:
799
+ """Parse return code from SMCL content using specific structural patterns."""
800
+ if not smcl_content:
801
+ return None
802
+
803
+ # Try Rust optimization
804
+ native_res = fast_scan_log(smcl_content, 0)
805
+ if native_res:
806
+ _, _, rc = native_res
807
+ if rc is not None:
808
+ return rc
809
+
810
+ # 1. Primary check: SMCL search tag {search r(N), ...}
811
+ # This is the most authoritative interactive indicator
812
+ matches = list(re.finditer(r'\{search r\((\d+)\)', smcl_content))
813
+ if matches:
814
+ try:
815
+ return int(matches[-1].group(1))
816
+ except Exception:
817
+ pass
818
+
819
+ # 2. Secondary check: Standalone r(N); pattern
820
+ # This appears at the end of command blocks
821
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', smcl_content))
822
+ if matches:
823
+ try:
824
+ return int(matches[-1].group(1))
825
+ except Exception:
826
+ pass
827
+
828
+ return None
829
+
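Against an invented SMCL fragment, the primary pattern picks the last {search r(N)} tag; the bare r(N); pattern is only a fallback:

import re

sample = "{err}file nosuch.dta not found\n{search r(601), local:r(601);}\n"
tags = list(re.finditer(r"\{search r\((\d+)\)", sample))
print(int(tags[-1].group(1)))  # -> 601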
830
+ @staticmethod
831
+ def _create_graph_cache_callback(on_graph_cached, notify_log):
832
+ """Create a standardized graph cache callback with proper error handling."""
833
+ async def graph_cache_callback(graph_name: str, success: bool) -> None:
834
+ try:
835
+ if on_graph_cached:
836
+ await on_graph_cached(graph_name, success)
837
+ except Exception as e:
838
+ logger.error(f"Graph cache callback failed: {e}")
839
+
840
+ try:
841
+ # Also notify via log channel
842
+ await notify_log(json.dumps({
843
+ "event": "graph_cached",
844
+ "graph": graph_name,
845
+ "success": success
846
+ }))
847
+ except Exception as e:
848
+ logger.error(f"Failed to notify about graph cache: {e}")
849
+
850
+ return graph_cache_callback
851
+
852
+ def _get_cached_graph_path(self, graph_name: str) -> Optional[str]:
853
+ if not hasattr(self, "_cache_lock") or not hasattr(self, "_preemptive_cache"):
854
+ return None
855
+ try:
856
+ with self._cache_lock:
857
+ cache_path = self._preemptive_cache.get(graph_name)
858
+ if not cache_path:
859
+ return None
860
+
861
+ # Double-check validity (e.g. signature match for current command)
862
+ if not self._is_cache_valid(graph_name, cache_path):
863
+ return None
864
+
865
+ return cache_path
866
+ except Exception:
867
+ return None
868
+
869
+ async def _emit_graph_ready_for_graphs(
870
+ self,
871
+ graph_names: List[str],
872
+ *,
873
+ notify_log: Callable[[str], Awaitable[None]],
874
+ task_id: Optional[str],
875
+ export_format: str,
876
+ graph_ready_initial: Optional[dict[str, str]],
877
+ ) -> None:
878
+ if not graph_names:
879
+ return
880
+ fmt = (export_format or "svg").strip().lower()
881
+ for graph_name in graph_names:
882
+ signature = self._get_graph_signature(graph_name)
883
+ if graph_ready_initial is not None:
884
+ previous = graph_ready_initial.get(graph_name)
885
+ if previous is not None and previous == signature:
886
+ continue
887
+ try:
888
+ export_path = None
889
+ if fmt == "svg":
890
+ export_path = self._get_cached_graph_path(graph_name)
891
+ if not export_path:
892
+ export_path = await anyio.to_thread.run_sync(
893
+ lambda: self.export_graph(graph_name, format=fmt)
894
+ )
895
+ payload = {
896
+ "event": "graph_ready",
897
+ "task_id": task_id,
898
+ "graph": {
899
+ "name": graph_name,
900
+ "path": export_path,
901
+ "label": graph_name,
902
+ },
903
+ }
904
+ await notify_log(json.dumps(payload))
905
+ if graph_ready_initial is not None:
906
+ graph_ready_initial[graph_name] = signature
907
+ except Exception as e:
908
+ logger.warning("graph_ready export failed for %s: %s", graph_name, e)
909
+
910
+ async def _maybe_cache_graphs_on_chunk(
911
+ self,
912
+ *,
913
+ graph_cache: Optional[StreamingGraphCache],
914
+ emit_graph_ready: bool,
915
+ notify_log: Callable[[str], Awaitable[None]],
916
+ graph_ready_task_id: Optional[str],
917
+ graph_ready_format: str,
918
+ graph_ready_initial: Optional[dict[str, str]],
919
+ last_check: List[float],
920
+ force: bool = False,
921
+ ) -> None:
922
+ if not graph_cache or not graph_cache.auto_cache:
923
+ return
924
+ if self._is_executing and not force:
925
+ # Skip polling if Stata is busy; it will block on _exec_lock anyway.
926
+ # During final check (force=True), we know it's safe because _run_streaming_blocking has finished.
927
+ return
928
+ now = time.monotonic()
929
+ if not force and last_check and now - last_check[0] < 0.25:
930
+ return
931
+ if last_check:
932
+ last_check[0] = now
933
+ try:
934
+ cached_names = await graph_cache.cache_detected_graphs_with_pystata()
935
+ except Exception as e:
936
+ logger.debug("graph_ready polling failed: %s", e)
937
+ return
938
+ if emit_graph_ready and cached_names:
939
+ await self._emit_graph_ready_for_graphs(
940
+ cached_names,
941
+ notify_log=notify_log,
942
+ task_id=graph_ready_task_id,
943
+ export_format=graph_ready_format,
944
+ graph_ready_initial=graph_ready_initial,
945
+ )
946
+
947
+ async def _emit_graph_ready_events(
948
+ self,
949
+ initial_graphs: dict[str, str],
950
+ notify_log: Callable[[str], Awaitable[None]],
951
+ task_id: Optional[str],
952
+ export_format: str,
953
+ ) -> None:
954
+ try:
955
+ current_graphs = list(self.list_graphs(force_refresh=True))
956
+ except Exception as e:
957
+ logger.warning("graph_ready: list_graphs failed: %s", e)
958
+ return
959
+
960
+ if not current_graphs:
961
+ return
962
+
963
+ for graph_name in current_graphs:
964
+ signature = self._get_graph_signature(graph_name)
965
+ previous = initial_graphs.get(graph_name)
966
+ if previous is not None and previous == signature:
967
+ continue
968
+ try:
969
+ export_path = None
970
+ if export_format == "svg":
971
+ export_path = self._get_cached_graph_path(graph_name)
972
+
973
+ if not export_path:
974
+ export_path = await anyio.to_thread.run_sync(
975
+ lambda: self.export_graph(graph_name, format=export_format)
976
+ )
977
+ payload = {
978
+ "event": "graph_ready",
979
+ "task_id": task_id,
980
+ "graph": {
981
+ "name": graph_name,
982
+ "path": export_path,
983
+ "label": graph_name,
984
+ },
985
+ }
986
+ await notify_log(json.dumps(payload))
987
+ initial_graphs[graph_name] = signature
988
+ except Exception as e:
989
+ logger.warning("graph_ready export failed for %s: %s", graph_name, e)
990
+
991
+ def _get_graph_signature(self, graph_name: str) -> str:
992
+ """
993
+ Get a stable signature for a graph without calling Stata.
994
+ Consistent with GraphCreationDetector implementation.
995
+ """
996
+ if not graph_name:
997
+ return ""
998
+ cmd_idx = getattr(self, "_command_idx", 0)
999
+ # Include command index for all graphs to detect re-creation/modification
1000
+ # between commands. The detector's internal set will handle deduplication
1001
+ # within a single command execution stream.
1002
+ return f"{graph_name}_{cmd_idx}"
1003
+
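A sketch of the scheme, assuming the counter sits at 7 when a graph command runs:

def signature(graph_name: str, cmd_idx: int) -> str:
    return f"{graph_name}_{cmd_idx}" if graph_name else ""

assert signature("scatter1", 7) == "scatter1_7"
# The next command bumps the index, so a re-created graph no longer matches
# its stored signature and is treated as modified:
assert signature("scatter1", 8) != signature("scatter1", 7)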
1004
+ def _request_break_in(self) -> None:
1005
+ """
1006
+ Attempt to interrupt a running Stata command when cancellation is requested.
1007
+
1008
+ Uses the Stata sfi.breakIn hook when available; errors are swallowed because
1009
+ cancellation should never crash the host process.
1010
+ """
1011
+ try:
1012
+ import sfi # type: ignore[import-not-found]
1013
+
1014
+ break_fn = getattr(sfi, "breakIn", None) or getattr(sfi, "break_in", None)
1015
+ if callable(break_fn):
1016
+ try:
1017
+ break_fn()
1018
+ logger.info("Sent breakIn() to Stata for cancellation")
1019
+ except Exception as e: # pragma: no cover - best-effort
1020
+ logger.warning(f"Failed to send breakIn() to Stata: {e}")
1021
+ else: # pragma: no cover - environment without Stata runtime
1022
+ logger.debug("sfi.breakIn not available; cannot interrupt Stata")
1023
+ except Exception as e: # pragma: no cover - import failure or other
1024
+ logger.debug(f"Unable to import sfi for cancellation: {e}")
1025
+
1026
+ async def _wait_for_stata_stop(self, timeout: float = 2.0) -> bool:
1027
+ """
1028
+ After requesting a break, poll the Stata interface so it can surface BreakError
1029
+ and return control. This is best-effort and time-bounded.
1030
+ """
1031
+ deadline = time.monotonic() + timeout
1032
+ try:
1033
+ import sfi # type: ignore[import-not-found]
1034
+
1035
+ toolkit = getattr(sfi, "SFIToolkit", None)
1036
+ poll = getattr(toolkit, "pollnow", None) or getattr(toolkit, "pollstd", None)
1037
+ BreakError = getattr(sfi, "BreakError", None)
1038
+ except Exception: # pragma: no cover
1039
+ return False
1040
+
1041
+ if not callable(poll):
1042
+ return False
1043
+
1044
+ last_exc: Optional[Exception] = None
1045
+ while time.monotonic() < deadline:
1046
+ try:
1047
+ poll()
1048
+ except Exception as e: # pragma: no cover - depends on Stata runtime
1049
+ last_exc = e
1050
+ if BreakError is not None and isinstance(e, BreakError):
1051
+ logger.info("Stata BreakError detected; cancellation acknowledged by Stata")
1052
+ return True
1053
+ # If Stata already stopped, break on any other exception.
1054
+ break
1055
+ await anyio.sleep(0.05)
1056
+
1057
+ if last_exc:
1058
+ logger.debug(f"Cancellation poll exited with {last_exc}")
1059
+ return False
1060
+
1061
+ @contextmanager
1062
+ def _temp_cwd(self, cwd: Optional[str]):
1063
+ if cwd is None:
1064
+ yield
1065
+ return
1066
+ prev = os.getcwd()
1067
+ os.chdir(cwd)
1068
+ try:
1069
+ yield
1070
+ finally:
1071
+ os.chdir(prev)
1072
+
1073
+ @contextmanager
1074
+ def _safe_redirect_fds(self):
1075
+ """Redirects fd 1 (stdout) to fd 2 (stderr) at the OS level."""
1076
+ # Save original stdout fd
1077
+ try:
1078
+ stdout_fd = os.dup(1)
1079
+ except Exception:
1080
+ # Fallback if we can't dup (e.g. strange environment)
1081
+ yield
1082
+ return
1083
+
1084
+ try:
1085
+ # Redirect OS-level stdout to stderr
1086
+ os.dup2(2, 1)
1087
+ yield
1088
+ finally:
1089
+ # Restore stdout
1090
+ try:
1091
+ os.dup2(stdout_fd, 1)
1092
+ os.close(stdout_fd)
1093
+ except Exception:
1094
+ pass
1095
+
1096
+ def init(self):
1097
+ """Initializes usage of pystata using cached discovery results."""
1098
+ if self._initialized:
1099
+ return
1100
+
1101
+ # Suppress any non-UTF8 banner output from PyStata on stdout, which breaks MCP stdio transport
1102
+ from contextlib import redirect_stdout, redirect_stderr
1103
+
1104
+ try:
1105
+ import stata_setup
1106
+
1107
+ # Get discovered Stata paths (cached from first call)
1108
+ discovery_candidates = _get_discovery_candidates()
1109
+ if not discovery_candidates:
1110
+ raise RuntimeError("No Stata candidates found during discovery")
1111
+
1112
+ logger.info("Initializing Stata engine (attempting up to %d candidate binaries)...", len(discovery_candidates))
1113
+
1114
+ # Diagnostic: force faulthandler to output to stderr for C crashes
1115
+ import faulthandler
1116
+ faulthandler.enable(file=sys.stderr)
1117
+ import subprocess
1118
+
1119
+ success = False
1120
+ last_error = None
1121
+ chosen_exec: Optional[Tuple[str, str]] = None
1122
+
1123
+ for stata_exec_path, edition in discovery_candidates:
1124
+ candidates = []
1125
+ # Prefer the binary directory first (documented input for stata_setup)
1126
+ bin_dir = os.path.dirname(stata_exec_path)
1127
+
1128
+ # Next, look for an enclosing app bundle: .../StataMP.app (macOS only)
1129
+ curr = bin_dir
1130
+ app_bundle = None
1131
+ while len(curr) > 1:
1132
+ if curr.endswith(".app"):
1133
+ app_bundle = curr
1134
+ break
1135
+ parent = os.path.dirname(curr)
1136
+ if parent == curr:
1137
+ break
1138
+ curr = parent
1139
+
1140
+ ordered_candidates = []
1141
+ if app_bundle:
1142
+ # On macOS, the parent of the .app is often the correct install path
1143
+ # (e.g., /Applications/StataNow containing StataMP.app)
1144
+ parent_dir = os.path.dirname(app_bundle)
1145
+ if parent_dir and parent_dir != "/":
1146
+ ordered_candidates.append(parent_dir)
1147
+ ordered_candidates.append(app_bundle)
1148
+
1149
+ if bin_dir:
1150
+ ordered_candidates.append(bin_dir)
1151
+
1152
+ # Deduplicate preserving order
1153
+ seen = set()
1154
+ candidates = []
1155
+ for c in ordered_candidates:
1156
+ if c not in seen:
1157
+ seen.add(c)
1158
+ candidates.append(c)
1159
+
1160
+ for path in candidates:
1161
+ try:
1162
+ # 1. Pre-flight check in a subprocess to capture hard exits/crashes
1163
+ sys.stderr.write(f"[mcp_stata] DEBUG: Pre-flight check for path '{path}'\n")
1164
+ sys.stderr.flush()
1165
+
1166
+ preflight_code = f"""
1167
+ import sys
1168
+ import stata_setup
1169
+ from contextlib import redirect_stdout, redirect_stderr
1170
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
1171
+ try:
1172
+ stata_setup.config({repr(path)}, {repr(edition)})
1173
+ from pystata import stata
1174
+ # Minimal verification of engine health
1175
+ stata.run('display 1', echo=False)
1176
+ print('PREFLIGHT_OK')
1177
+ except Exception as e:
1178
+ print(f'PREFLIGHT_FAIL: {{e}}', file=sys.stderr)
1179
+ sys.exit(1)
1180
+ """
1181
+
1182
+ try:
1183
+ # Use shorter timeout for pre-flight if feasible,
1184
+ # but keep it safe for slow environments. 20s is usually enough for a ping.
1185
+ res = subprocess.run(
1186
+ [sys.executable, "-c", preflight_code],
1187
+ capture_output=True, text=True, timeout=20
1188
+ )
1189
+ if res.returncode != 0:
1190
+ sys.stderr.write(f"[mcp_stata] Pre-flight failed (rc={res.returncode}) for '{path}'\n")
1191
+ if res.stdout.strip():
1192
+ sys.stderr.write(f"--- Pre-flight stdout ---\n{res.stdout.strip()}\n")
1193
+ if res.stderr.strip():
1194
+ sys.stderr.write(f"--- Pre-flight stderr ---\n{res.stderr.strip()}\n")
1195
+ sys.stderr.flush()
1196
+ last_error = f"Pre-flight failed: {res.stdout.strip()} {res.stderr.strip()}"
1197
+ continue
1198
+ else:
1199
+ sys.stderr.write(f"[mcp_stata] Pre-flight succeeded for '{path}'. Proceeding to in-process init.\n")
1200
+ sys.stderr.flush()
1201
+ except Exception as pre_e:
1202
+ sys.stderr.write(f"[mcp_stata] Pre-flight execution error for '{path}': {repr(pre_e)}\n")
1203
+ sys.stderr.flush()
1204
+ last_error = pre_e
1205
+ continue
1206
+
1207
+ msg = f"[mcp_stata] DEBUG: In-process stata_setup.config('{path}', '{edition}')\n"
1208
+ sys.stderr.write(msg)
1209
+ sys.stderr.flush()
1210
+ # Redirect both sys.stdout/err AND the raw fds to our stderr pipe.
1211
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
1212
+ stata_setup.config(path, edition)
1213
+
1214
+ sys.stderr.write(f"[mcp_stata] DEBUG: stata_setup.config succeeded for path: {path}\n")
1215
+ sys.stderr.flush()
1216
+ success = True
1217
+ chosen_exec = (stata_exec_path, edition)
1218
+ logger.info("stata_setup.config succeeded with path: %s", path)
1219
+ break
1220
+ except BaseException as e:
1221
+ last_error = e
1222
+ sys.stderr.write(f"[mcp_stata] WARNING: In-process stata_setup.config caught: {repr(e)}\n")
1223
+ sys.stderr.flush()
1224
+ logger.warning("stata_setup.config failed for path '%s': %s", path, e)
1225
+ if isinstance(e, SystemExit):
1226
+ break
1227
+ continue
1228
+
1229
+ if success:
1230
+ # Cache winning candidate for subsequent lookups
1231
+ global _discovery_result
1232
+ if chosen_exec:
1233
+ _discovery_result = chosen_exec
1234
+ break
1235
+
1236
+ if not success:
1237
+ error_msg = (
1238
+ f"stata_setup.config failed to initialize Stata. "
1239
+ f"Tried candidates: {discovery_candidates}. "
1240
+ f"Last error: {repr(last_error)}"
1241
+ )
1242
+ sys.stderr.write(f"[mcp_stata] ERROR: {error_msg}\n")
1243
+ sys.stderr.flush()
1244
+ logger.error(error_msg)
1245
+ raise RuntimeError(error_msg)
1246
+
1247
+ # Cache the binary path for later use (e.g., PNG export on Windows)
1248
+ self._stata_exec_path = os.path.abspath(stata_exec_path)
1249
+
1250
+ try:
1251
+ sys.stderr.write("[mcp_stata] DEBUG: Importing pystata and warming up...\n")
1252
+ sys.stderr.flush()
1253
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
1254
+ from pystata import stata # type: ignore[import-not-found]
1255
+ # Warm up the engine and swallow any late splash screen output
1256
+ stata.run("display 1", echo=False)
1257
+ self.stata = stata
1258
+ self._initialized = True
1259
+ sys.stderr.write("[mcp_stata] DEBUG: pystata warmed up successfully\n")
1260
+ sys.stderr.flush()
1261
+ except BaseException as e:
1262
+ sys.stderr.write(f"[mcp_stata] ERROR: Failed to load pystata or run initial command: {repr(e)}\n")
1263
+ sys.stderr.flush()
1264
+ logger.error("Failed to load pystata or run initial command: %s", e)
1265
+ raise
1266
+
1267
+ # Initialize list_graphs TTL cache
1268
+ self._list_graphs_cache = None
1269
+ self._list_graphs_cache_time = 0
1270
+ self._list_graphs_cache_lock = threading.Lock()
1271
+
1272
+ # Map user-facing graph names (may include spaces/punctuation) to valid
1273
+ # internal Stata graph names.
1274
+ self._graph_name_aliases: Dict[str, str] = {}
1275
+ self._graph_name_reverse: Dict[str, str] = {}
1276
+
1277
+ logger.info("StataClient initialized successfully with %s (%s)", stata_exec_path, edition)
1278
+
1279
+ except ImportError as e:
1280
+ raise RuntimeError(
1281
+ f"Failed to import stata_setup or pystata: {e}. "
1282
+ "Ensure they are installed (pip install pystata stata-setup)."
1283
+ ) from e
1284
+
1285
+ def _make_valid_stata_name(self, name: str) -> str:
1286
+ """Create a valid Stata name (<=32 chars, [A-Za-z_][A-Za-z0-9_]*)."""
1287
+ base = re.sub(r"[^A-Za-z0-9_]", "_", name or "")
1288
+ if not base:
1289
+ base = "Graph"
1290
+ if not re.match(r"^[A-Za-z_]", base):
1291
+ base = f"G_{base}"
1292
+ base = base[:32]
1293
+
1294
+ # Avoid collisions.
1295
+ candidate = base
1296
+ i = 1
1297
+ while candidate in getattr(self, "_graph_name_reverse", {}):
1298
+ suffix = f"_{i}"
1299
+ candidate = (base[: max(0, 32 - len(suffix))] + suffix)[:32]
1300
+ i += 1
1301
+ return candidate
1302
+
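A trimmed-down sketch of the sanitizer (collision suffixing omitted; sample names invented):

import re

def make_valid(name: str) -> str:
    base = re.sub(r"[^A-Za-z0-9_]", "_", name or "") or "Graph"
    if not re.match(r"^[A-Za-z_]", base):
        base = f"G_{base}"
    return base[:32]

assert make_valid("My Plot (2)") == "My_Plot__2_"
assert make_valid("2sls results") == "G_2sls_results"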
1303
+ def _resolve_graph_name_for_stata(self, name: str) -> str:
1304
+ """Return internal Stata graph name for a user-facing name."""
1305
+ if not name:
1306
+ return name
1307
+ aliases = getattr(self, "_graph_name_aliases", None)
1308
+ if aliases and name in aliases:
1309
+ return aliases[name]
1310
+ return name
1311
+
1312
+ def _maybe_rewrite_graph_name_in_command(self, code: str) -> str:
1313
+ """Rewrite name("...") to a valid Stata name and store alias mapping."""
1314
+ if not code:
1315
+ return code
1316
+ if not hasattr(self, "_graph_name_aliases"):
1317
+ self._graph_name_aliases = {}
1318
+ self._graph_name_reverse = {}
1319
+
1320
+ # Handle common patterns: name("..." ...) or name(`"..."' ...)
1321
+ pat = re.compile(r"name\(\s*(?:`\"(?P<cq>[^\"]*)\"'|\"(?P<dq>[^\"]*)\")\s*(?P<rest>[^)]*)\)")
1322
+
1323
+ def repl(m: re.Match) -> str:
1324
+ original = m.group("cq") if m.group("cq") is not None else m.group("dq")
1325
+ original = original or ""
1326
+ internal = self._graph_name_aliases.get(original)
1327
+ if not internal:
1328
+ internal = self._make_valid_stata_name(original)
1329
+ self._graph_name_aliases[original] = internal
1330
+ self._graph_name_reverse[internal] = original
1331
+ rest = m.group("rest") or ""
1332
+ return f"name({internal}{rest})"
1333
+
1334
+ return pat.sub(repl, code)
1335
+
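For instance, a hypothetical call on a client instance rewrites the name() option in place and records the alias for later lookups:

# in : scatter price mpg, name("My Plot!", replace)
# out: scatter price mpg, name(My_Plot_, replace)
# _graph_name_aliases["My Plot!"] == "My_Plot_", so a later lookup of
# "My Plot!" via _resolve_graph_name_for_stata returns My_Plot_.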
1336
+ def _get_rc_from_scalar(self, Scalar=None) -> int:
1337
+ """Safely get return code, handling None values."""
1338
+ try:
1339
+ from sfi import Macro
1340
+ # In PyStata, the last return code is stored in the _rc macro
1341
+ # We first ensure it's copied to a known global macro for reliable retrieval
1342
+ self.stata.run("macro define mcp_last_rc = _rc", echo=False)
1343
+ rc_val = Macro.getGlobal("mcp_last_rc")
1344
+ if rc_val is None:
1345
+ return -1
1346
+ return int(float(rc_val))
1347
+ except Exception:
1348
+ return -1
1349
+
1350
+ def _parse_rc_from_text(self, text: str) -> Optional[int]:
1351
+ """Parse return code from plain text using structural patterns."""
1352
+ if not text:
1353
+ return None
1354
+
1355
+ # 1. Primary check: 'search r(N)' pattern (SMCL tag potentially stripped)
1356
+ matches = list(re.finditer(r'search r\((\d+)\)', text))
1357
+ if matches:
1358
+ try:
1359
+ return int(matches[-1].group(1))
1360
+ except Exception:
1361
+ pass
1362
+
1363
+ # 2. Secondary check: Standalone r(N); pattern
1364
+ # This appears at the end of command blocks
1365
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', text))
1366
+ if matches:
1367
+ try:
1368
+ return int(matches[-1].group(1))
1369
+ except Exception:
1370
+ pass
1371
+
1372
+ return None
1373
+
1374
+ def _parse_line_from_text(self, text: str) -> Optional[int]:
1375
+ match = re.search(r"line\s+(\d+)", text, re.IGNORECASE)
1376
+ if match:
1377
+ try:
1378
+ return int(match.group(1))
1379
+ except Exception:
1380
+ return None
1381
+ return None
1382
+
1383
+ def _read_log_backwards_until_error(self, path: str, max_bytes: int = 5_000_000) -> str:
1384
+ """
1385
+ Read log file backwards in chunks, stopping when we find {err} tags or reach the start.
1386
+
1387
+ This is more efficient and robust than reading huge fixed tails, as we only read
1388
+ what we need to find the error.
1389
+
1390
+ Args:
1391
+ path: Path to the log file
1392
+ max_bytes: Maximum total bytes to read (safety limit, default 5MB)
1393
+
1394
+ Returns:
1395
+ The relevant portion of the log containing the error and context
1396
+ """
1397
+ try:
1398
+ chunk_size = 50_000 # Read 50KB chunks at a time
1399
+ total_read = 0
1400
+ chunks = []
1401
+
1402
+ with open(path, 'rb') as f:
1403
+ # Get file size
1404
+ f.seek(0, os.SEEK_END)
1405
+ file_size = f.tell()
1406
+
1407
+ if file_size == 0:
1408
+ return ""
1409
+
1410
+ # Start from the end
1411
+ position = file_size
1412
+
1413
+ while position > 0 and total_read < max_bytes:
1414
+ # Calculate how much to read in this chunk
1415
+ read_size = min(chunk_size, position, max_bytes - total_read)
1416
+ position -= read_size
1417
+
1418
+ # Seek and read
1419
+ f.seek(position)
1420
+ chunk = f.read(read_size)
1421
+ chunks.insert(0, chunk)
1422
+ total_read += read_size
1423
+
1424
+ # Decode and check for error tags
1425
+ try:
1426
+ accumulated = b''.join(chunks).decode('utf-8', errors='replace')
1427
+
1428
+ # Check if we've found an error tag
1429
+ if '{err}' in accumulated:
1430
+ # Found it! Read one more chunk for context before the error
1431
+ if position > 0 and total_read < max_bytes:
1432
+ extra_read = min(chunk_size, position, max_bytes - total_read)
1433
+ position -= extra_read
1434
+ f.seek(position)
1435
+ extra_chunk = f.read(extra_read)
1436
+ chunks.insert(0, extra_chunk)
1437
+
1438
+ return b''.join(chunks).decode('utf-8', errors='replace')
1439
+
1440
+ except UnicodeDecodeError:
1441
+ # Continue reading if we hit a decode error (might be mid-character)
1442
+ continue
1443
+
1444
+ # Read everything we've accumulated
1445
+ return b''.join(chunks).decode('utf-8', errors='replace')
1446
+
1447
+ except Exception as e:
1448
+ logger.warning(f"Error reading log backwards: {e}")
1449
+ # Fallback to regular tail read
1450
+ return self._read_log_tail(path, 200_000)
1451
+
1452
+ def _read_log_tail_smart(self, path: str, rc: int, trace: bool = False) -> str:
1453
+ """
1454
+ Smart log tail reader that adapts based on whether an error occurred.
1455
+
1456
+ - If rc == 0: Read normal tail (20KB without trace, 200KB with trace)
1457
+ - If rc != 0: Search backwards dynamically to find the error
1458
+
1459
+ Args:
1460
+ path: Path to the log file
1461
+ rc: Return code from Stata
1462
+ trace: Whether trace mode was enabled
1463
+
1464
+ Returns:
1465
+ Relevant log content
1466
+ """
1467
+ if rc != 0:
1468
+ # Error occurred - search backwards for {err} tags
1469
+ return self._read_log_backwards_until_error(path)
1470
+ else:
1471
+ # Success - just read normal tail
1472
+ tail_size = 200_000 if trace else 20_000
1473
+ return self._read_log_tail(path, tail_size)
1474
+
1475
+ def _read_log_tail(self, path: str, max_chars: int) -> str:
1476
+ try:
1477
+ with open(path, "rb") as f:
1478
+ f.seek(0, os.SEEK_END)
1479
+ size = f.tell()
1480
+
1481
+ if size <= 0:
1482
+ return ""
1483
+ read_size = min(size, max_chars)
1484
+ f.seek(-read_size, os.SEEK_END)
1485
+ data = f.read(read_size)
1486
+ return data.decode("utf-8", errors="replace")
1487
+ except Exception:
1488
+ return ""
1489
+
1490
+ def _build_combined_log(
1491
+ self,
1492
+ tail: TailBuffer,
1493
+ path: str,
1494
+ rc: int,
1495
+ trace: bool,
1496
+ exc: Optional[Exception],
1497
+ ) -> str:
1498
+ tail_text = tail.get_value()
1499
+ log_tail = self._read_log_tail_smart(path, rc, trace)
1500
+ if log_tail and len(log_tail) > len(tail_text):
1501
+ tail_text = log_tail
1502
+ return (tail_text or "") + (f"\n{exc}" if exc else "")
1503
+
1504
+ def _truncate_command_output(
1505
+ self,
1506
+ result: CommandResponse,
1507
+ max_output_lines: Optional[int],
1508
+ ) -> CommandResponse:
1509
+ if max_output_lines is None or not result.stdout:
1510
+ return result
1511
+ lines = result.stdout.splitlines()
1512
+ if len(lines) <= max_output_lines:
1513
+ return result
1514
+ truncated_lines = lines[:max_output_lines]
1515
+ truncated_lines.append(
1516
+ f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)"
1517
+ )
1518
+ truncated_stdout = "\n".join(truncated_lines)
1519
+ if hasattr(result, "model_copy"):
1520
+ return result.model_copy(update={"stdout": truncated_stdout})
1521
+ return result.copy(update={"stdout": truncated_stdout})
1522
+
1523
+ def _run_plain_capture(self, code: str) -> str:
1524
+ """
1525
+ Run a Stata command while capturing output using a named SMCL log.
1526
+ This is the most reliable way to capture output (like return list)
1527
+ without interfering with user logs or being affected by stdout redirection issues.
1528
+ """
1529
+ if not self._initialized:
1530
+ self.init()
1531
+
1532
+ with self._exec_lock:
1533
+ hold_name = f"mcp_hold_{uuid.uuid4().hex[:8]}"
1534
+ # Hold results BEFORE opening the capture log
1535
+ self.stata.run(f"capture _return hold {hold_name}", echo=False)
1536
+
1537
+ try:
1538
+ with self._smcl_log_capture() as (log_name, smcl_path):
1539
+ # Restore results INSIDE the capture log so return list can see them
1540
+ self.stata.run(f"capture _return restore {hold_name}", echo=False)
1541
+ try:
1542
+ self.stata.run(code, echo=True)
1543
+ except Exception:
1544
+ pass
1545
+ except Exception:
1546
+ # Cleanup hold if log capture failed to open
1547
+ self.stata.run(f"capture _return drop {hold_name}", echo=False)
1548
+ content = ""
1549
+ smcl_path = None
1550
+ else:
1551
+ # Read SMCL content and convert to text
1552
+ content = self._read_smcl_file(smcl_path)
1553
+ # Remove the temp file
1554
+ self._safe_unlink(smcl_path)
1555
+
1556
+ return self._smcl_to_text(content)
1557
+
1558
+ def _count_do_file_lines(self, path: str) -> int:
1559
+ """
1560
+ Count the number of executable lines in a .do file for progress inference.
1561
+
1562
+ Blank lines and comment-only lines (starting with * or //) are ignored.
1563
+ """
1564
+ try:
1565
+ with open(path, "r", encoding="utf-8", errors="replace") as f:
1566
+ lines = f.read().splitlines()
1567
+ except Exception:
1568
+ return 0
1569
+
1570
+ total = 0
1571
+ for line in lines:
1572
+ s = line.strip()
1573
+ if not s:
1574
+ continue
1575
+ if s.startswith("*"):
1576
+ continue
1577
+ if s.startswith("//"):
1578
+ continue
1579
+ total += 1
1580
+ return total
1581
+
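A worked example of the counting rule, with an invented do-file:

#   * setup             <- comment, skipped
#   sysuse auto         <- counted
#                       <- blank, skipped
#   // fit the model    <- comment, skipped
#   regress price mpg   <- counted
# _count_do_file_lines(...) -> 2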
1582
+ def _smcl_to_text(self, smcl: str) -> str:
1583
+ """Convert simple SMCL markup into plain text for LLM-friendly help."""
1584
+ # First, keep inline directive content if present (e.g., {bf:word} -> word)
1585
+ cleaned = re.sub(r"\{[^}:]+:([^}]*)\}", r"\1", smcl)
1586
+ # Remove remaining SMCL brace commands like {smcl}, {vieweralsosee ...}, {txt}, {p}
1587
+ cleaned = re.sub(r"\{[^}]*\}", "", cleaned)
1588
+ # Normalize whitespace
1589
+ cleaned = cleaned.replace("\r", "")
1590
+ lines = [line.rstrip() for line in cleaned.splitlines()]
1591
+ return "\n".join(lines).strip()
1592
+
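Applied to an invented SMCL fragment, the two substitutions keep directive payloads and drop bare tags:

import re

smcl = "{smcl}\n{p}{bf:regress} fits a linear model{p_end}"
text = re.sub(r"\{[^}:]+:([^}]*)\}", r"\1", smcl)  # {bf:regress} -> regress
text = re.sub(r"\{[^}]*\}", "", text)              # drop {smcl}, {p}, {p_end}
print(text.strip())  # -> regress fits a linear model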
1593
+ def _extract_error_and_context(self, log_content: str, rc: int) -> Tuple[str, str]:
1594
+ """
1595
+ Extracts the error message and trace context using {err} SMCL tags.
1596
+ """
1597
+ if not log_content:
1598
+ return f"Stata error r({rc})", ""
1599
+
1600
+ lines = log_content.splitlines()
1601
+
1602
+ # Search backwards for the {err} tag
1603
+ for i in range(len(lines) - 1, -1, -1):
1604
+ line = lines[i]
1605
+ if '{err}' in line:
1606
+ # Found the (last) error line.
1607
+ # Walk backwards to find the start of the error block (consecutive {err} lines)
1608
+ start_idx = i
1609
+ while start_idx > 0 and '{err}' in lines[start_idx-1]:
1610
+ start_idx -= 1
1611
+
1612
+ # The full error message is the concatenation of all {err} lines in this block
1613
+ error_lines = []
1614
+ for j in range(start_idx, i + 1):
1615
+ error_lines.append(lines[j].strip())
1616
+
1617
+ clean_msg = " ".join(filter(None, error_lines)) or f"Stata error r({rc})"
1618
+
1619
+ # Capture everything from the start of the error block to the end
1620
+ context_str = "\n".join(lines[start_idx:])
1621
+ return clean_msg, context_str
1622
+
1623
+ # Fallback: grab the last 30 lines
1624
+ context_start = max(0, len(lines) - 30)
1625
+ context_str = "\n".join(lines[context_start:])
1626
+
1627
+ return f"Stata error r({rc})", context_str
1628
+
1629
+ def _exec_with_capture(self, code: str, echo: bool = True, trace: bool = False, cwd: Optional[str] = None) -> CommandResponse:
1630
+ if not self._initialized:
1631
+ self.init()
1632
+
1633
+ self._increment_command_idx()
1634
+ # Rewrite graph names with special characters to internal aliases
1635
+ code = self._maybe_rewrite_graph_name_in_command(code)
1636
+
1637
+ output_buffer = StringIO()
1638
+ error_buffer = StringIO()
1639
+ rc = 0
1640
+ sys_error = None
1641
+ error_envelope = None
1642
+ smcl_content = ""
1643
+ smcl_path = None
1644
+
1645
+ with self._exec_lock:
1646
+ try:
1647
+ from sfi import Scalar, SFIToolkit  # noqa: F401 -- imported to fail fast if SFI is unavailable
1648
+ with self._temp_cwd(cwd):
1649
+ # Create SMCL log for authoritative output capture
1650
+ # Use shorter unique path to avoid Windows path issues
1651
+ smcl_path = self._create_smcl_log_path(prefix="mcp_", max_hex=16, base_dir=cwd)
1652
+ log_name = self._make_smcl_log_name()
1653
+ self._open_smcl_log(smcl_path, log_name)
1654
+
1655
+ try:
1656
+ with self._redirect_io(output_buffer, error_buffer):
1657
+ try:
1658
+ if trace:
1659
+ self.stata.run("set trace on")
1660
+
1661
+ # Run the user code
1662
+ self.stata.run(code, echo=echo)
1663
+
1664
+ # Hold results IMMEDIATELY to prevent clobbering by cleanup
1665
+ self._hold_name = f"mcp_hold_{uuid.uuid4().hex[:8]}"
1666
+ self.stata.run(f"capture _return hold {self._hold_name}", echo=False)
1667
+
1668
+ finally:
1669
+ if trace:
1670
+ try:
1671
+ self.stata.run("set trace off")
1672
+ except Exception:
1673
+ pass
1674
+ finally:
1675
+ # Close SMCL log AFTER output redirection
1676
+ self._close_smcl_log(log_name)
1677
+ # Restore and capture results while still inside the lock
1678
+ self._restore_results_from_hold("_hold_name")
1679
+
1680
+ except Exception as e:
1681
+ sys_error = str(e)
1682
+ # Try to parse RC from exception message
1683
+ parsed_rc = self._parse_rc_from_text(sys_error)
1684
+ rc = parsed_rc if parsed_rc is not None else 1
1685
+
1686
+ # Read SMCL content as the authoritative source
1687
+ if smcl_path:
1688
+ smcl_content = self._read_smcl_file(smcl_path)
1689
+ # Clean up SMCL file
1690
+ self._safe_unlink(smcl_path)
1691
+
1692
+ stdout_content = output_buffer.getvalue()
1693
+ stderr_content = error_buffer.getvalue()
1694
+
1695
+ # If RC wasn't captured or is generic, try to parse from SMCL
1696
+ if rc in (0, 1, -1) and smcl_content:
1697
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
1698
+ if parsed_rc is not None and parsed_rc != 0:
1699
+ rc = parsed_rc
1700
+ elif rc == -1:
1701
+ rc = 0
1702
+
1703
+ # If stdout is empty but SMCL has content AND command succeeded, use SMCL as stdout
1704
+ # This handles cases where Stata writes to log but not to redirected stdout
1705
+ # For errors, we keep stdout empty and error info goes to ErrorEnvelope
1706
+ if rc == 0 and not stdout_content and smcl_content:
1707
+ # Convert SMCL to plain text for stdout
1708
+ stdout_content = self._smcl_to_text(smcl_content)
1709
+
1710
+ if rc != 0:
1711
+ if sys_error:
1712
+ msg = sys_error
1713
+ context = sys_error
1714
+ else:
1715
+ # Extract error from SMCL (authoritative source)
1716
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
1717
+
1718
+ error_envelope = ErrorEnvelope(
1719
+ message=msg,
1720
+ rc=rc,
1721
+ context=context,
1722
+ snippet=smcl_content[-800:] if smcl_content else (stdout_content + stderr_content)[-800:],
1723
+ smcl_output=smcl_content # Include raw SMCL for debugging
1724
+ )
1725
+ stderr_content = context
1726
+
1727
+ resp = CommandResponse(
1728
+ command=code,
1729
+ rc=rc,
1730
+ stdout=stdout_content,
1731
+ stderr=stderr_content,
1732
+ success=(rc == 0),
1733
+ error=error_envelope,
1734
+ log_path=smcl_path if smcl_path else None,
1735
+ smcl_output=smcl_content,
1736
+ )
1737
+
1738
+ # Capture results immediately after execution, INSIDE the lock
1739
+ try:
1740
+ self._last_results = self.get_stored_results(force_fresh=True)
1741
+ except Exception:
1742
+ self._last_results = None
1743
+
1744
+ return resp
1745
+
1746
+ def _exec_no_capture(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
1747
+ """Execute Stata code while leaving stdout/stderr alone."""
1748
+ if not self._initialized:
1749
+ self.init()
1750
+
1751
+ exc: Optional[Exception] = None
1752
+ ret_text: Optional[str] = None
1753
+ rc = 0
1754
+
1755
+ with self._exec_lock:
1756
+ try:
1757
+ from sfi import Scalar  # noqa: F401 -- ensures SFI is importable before running
1758
+ if trace:
1759
+ self.stata.run("set trace on")
1760
+ ret = self.stata.run(code, echo=echo)
1761
+ if isinstance(ret, str) and ret:
1762
+ ret_text = ret
1763
+ parsed_rc = self._parse_rc_from_text(ret_text)
1764
+ if parsed_rc is not None:
1765
+ rc = parsed_rc
1766
+
1767
+ except Exception as e:
1768
+ exc = e
1769
+ rc = 1
1770
+ finally:
1771
+ if trace:
1772
+ try:
1773
+ self.stata.run("set trace off")
1774
+ except Exception as e:
1775
+ logger.warning("Failed to turn off Stata trace mode: %s", e)
1776
+
1777
+ stdout = ""
1778
+ stderr = ""
1779
+ success = rc == 0 and exc is None
1780
+ error = None
1781
+ if not success:
1782
+ msg = str(exc) if exc else f"Stata error r({rc})"
1783
+ error = ErrorEnvelope(
1784
+ message=msg,
1785
+ rc=rc,
1786
+ command=code,
1787
+ stdout=ret_text,
1788
+ )
1789
+
1790
+ return CommandResponse(
1791
+ command=code,
1792
+ rc=rc,
1793
+ stdout=stdout,
1794
+ stderr=None,
1795
+ success=success,
1796
+ error=error,
1797
+ )
1798
+
1799
+ def _exec_no_capture_silent(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
1800
+ """Execute Stata code while suppressing stdout/stderr output."""
1801
+ if not self._initialized:
1802
+ self.init()
1803
+
1804
+ exc: Optional[Exception] = None
1805
+ ret_text: Optional[str] = None
1806
+ rc = 0
1807
+
1808
+ with self._exec_lock:
1809
+ try:
1810
+ from sfi import Scalar  # noqa: F401 -- ensures SFI is importable before running
1811
+ if trace:
1812
+ self.stata.run("set trace on")
1813
+ output_buf = StringIO()
1814
+ with redirect_stdout(output_buf), redirect_stderr(output_buf):
1815
+ ret = self.stata.run(code, echo=echo)
1816
+ if isinstance(ret, str) and ret:
1817
+ ret_text = ret
1818
+ # Try to parse RC from the output text.
1819
+ # Many Stata commands emit 'r(N);' on failure even when quietly.
1820
+ parsed_rc = self._parse_rc_from_text(ret_text)
1821
+ if parsed_rc is not None:
1822
+ rc = parsed_rc
1823
+ except Exception as e:
1824
+ exc = e
1825
+ rc = 1
1826
+ finally:
1827
+ if trace:
1828
+ try:
1829
+ self.stata.run("set trace off")
1830
+ except Exception as e:
1831
+ logger.warning("Failed to turn off Stata trace mode: %s", e)
1832
+
1833
+ stdout = ""
1834
+ stderr = ""
1835
+ success = rc == 0 and exc is None
1836
+ error = None
1837
+ if not success:
1838
+ msg = str(exc) if exc else f"Stata error r({rc})"
1839
+ error = ErrorEnvelope(
1840
+ message=msg,
1841
+ rc=rc,
1842
+ command=code,
1843
+ stdout=ret_text,
1844
+ )
1845
+
1846
+ return CommandResponse(
1847
+ command=code,
1848
+ rc=rc,
1849
+ stdout=stdout,
1850
+ stderr=None,
1851
+ success=success,
1852
+ error=error,
1853
+ )
1854
+
1855
+ def exec_lightweight(self, code: str) -> CommandResponse:
1856
+ """
1857
+ Executes a command using simple stdout redirection (no SMCL logs).
1858
+ Much faster on Windows because it avoids filesystem operations.
1859
+ LIMITED: Does not support error envelopes or complex return code parsing.
1860
+ """
1861
+ if not self._initialized:
1862
+ self.init()
1863
+
1864
+ code = self._maybe_rewrite_graph_name_in_command(code)
1865
+
1866
+ output_buffer = StringIO()
1867
+ error_buffer = StringIO()
1868
+ rc = 0
1869
+ exc = None
1870
+
1871
+ with self._exec_lock:
1872
+ with self._redirect_io(output_buffer, error_buffer):
1873
+ try:
1874
+ self.stata.run(code, echo=False)
1875
+ except Exception as e:
1876
+ exc = e
1877
+ rc = 1
1878
+
1879
+ stdout = output_buffer.getvalue()
1880
+ stderr = error_buffer.getvalue()
1881
+
1882
+ return CommandResponse(
1883
+ command=code,
1884
+ rc=rc,
1885
+ stdout=stdout,
1886
+ stderr=stderr if not exc else str(exc),
1887
+ success=(rc == 0),
1888
+ error=None
1889
+ )
1890
+
1891
+ async def run_command_streaming(
1892
+ self,
1893
+ code: str,
1894
+ *,
1895
+ notify_log: Callable[[str], Awaitable[None]],
1896
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
1897
+ echo: bool = True,
1898
+ trace: bool = False,
1899
+ max_output_lines: Optional[int] = None,
1900
+ cwd: Optional[str] = None,
1901
+ auto_cache_graphs: bool = False,
1902
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
1903
+ emit_graph_ready: bool = False,
1904
+ graph_ready_task_id: Optional[str] = None,
1905
+ graph_ready_format: str = "svg",
1906
+ ) -> CommandResponse:
1907
+ if not self._initialized:
1908
+ self.init()
1909
+
1910
+ code = self._maybe_rewrite_graph_name_in_command(code)
1911
+ auto_cache_graphs = auto_cache_graphs or emit_graph_ready
1912
+ total_lines = 0 # Commands (not do-files) do not have line-based progress
1913
+
1914
+ if cwd is not None and not os.path.isdir(cwd):
1915
+ return CommandResponse(
1916
+ command=code,
1917
+ rc=601,
1918
+ stdout="",
1919
+ stderr=None,
1920
+ success=False,
1921
+ error=ErrorEnvelope(
1922
+ message=f"cwd not found: {cwd}",
1923
+ rc=601,
1924
+ command=code,
1925
+ ),
1926
+ )
1927
+
1928
+ start_time = time.time()
1929
+ exc: Optional[Exception] = None
1930
+ smcl_content = ""
1931
+ smcl_path = None
1932
+
1933
+ # Setup streaming graph cache if enabled
1934
+ graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
1935
+
1936
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
1937
+
1938
+ # Create SMCL log path for authoritative output capture
1939
+ smcl_path = self._create_smcl_log_path(base_dir=cwd)
1940
+ smcl_log_name = self._make_smcl_log_name()
1941
+
1942
+ # Inform the MCP client immediately where to read/tail the output.
1943
+ await notify_log(json.dumps({"event": "log_path", "path": smcl_path}))
1944
+
1945
+ rc = -1
1946
+ path_for_stata = code.replace("\\", "/")
1947
+ command = path_for_stata
1948
+
1949
+ graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
1950
+
1951
+ # Increment AFTER capture so detected modifications are based on state BEFORE this command
1952
+ self._increment_command_idx()
1953
+
1954
+ graph_poll_state = [0.0]
1955
+
1956
+ async def on_chunk_for_graphs(_chunk: str) -> None:
1957
+ # Background the graph check so we don't block SMCL streaming or task completion
1958
+ asyncio.create_task(
1959
+ self._maybe_cache_graphs_on_chunk(
1960
+ graph_cache=graph_cache,
1961
+ emit_graph_ready=emit_graph_ready,
1962
+ notify_log=notify_log,
1963
+ graph_ready_task_id=graph_ready_task_id,
1964
+ graph_ready_format=graph_ready_format,
1965
+ graph_ready_initial=graph_ready_initial,
1966
+ last_check=graph_poll_state,
1967
+ )
1968
+ )
1969
+
1970
+ done = anyio.Event()
1971
+
1972
+ try:
1973
+ async with anyio.create_task_group() as tg:
1974
+ async def stream_smcl() -> None:
1975
+ try:
1976
+ await self._stream_smcl_log(
1977
+ smcl_path=smcl_path,
1978
+ notify_log=notify_log,
1979
+ done=done,
1980
+ on_chunk=on_chunk_for_graphs if graph_cache else None,
1981
+ )
1982
+ except Exception as exc:
1983
+ logger.debug("SMCL streaming failed: %s", exc)
1984
+
1985
+ tg.start_soon(stream_smcl)
1986
+
1987
+ if notify_progress is not None:
1988
+ if total_lines > 0:
1989
+ await notify_progress(0, float(total_lines), f"Executing command: 0/{total_lines}")
1990
+ else:
1991
+ await notify_progress(0, None, "Running command")
1992
+
1993
+ try:
1994
+ run_blocking = lambda: self._run_streaming_blocking(
1995
+ command=command,
1996
+ tee=tee,
1997
+ cwd=cwd,
1998
+ trace=trace,
1999
+ echo=echo,
2000
+ smcl_path=smcl_path,
2001
+ smcl_log_name=smcl_log_name,
2002
+ hold_attr="_hold_name_stream",
2003
+ require_smcl_log=True,
2004
+ )
2005
+ try:
2006
+ rc, exc = await anyio.to_thread.run_sync(
2007
+ run_blocking,
2008
+ abandon_on_cancel=True,
2009
+ )
2010
+ except TypeError:  # older anyio without the abandon_on_cancel keyword
2011
+ rc, exc = await anyio.to_thread.run_sync(run_blocking)
2012
+ except Exception as e:
2013
+ exc = e
2014
+ if rc in (-1, 0):
2015
+ rc = 1
2016
+ except get_cancelled_exc_class():
2017
+ self._request_break_in()
2018
+ await self._wait_for_stata_stop()
2019
+ raise
2020
+ finally:
2021
+ done.set()
2022
+ tee.close()
2023
+ except* Exception as exc_group:
2024
+ logger.debug("SMCL streaming task group failed: %s", exc_group)
2025
+
2026
+ # Read SMCL content as the authoritative source
2027
+ smcl_content = self._read_smcl_file(smcl_path)
2028
+
2029
+ if graph_cache:
2030
+ asyncio.create_task(
2031
+ self._cache_new_graphs(
2032
+ graph_cache,
2033
+ notify_progress=notify_progress,
2034
+ total_lines=total_lines,
2035
+ completed_label="Command",
2036
+ )
2037
+ )
2038
+
2039
+ combined = self._build_combined_log(tail, smcl_path, rc, trace, exc)
2040
+
2041
+ # Use SMCL content as primary source for RC detection
2042
+ if not exc or rc in (1, -1):
2043
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
2044
+ if parsed_rc is not None and parsed_rc != 0:
2045
+ rc = parsed_rc
2046
+ elif rc in (-1, 0, 1): # Also check text if rc is generic 1 or unset
2047
+ parsed_rc_text = self._parse_rc_from_text(combined)
2048
+ if parsed_rc_text is not None:
2049
+ rc = parsed_rc_text
2050
+ elif rc == -1:
2051
+ rc = 0 # Default to success if no error trace found
2052
+
2053
+ success = (rc == 0 and exc is None)
2054
+ stderr_final = None
2055
+ error = None
2056
+
2057
+ if not success:
2058
+ # Use SMCL as authoritative source for error extraction
2059
+ if smcl_content:
2060
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
2061
+ else:
2062
+ # Fallback to combined log
2063
+ msg, context = self._extract_error_and_context(combined, rc)
2064
+
2065
+ error = ErrorEnvelope(
2066
+ message=msg,
2067
+ context=context,
2068
+ rc=rc,
2069
+ command=command,
2070
+ log_path=log_path,
2071
+ snippet=smcl_content[-800:] if smcl_content else combined[-800:],
2072
+ smcl_output=smcl_content,
2073
+ )
2074
+ stderr_final = context
2075
+
2076
+ duration = time.time() - start_time
2077
+ logger.info(
2078
+ "stata.run(stream) rc=%s success=%s trace=%s duration_ms=%.2f code_preview=%s",
2079
+ rc,
2080
+ success,
2081
+ trace,
2082
+ duration * 1000,
2083
+ code.replace("\n", "\\n")[:120],
2084
+ )
2085
+
2086
+ result = CommandResponse(
2087
+ command=code,
2088
+ rc=rc,
2089
+ stdout="",
2090
+ stderr=stderr_final,
2091
+ log_path=log_path,
2092
+ success=success,
2093
+ error=error,
2094
+ smcl_output=smcl_content,
2095
+ )
2096
+
2097
+ if notify_progress is not None:
2098
+ await notify_progress(1, 1, "Finished")
2099
+
2100
+ return result
2101
+
2102
+ async def run_do_file_streaming(
2103
+ self,
2104
+ path: str,
2105
+ *,
2106
+ notify_log: Callable[[str], Awaitable[None]],
2107
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
2108
+ echo: bool = True,
2109
+ trace: bool = False,
2110
+ max_output_lines: Optional[int] = None,
2111
+ cwd: Optional[str] = None,
2112
+ auto_cache_graphs: bool = False,
2113
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
2114
+ emit_graph_ready: bool = False,
2115
+ graph_ready_task_id: Optional[str] = None,
2116
+ graph_ready_format: str = "svg",
2117
+ ) -> CommandResponse:
2118
+ effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
2119
+ if error_response is not None:
2120
+ return error_response
2121
+
2122
+ total_lines = self._count_do_file_lines(effective_path)
2123
+ executed_lines = 0
2124
+ last_progress_time = 0.0
2125
+ dot_prompt = re.compile(r"^\.\s+\S")
2126
+
2127
+ async def on_chunk_for_progress(chunk: str) -> None:
2128
+ nonlocal executed_lines, last_progress_time
2129
+ if total_lines <= 0 or notify_progress is None:
2130
+ return
2131
+ for line in chunk.splitlines():
2132
+ if dot_prompt.match(line):
2133
+ executed_lines += 1
2134
+ if executed_lines > total_lines:
2135
+ executed_lines = total_lines
2136
+
2137
+ now = time.monotonic()
2138
+ if executed_lines > 0 and (now - last_progress_time) >= 0.25:
2139
+ last_progress_time = now
2140
+ await notify_progress(
2141
+ float(executed_lines),
2142
+ float(total_lines),
2143
+ f"Executing do-file: {executed_lines}/{total_lines}",
2144
+ )
2145
+
2146
+ if not self._initialized:
2147
+ self.init()
2148
+
2149
+ auto_cache_graphs = auto_cache_graphs or emit_graph_ready
2150
+
2151
+ start_time = time.time()
2152
+ exc: Optional[Exception] = None
2153
+ smcl_content = ""
2154
+ smcl_path = None
2155
+
2156
+ graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
2157
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
2158
+
2159
+ base_dir = cwd or os.path.dirname(effective_path)
2160
+ smcl_path = self._create_smcl_log_path(base_dir=base_dir)
2161
+ smcl_log_name = self._make_smcl_log_name()
2162
+
2163
+ # Inform the MCP client immediately where to read/tail the output.
2164
+ await notify_log(json.dumps({"event": "log_path", "path": smcl_path}))
2165
+
2166
+ rc = -1
2167
+ graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
2168
+
2169
+ # Increment AFTER capture
2170
+ self._increment_command_idx()
2171
+
2172
+ graph_poll_state = [0.0]
2173
+
2174
+ async def on_chunk_for_graphs(_chunk: str) -> None:
2175
+ # Background the graph check so we don't block SMCL streaming or task completion
2176
+ asyncio.create_task(
2177
+ self._maybe_cache_graphs_on_chunk(
2178
+ graph_cache=graph_cache,
2179
+ emit_graph_ready=emit_graph_ready,
2180
+ notify_log=notify_log,
2181
+ graph_ready_task_id=graph_ready_task_id,
2182
+ graph_ready_format=graph_ready_format,
2183
+ graph_ready_initial=graph_ready_initial,
2184
+ last_check=graph_poll_state,
2185
+ )
2186
+ )
2187
+
2188
+ on_chunk_callback = on_chunk_for_progress
2189
+ if graph_cache:
2190
+ async def on_chunk_callback(chunk: str) -> None:
2191
+ await on_chunk_for_progress(chunk)
2192
+ await on_chunk_for_graphs(chunk)
2193
+
2194
+ done = anyio.Event()
2195
+
2196
+ try:
2197
+ async with anyio.create_task_group() as tg:
2198
+ async def stream_smcl() -> None:
2199
+ try:
2200
+ await self._stream_smcl_log(
2201
+ smcl_path=smcl_path,
2202
+ notify_log=notify_log,
2203
+ done=done,
2204
+ on_chunk=on_chunk_callback,
2205
+ )
2206
+ except Exception as exc:
2207
+ logger.debug("SMCL streaming failed: %s", exc)
2208
+
2209
+ tg.start_soon(stream_smcl)
2210
+
2211
+ if notify_progress is not None:
2212
+ if total_lines > 0:
2213
+ await notify_progress(0, float(total_lines), f"Executing do-file: 0/{total_lines}")
2214
+ else:
2215
+ await notify_progress(0, None, "Running do-file")
2216
+
2217
+ try:
2218
+ run_blocking = lambda: self._run_streaming_blocking(
2219
+ command=command,
2220
+ tee=tee,
2221
+ cwd=cwd,
2222
+ trace=trace,
2223
+ echo=echo,
2224
+ smcl_path=smcl_path,
2225
+ smcl_log_name=smcl_log_name,
2226
+ hold_attr="_hold_name_do",
2227
+ require_smcl_log=True,
2228
+ )
2229
+ try:
2230
+ rc, exc = await anyio.to_thread.run_sync(
2231
+ run_blocking,
2232
+ abandon_on_cancel=True,
2233
+ )
2234
+ except TypeError:  # older anyio without the abandon_on_cancel keyword
2235
+ rc, exc = await anyio.to_thread.run_sync(run_blocking)
2236
+ except Exception as e:
2237
+ exc = e
2238
+ if rc in (-1, 0):
2239
+ rc = 1
2240
+ except get_cancelled_exc_class():
2241
+ self._request_break_in()
2242
+ await self._wait_for_stata_stop()
2243
+ raise
2244
+ finally:
2245
+ done.set()
2246
+ tee.close()
2247
+ except* Exception as exc_group:
2248
+ logger.debug("SMCL streaming task group failed: %s", exc_group)
2249
+
2250
+ # Read SMCL content as the authoritative source
2251
+ smcl_content = self._read_smcl_file(smcl_path)
2252
+
2253
+ if graph_cache:
2254
+ asyncio.create_task(
2255
+ self._cache_new_graphs(
2256
+ graph_cache,
2257
+ notify_progress=notify_progress,
2258
+ total_lines=total_lines,
2259
+ completed_label="Do-file",
2260
+ )
2261
+ )
2262
+
2263
+ combined = self._build_combined_log(tail, log_path, rc, trace, exc)
2264
+
2265
+ # Use SMCL content as primary source for RC detection
2266
+ if not exc or rc in (1, -1):
2267
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
2268
+ if parsed_rc is not None and parsed_rc != 0:
2269
+ rc = parsed_rc
2270
+ elif rc in (-1, 0, 1):
2271
+ parsed_rc_text = self._parse_rc_from_text(combined)
2272
+ if parsed_rc_text is not None:
2273
+ rc = parsed_rc_text
2274
+ elif rc == -1:
2275
+ rc = 0 # Default to success if no error found
2276
+
2277
+ success = (rc == 0 and exc is None)
2278
+ stderr_final = None
2279
+ error = None
2280
+
2281
+ if not success:
2282
+ # Use SMCL as authoritative source for error extraction
2283
+ if smcl_content:
2284
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
2285
+ else:
2286
+ # Fallback to combined log
2287
+ msg, context = self._extract_error_and_context(combined, rc)
2288
+
2289
+ error = ErrorEnvelope(
2290
+ message=msg,
2291
+ context=context,
2292
+ rc=rc,
2293
+ command=command,
2294
+ log_path=log_path,
2295
+ snippet=smcl_content[-800:] if smcl_content else combined[-800:],
2296
+ smcl_output=smcl_content,
2297
+ )
2298
+ stderr_final = context
2299
+
2300
+ duration = time.time() - start_time
2301
+ logger.info(
2302
+ "stata.run(do stream) rc=%s success=%s trace=%s duration_ms=%.2f path=%s",
2303
+ rc,
2304
+ success,
2305
+ trace,
2306
+ duration * 1000,
2307
+ effective_path,
2308
+ )
2309
+
2310
+ result = CommandResponse(
2311
+ command=command,
2312
+ rc=rc,
2313
+ stdout="",
2314
+ stderr=stderr_final,
2315
+ log_path=log_path,
2316
+ success=success,
2317
+ error=error,
2318
+ smcl_output=smcl_content,
2319
+ )
2320
+
2321
+ if notify_progress is not None:
2322
+ if total_lines > 0:
2323
+ await notify_progress(float(total_lines), float(total_lines), f"Executing do-file: {total_lines}/{total_lines}")
2324
+ else:
2325
+ await notify_progress(1, 1, "Finished")
2326
+
2327
+ return result
2328
+
2329
+ def run_command_structured(self, code: str, echo: bool = True, trace: bool = False, max_output_lines: Optional[int] = None, cwd: Optional[str] = None) -> CommandResponse:
2330
+ """Runs a Stata command and returns a structured envelope.
2331
+
2332
+ Args:
2333
+ code: The Stata command to execute.
2334
+ echo: If True, the command itself is included in the output.
2335
+ trace: If True, enables trace mode for debugging.
2336
+ max_output_lines: If set, truncates stdout to this many lines (token efficiency).
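+ cwd: If set, the command executes with this working directory.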
2337
+ """
2338
+ result = self._exec_with_capture(code, echo=echo, trace=trace, cwd=cwd)
2339
+
2340
+ return self._truncate_command_output(result, max_output_lines)
2341
+
2342
+ def get_data(self, start: int = 0, count: int = 50) -> List[Dict[str, Any]]:
2343
+ """Returns valid JSON-serializable data."""
2344
+ if not self._initialized:
2345
+ self.init()
2346
+
2347
+ if count > self.MAX_DATA_ROWS:
2348
+ count = self.MAX_DATA_ROWS
2349
+
2350
+ with self._exec_lock:
2351
+ try:
2352
+ # Use pystata integration to retrieve data
2353
+ df = self.stata.pdataframe_from_data()
2354
+
2355
+ # Slice
2356
+ sliced = df.iloc[start : start + count]
2357
+
2358
+ # Convert to dict
2359
+ return sliced.to_dict(orient="records")
2360
+ except Exception as e:
2361
+ return [{"error": f"Failed to retrieve data: {e}"}]
2362
+
2363
+ def list_variables(self) -> List[Dict[str, str]]:
2364
+ """Returns list of variables with labels."""
2365
+ if not self._initialized:
2366
+ self.init()
2367
+
2368
+ # Use sfi directly for efficiency instead of materializing a full DataFrame
2369
+ from sfi import Data # type: ignore[import-not-found]
2370
+ vars_info = []
2371
+ with self._exec_lock:
2372
+ for i in range(Data.getVarCount()):
2373
+ var_index = i # 0-based
2374
+ name = Data.getVarName(var_index)
2375
+ label = Data.getVarLabel(var_index)
2376
+ type_str = Data.getVarType(var_index)  # normalized to str below
2377
+
2378
+ vars_info.append({
2379
+ "name": name,
2380
+ "label": label,
2381
+ "type": str(type_str),
2382
+ })
2383
+ return vars_info
2384
+
2385
+ def get_dataset_state(self) -> Dict[str, Any]:
2386
+ """Return basic dataset state without mutating the dataset."""
2387
+ if not self._initialized:
2388
+ self.init()
2389
+
2390
+ from sfi import Data, Macro # type: ignore[import-not-found]
2391
+
2392
+ with self._exec_lock:
2393
+ n = int(Data.getObsTotal())
2394
+ k = int(Data.getVarCount())
2395
+
2396
+ frame = "default"
2397
+ sortlist = ""
2398
+ changed = False
2399
+ try:
2400
+ frame = str(Macro.getGlobal("frame") or "default")
2401
+ except Exception:
2402
+ logger.debug("Failed to get 'frame' macro", exc_info=True)
2403
+ frame = "default"
2404
+ try:
2405
+ sortlist = str(Macro.getGlobal("sortlist") or "")
2406
+ except Exception:
2407
+ logger.debug("Failed to get 'sortlist' macro", exc_info=True)
2408
+ sortlist = ""
2409
+ try:
2410
+ changed = bool(int(float(Macro.getGlobal("changed") or "0")))
2411
+ except Exception:
2412
+ logger.debug("Failed to get 'changed' macro", exc_info=True)
2413
+ changed = False
2414
+
2415
+ return {"frame": frame, "n": n, "k": k, "sortlist": sortlist, "changed": changed}
2416
+
2417
+ def _require_data_in_memory(self) -> None:
2418
+ state = self.get_dataset_state()
2419
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
2420
+ # Stata empty dataset could still have k>0 n==0; treat that as ok.
2421
+ raise RuntimeError("No data in memory")
2422
+
2423
+ def _get_var_index_map(self) -> Dict[str, int]:
2424
+ from sfi import Data # type: ignore[import-not-found]
2425
+
2426
+ out: Dict[str, int] = {}
2427
+ with self._exec_lock:
2428
+ for i in range(int(Data.getVarCount())):
2429
+ try:
2430
+ out[str(Data.getVarName(i))] = i
2431
+ except Exception:
2432
+ continue
2433
+ return out
2434
+
2435
+ def list_variables_rich(self) -> List[Dict[str, Any]]:
2436
+ """Return variable metadata (name/type/label/format/valueLabel) without modifying the dataset."""
2437
+ if not self._initialized:
2438
+ self.init()
2439
+
2440
+ from sfi import Data # type: ignore[import-not-found]
2441
+
2442
+ vars_info: List[Dict[str, Any]] = []
2443
+ for i in range(int(Data.getVarCount())):
2444
+ name = str(Data.getVarName(i))
2445
+ label = None
2446
+ fmt = None
2447
+ vtype = None
2448
+ value_label = None
2449
+ try:
2450
+ label = Data.getVarLabel(i)
2451
+ except Exception:
2452
+ label = None
2453
+ try:
2454
+ fmt = Data.getVarFormat(i)
2455
+ except Exception:
2456
+ fmt = None
2457
+ try:
2458
+ vtype = Data.getVarType(i)
2459
+ except Exception:
2460
+ vtype = None
2461
+
2462
+ vars_info.append(
2463
+ {
2464
+ "name": name,
2465
+ "type": str(vtype) if vtype is not None else None,
2466
+ "label": label if label else None,
2467
+ "format": fmt if fmt else None,
2468
+ "valueLabel": value_label,
2469
+ }
2470
+ )
2471
+ return vars_info
2472
+
2473
+ @staticmethod
2474
+ def _is_stata_missing(value: Any) -> bool:
2475
+ if value is None:
2476
+ return True
2477
+ if isinstance(value, float):
2478
+ # Stata missing values typically show up as very large floats via sfi.Data.get
2479
+ return value > 8.0e307
2480
+ return False
2481
+
2482
+ def _normalize_cell(self, value: Any, *, max_chars: int) -> tuple[Any, bool]:
2483
+ if self._is_stata_missing(value):
2484
+ return ".", False
2485
+ if isinstance(value, str):
2486
+ if len(value) > max_chars:
2487
+ return value[:max_chars], True
2488
+ return value, False
2489
+ return value, False
2490
+
2491
+ def get_page(
2492
+ self,
2493
+ *,
2494
+ offset: int,
2495
+ limit: int,
2496
+ vars: List[str],
2497
+ include_obs_no: bool,
2498
+ max_chars: int,
2499
+ obs_indices: Optional[List[int]] = None,
2500
+ ) -> Dict[str, Any]:
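+ # Illustrative return shape (hypothetical values):
+ #   {"vars": ["_n", "price"], "rows": [[1, 4099], [2, 3799]],
+ #    "returned": 2, "truncated_cells": 0}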
2501
+ if not self._initialized:
2502
+ self.init()
2503
+
2504
+ from sfi import Data # type: ignore[import-not-found]
2505
+
2506
+ state = self.get_dataset_state()
2507
+ n = int(state.get("n", 0) or 0)
2508
+ k = int(state.get("k", 0) or 0)
2509
+ if k == 0 and n == 0:
2510
+ raise RuntimeError("No data in memory")
2511
+
2512
+ var_map = self._get_var_index_map()
2513
+ for v in vars:
2514
+ if v not in var_map:
2515
+ raise ValueError(f"Invalid variable: {v}")
2516
+
2517
+ if obs_indices is None:
2518
+ start = offset
2519
+ end = min(offset + limit, n)
2520
+ if start >= n:
2521
+ rows: list[list[Any]] = []
2522
+ returned = 0
2523
+ obs_list: list[int] = []
2524
+ else:
2525
+ obs_list = list(range(start, end))
2526
+ raw_rows = Data.get(var=vars, obs=obs_list)
2527
+ rows = raw_rows
2528
+ returned = len(rows)
2529
+ else:
2530
+ start = offset
2531
+ end = min(offset + limit, len(obs_indices))
2532
+ obs_list = obs_indices[start:end]
2533
+ raw_rows = Data.get(var=vars, obs=obs_list) if obs_list else []
2534
+ rows = raw_rows
2535
+ returned = len(rows)
2536
+
2537
+ out_vars = list(vars)
2538
+ out_rows: list[list[Any]] = []
2539
+ truncated_cells = 0
2540
+
2541
+ if include_obs_no:
2542
+ out_vars = ["_n"] + out_vars
2543
+
2544
+ for idx, raw in enumerate(rows):
2545
+ norm_row: list[Any] = []
2546
+ if include_obs_no:
2547
+ norm_row.append(int(obs_list[idx]) + 1)
2548
+ for cell in raw:
2549
+ norm, truncated = self._normalize_cell(cell, max_chars=max_chars)
2550
+ if truncated:
2551
+ truncated_cells += 1
2552
+ norm_row.append(norm)
2553
+ out_rows.append(norm_row)
2554
+
2555
+ return {
2556
+ "vars": out_vars,
2557
+ "rows": out_rows,
2558
+ "returned": returned,
2559
+ "truncated_cells": truncated_cells,
2560
+ }
2561
+
2562
+ def get_arrow_stream(
2563
+ self,
2564
+ *,
2565
+ offset: int,
2566
+ limit: int,
2567
+ vars: List[str],
2568
+ include_obs_no: bool,
2569
+ obs_indices: Optional[List[int]] = None,
2570
+ ) -> bytes:
2571
+ """
2572
+ Returns an Apache Arrow IPC stream (as bytes) for the requested data page.
2573
+ Uses Polars if available (faster), falls back to Pandas.
2574
+ """
2575
+ if not self._initialized:
2576
+ self.init()
2577
+
2578
+ import pyarrow as pa
2579
+ from sfi import Data # type: ignore[import-not-found]
2580
+
2581
+ use_polars = _get_polars_available()
2582
+ if use_polars:
2583
+ import polars as pl
2584
+ else:
2585
+ import pandas as pd
2586
+
2587
+ state = self.get_dataset_state()
2588
+ n = int(state.get("n", 0) or 0)
2589
+ k = int(state.get("k", 0) or 0)
2590
+ if k == 0 and n == 0:
2591
+ raise RuntimeError("No data in memory")
2592
+
2593
+ var_map = self._get_var_index_map()
2594
+ for v in vars:
2595
+ if v not in var_map:
2596
+ raise ValueError(f"Invalid variable: {v}")
2597
+
2598
+ # Determine observations to fetch
2599
+ if obs_indices is None:
2600
+ start = offset
2601
+ end = min(offset + limit, n)
2602
+ obs_list = list(range(start, end)) if start < n else []
2603
+ else:
2604
+ start = offset
2605
+ end = min(offset + limit, len(obs_indices))
2606
+ obs_list = obs_indices[start:end]
2607
+
2608
+ try:
2609
+ if not obs_list:
2610
+ # Empty schema-only table
2611
+ if use_polars:
2612
+ schema_cols = {}
2613
+ if include_obs_no:
2614
+ schema_cols["_n"] = pl.Int64
2615
+ for v in vars:
2616
+ schema_cols[v] = pl.Utf8
2617
+ table = pl.DataFrame(schema=schema_cols).to_arrow()
2618
+ else:
2619
+ columns = {}
2620
+ if include_obs_no:
2621
+ columns["_n"] = pa.array([], type=pa.int64())
2622
+ for v in vars:
2623
+ columns[v] = pa.array([], type=pa.string())
2624
+ table = pa.table(columns)
2625
+ else:
2626
+ # Fetch all data in one C-call
2627
+ raw_data = Data.get(var=vars, obs=obs_list, valuelabel=False)
2628
+
2629
+ if use_polars:
2630
+ df = pl.DataFrame(raw_data, schema=vars, orient="row")
2631
+ if include_obs_no:
2632
+ obs_nums = [i + 1 for i in obs_list]
2633
+ df = df.with_columns(pl.Series("_n", obs_nums, dtype=pl.Int64))
2634
+ df = df.select(["_n"] + vars)
2635
+ table = df.to_arrow()
2636
+ else:
2637
+ df = pd.DataFrame(raw_data, columns=vars)
2638
+ if include_obs_no:
2639
+ df.insert(0, "_n", [i + 1 for i in obs_list])
2640
+ table = pa.Table.from_pandas(df, preserve_index=False)
2641
+
2642
+ # Serialize to IPC Stream
2643
+ sink = pa.BufferOutputStream()
2644
+ with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
2645
+ writer.write_table(table)
2646
+
2647
+ return sink.getvalue().to_pybytes()
2648
+
2649
+ except Exception as e:
2650
+ raise RuntimeError(f"Failed to generate Arrow stream: {e}")
2651
+
2652
+ _FILTER_IDENT = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\b")
2653
+
2654
+ def _extract_filter_vars(self, filter_expr: str) -> List[str]:
2655
+ tokens = set(self._FILTER_IDENT.findall(filter_expr or ""))
2656
+ # Exclude Python keywords/literals that the filter translation may inject.
2657
+ exclude = {"and", "or", "not", "True", "False", "None"}
2658
+ var_map = self._get_var_index_map()
2659
+ vars_used = [t for t in tokens if t not in exclude and t in var_map]
2660
+ return sorted(vars_used)
2661
+
2662
+ def _compile_filter_expr(self, filter_expr: str) -> Any:
2663
+ expr = (filter_expr or "").strip()
2664
+ if not expr:
2665
+ raise ValueError("Empty filter")
2666
+
2667
+ # Stata boolean operators.
2668
+ expr = expr.replace("&", " and ").replace("|", " or ")
2669
+
2670
+ # Replace missing literal '.' (but not numeric decimals like 0.5).
2671
+ expr = re.sub(r"(?<![0-9])\.(?![0-9A-Za-z_])", "None", expr)
2672
+
2673
+ try:
2674
+ return compile(expr, "<filterExpr>", "eval")
2675
+ except Exception as e:
2676
+ raise ValueError(f"Invalid filter expression: {e}")
2677
+
2678
+ def validate_filter_expr(self, filter_expr: str) -> None:
2679
+ if not self._initialized:
2680
+ self.init()
2681
+ state = self.get_dataset_state()
2682
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
2683
+ raise RuntimeError("No data in memory")
2684
+
2685
+ vars_used = self._extract_filter_vars(filter_expr)
2686
+ if not vars_used:
2687
+ # still allow constant expressions like "1" or "True"
2688
+ self._compile_filter_expr(filter_expr)
2689
+ return
2690
+ self._compile_filter_expr(filter_expr)
2691
+
2692
+ def compute_view_indices(self, filter_expr: str, *, chunk_size: int = 5000) -> List[int]:
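+ # Hypothetical usage: compute_view_indices("price > 4000 | foreign == 1")
+ # returns the 0-based observation indices that satisfy the filter.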
2693
+ if not self._initialized:
2694
+ self.init()
2695
+
2696
+ from sfi import Data # type: ignore[import-not-found]
2697
+
2698
+ state = self.get_dataset_state()
2699
+ n = int(state.get("n", 0) or 0)
2700
+ k = int(state.get("k", 0) or 0)
2701
+ if k == 0 and n == 0:
2702
+ raise RuntimeError("No data in memory")
2703
+
2704
+ vars_used = self._extract_filter_vars(filter_expr)
2705
+ code = self._compile_filter_expr(filter_expr)
2706
+ _ = self._get_var_index_map()
2707
+
2708
+ is_string_vars = []
2709
+ if vars_used:
2710
+ from sfi import Variable # type: ignore
2711
+ is_string_vars = [Variable.isString(v) for v in vars_used]
2712
+
2713
+ indices: List[int] = []
2714
+ for start in range(0, n, chunk_size):
2715
+ end = min(start + chunk_size, n)
2716
+ obs_list = list(range(start, end))
2717
+ raw_rows = Data.get(var=vars_used, obs=obs_list) if vars_used else [[None] for _ in obs_list]
2718
+
2719
+ # Try Rust optimization for the chunk
2720
+ if vars_used and raw_rows:
2721
+ # Transpose rows to columns for Rust
2722
+ cols = []
2723
+ # Extract columns
2724
+ for j in range(len(vars_used)):
2725
+ col_data_list = [row[j] for row in raw_rows]
2726
+ if not is_string_vars[j]:
2727
+ import numpy as np
2728
+ col_data = np.array(col_data_list, dtype=np.float64)
2729
+ else:
2730
+ col_data = col_data_list
2731
+ cols.append(col_data)
2732
+
2733
+ rust_indices = compute_filter_indices(filter_expr, vars_used, cols, is_string_vars)
2734
+ if rust_indices is not None:
2735
+ indices.extend([int(obs_list[i]) for i in rust_indices])
2736
+ continue
2737
+
2738
+ for row_i, obs in enumerate(obs_list):
2739
+ env: Dict[str, Any] = {}
2740
+ if vars_used:
2741
+ for j, v in enumerate(vars_used):
2742
+ val = raw_rows[row_i][j]
2743
+ env[v] = None if self._is_stata_missing(val) else val
2744
+
2745
+ ok = False
2746
+ try:
2747
+ ok = bool(eval(code, {"__builtins__": {}}, env))
2748
+ except NameError as e:  # an identifier in the filter is not an available variable
2749
+ raise ValueError(f"Invalid filter: {e}")
2750
+ except Exception as e:
2751
+ raise ValueError(f"Invalid filter: {e}")
2752
+
2753
+ if ok:
2754
+ indices.append(int(obs))
2755
+
2756
+ return indices
2757
+
2758
+ def apply_sort(self, sort_spec: List[str]) -> None:
2759
+ """
2760
+ Apply sorting to the dataset using gsort.
2761
+
2762
+ Args:
2763
+ sort_spec: List of variables to sort by, with optional +/- prefix.
2764
+ e.g., ["-price", "+mpg"] sorts by price descending, then mpg ascending.
2765
+ No prefix is treated as ascending (+).
2766
+
2767
+ Raises:
2768
+ ValueError: If sort_spec is invalid or contains invalid variables
2769
+ RuntimeError: If no data in memory or sort command fails
2770
+ """
2771
+ if not self._initialized:
2772
+ self.init()
2773
+
2774
+ state = self.get_dataset_state()
2775
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
2776
+ raise RuntimeError("No data in memory")
2777
+
2778
+ if not sort_spec or not isinstance(sort_spec, list):
2779
+ raise ValueError("sort_spec must be a non-empty list")
2780
+
2781
+ # Validate all variables exist
2782
+ var_map = self._get_var_index_map()
2783
+ for spec in sort_spec:
2784
+ if not isinstance(spec, str) or not spec:
2785
+ raise ValueError(f"Invalid sort specification: {spec!r}")
2786
+ # Extract variable name (remove +/- prefix if present)
2787
+ varname = spec.lstrip("+-")
2788
+ if not varname:
2789
+ raise ValueError(f"Invalid sort specification: {spec!r}")
2790
+
2791
+ if varname not in var_map:
2792
+ raise ValueError(f"Variable not found: {varname}")
2793
+
2794
+ # Build gsort command
2795
+ # gsort uses - for descending, + or nothing for ascending
2796
+ gsort_args = []
2797
+ for spec in sort_spec:
2798
+ if spec.startswith("-") or spec.startswith("+"):
2799
+ gsort_args.append(spec)
2800
+ else:
2801
+ # No prefix means ascending, add + explicitly for clarity
2802
+ gsort_args.append(f"+{spec}")
2803
+
2804
+ cmd = f"gsort {' '.join(gsort_args)}"
2805
+
2806
+ try:
2807
+ # Sorting is hot-path for UI paging; use lightweight execution.
2808
+ result = self.exec_lightweight(cmd)
2809
+ if not result.success:
2810
+ error_msg = result.stderr or "Sort failed"
2811
+ raise RuntimeError(f"Failed to sort dataset: {error_msg}")
2812
+ except Exception as e:
2813
+ if isinstance(e, RuntimeError):
2814
+ raise
2815
+ raise RuntimeError(f"Failed to sort dataset: {e}")
2816
+
2817
+ def get_variable_details(self, varname: str) -> str:
2818
+ """Returns codebook/summary for a specific variable."""
2819
+ resp = self.run_command_structured(f"codebook {varname}", echo=True)
2820
+ if resp.success:
2821
+ return resp.stdout
2822
+ if resp.error:
2823
+ return resp.error.message
2824
+ return ""
2825
+
2826
+ def list_variables_structured(self) -> VariablesResponse:
2827
+ vars_info: List[VariableInfo] = []
2828
+ for item in self.list_variables():
2829
+ vars_info.append(
2830
+ VariableInfo(
2831
+ name=item.get("name", ""),
2832
+ label=item.get("label"),
2833
+ type=item.get("type"),
2834
+ )
2835
+ )
2836
+ return VariablesResponse(variables=vars_info)
2837
+
2838
+ def list_graphs(self, *, force_refresh: bool = False) -> List[str]:
2839
+ """Returns list of graphs in memory with TTL caching."""
2840
+ if not self._initialized:
2841
+ self.init()
2842
+
2843
+ import time
2844
+
2845
+ # Prevent recursive Stata calls - if we're already executing, return cached or empty
2846
+ if self._is_executing:
2847
+ with self._list_graphs_cache_lock:
2848
+ if self._list_graphs_cache is not None:
2849
+ logger.debug("Recursive list_graphs call prevented, returning cached value")
2850
+ return self._list_graphs_cache
2851
+ else:
2852
+ logger.debug("Recursive list_graphs call prevented, returning empty list")
2853
+ return []
2854
+
2855
+ # Check if cache is valid
2856
+ current_time = time.time()
2857
+ with self._list_graphs_cache_lock:
2858
+ if (not force_refresh and self._list_graphs_cache is not None and
2859
+ current_time - self._list_graphs_cache_time < self.LIST_GRAPHS_TTL):
2860
+ return self._list_graphs_cache
2861
+
2862
+ # Cache miss or expired, fetch fresh data
2863
+ with self._exec_lock:
2864
+ try:
2865
+ # Preservation of r() results is critical because this can be called
2866
+ # automatically after every user command (e.g., during streaming).
2867
+ import time
2868
+ hold_name = f"_mcp_ghold_{int(time.time() * 1000 % 1000000)}"
2869
+ self.stata.run(f"capture _return hold {hold_name}", echo=False)
2870
+
2871
+ try:
2872
+ self.stata.run("macro define mcp_graph_list \"\"", echo=False)
2873
+ self.stata.run("quietly graph dir, memory", echo=False)
2874
+ from sfi import Macro # type: ignore[import-not-found]
2875
+ self.stata.run("macro define mcp_graph_list `r(list)'", echo=False)
2876
+ graph_list_str = Macro.getGlobal("mcp_graph_list")
2877
+ finally:
2878
+ self.stata.run(f"capture _return restore {hold_name}", echo=False)
2879
+
2880
+ raw_list = graph_list_str.split() if graph_list_str else []
2881
+
2882
+ # Map internal Stata names back to user-facing names when we have an alias.
2883
+ reverse = getattr(self, "_graph_name_reverse", {})
2884
+ graph_list = [reverse.get(n, n) for n in raw_list]
2885
+
2886
+ result = graph_list
2887
+
2888
+ # Update cache
2889
+ with self._list_graphs_cache_lock:
2890
+ self._list_graphs_cache = result
2891
+ self._list_graphs_cache_time = time.time()
2892
+
2893
+ return result
2894
+
2895
+ except Exception as e:
2896
+ # On error, return cached result if available, otherwise empty list
2897
+ with self._list_graphs_cache_lock:
2898
+ if self._list_graphs_cache is not None:
2899
+ logger.warning(f"list_graphs failed, returning cached result: {e}")
2900
+ return self._list_graphs_cache
2901
+ logger.warning(f"list_graphs failed, no cache available: {e}")
2902
+ return []
2903
+
2904
+ def list_graphs_structured(self) -> GraphListResponse:
2905
+ names = self.list_graphs()
2906
+ active_name = names[-1] if names else None
2907
+ graphs = [GraphInfo(name=n, active=(n == active_name)) for n in names]
2908
+ return GraphListResponse(graphs=graphs)
2909
+
2910
+ def invalidate_list_graphs_cache(self) -> None:
2911
+ """Invalidate the list_graphs cache to force fresh data on next call."""
2912
+ with self._list_graphs_cache_lock:
2913
+ self._list_graphs_cache = None
2914
+ self._list_graphs_cache_time = 0
2915
+
2916
+ def export_graph(self, graph_name: Optional[str] = None, filename: Optional[str] = None, format: str = "pdf") -> str:
2917
+ """Exports graph to a temp file (pdf or png) and returns the path.
2918
+
2919
+ On Windows, PyStata can crash when exporting PNGs directly. For PNG on
2920
+ Windows, we save the graph to .gph and invoke the Stata executable in
2921
+ batch mode to export the PNG out-of-process.
2922
+ """
2923
+ import tempfile
2924
+
2925
+ fmt = (format or "pdf").strip().lower()
2926
+ if fmt not in {"pdf", "png", "svg"}:
2927
+ raise ValueError(f"Unsupported graph export format: {format}. Allowed: pdf, png, svg.")
2928
+
2929
+
2930
+ if not filename:
2931
+ suffix = f".{fmt}"
2932
+ with tempfile.NamedTemporaryFile(prefix="mcp_stata_", suffix=suffix, delete=False) as tmp:
2933
+ filename = tmp.name
2934
+ else:
2935
+ # Ensure fresh start
2936
+ if os.path.exists(filename):
2937
+ try:
2938
+ os.remove(filename)
2939
+ except Exception:
2940
+ pass
2941
+
2942
+ # Keep the user-facing path as a normal absolute Windows path
2943
+ user_filename = os.path.abspath(filename)
2944
+
2945
+ if fmt == "png" and os.name == "nt":
2946
+ # 1) Save graph to a .gph file from the embedded session
2947
+ with tempfile.NamedTemporaryFile(prefix="mcp_stata_graph_", suffix=".gph", delete=False) as gph_tmp:
2948
+ gph_path = gph_tmp.name
2949
+ gph_path_for_stata = gph_path.replace("\\", "/")
2950
+ # Make the target graph current, then save without name() (which isn't accepted there)
2951
+ if graph_name:
2952
+ self._exec_no_capture_silent(f'quietly graph display "{graph_name}"', echo=False)
2953
+ save_cmd = f'quietly graph save "{gph_path_for_stata}", replace'
2954
+ save_resp = self._exec_no_capture_silent(save_cmd, echo=False)
2955
+ if not save_resp.success:
2956
+ msg = save_resp.error.message if save_resp.error else f"graph save failed (rc={save_resp.rc})"
2957
+ raise RuntimeError(msg)
2958
+
2959
+ # 2) Prepare a do-file to export PNG externally
2960
+ user_filename_fwd = user_filename.replace("\\", "/")
2961
+ do_lines = [
2962
+ f'quietly graph use "{gph_path_for_stata}"',
2963
+ f'quietly graph export "{user_filename_fwd}", replace as(png)',
2964
+ "exit",
2965
+ ]
2966
+ with tempfile.NamedTemporaryFile(prefix="mcp_stata_export_", suffix=".do", delete=False, mode="w", encoding="ascii") as do_tmp:
2967
+ do_tmp.write("\n".join(do_lines))
2968
+ do_path = do_tmp.name
2969
+
2970
+ stata_exe = getattr(self, "_stata_exec_path", None)
2971
+ if not stata_exe or not os.path.exists(stata_exe):
2972
+ raise RuntimeError("Stata executable path unavailable for PNG export")
2973
+
2974
+ workdir = os.path.dirname(do_path) or None
2975
+ log_path = os.path.splitext(do_path)[0] + ".log"
2976
+
2977
+ cmd = [stata_exe, "/e", "do", do_path]
2978
+ try:
2979
+ completed = subprocess.run(
2980
+ cmd,
2981
+ capture_output=True,
2982
+ text=True,
2983
+ timeout=30,
2984
+ cwd=workdir,
2985
+ )
2986
+ except subprocess.TimeoutExpired:
2987
+ raise RuntimeError("External Stata export timed out")
2988
+ finally:
2989
+ try:
2990
+ os.remove(do_path)
2991
+ except Exception:
2992
+ # Ignore errors during temporary do-file cleanup (file may not exist or be locked)
2993
+ logger.warning("Failed to remove temporary do-file: %s", do_path, exc_info=True)
2994
+
2995
+ try:
2996
+ os.remove(gph_path)
2997
+ except Exception:
2998
+ logger.warning("Failed to remove temporary graph file: %s", gph_path, exc_info=True)
2999
+
3000
+ try:
3001
+ if os.path.exists(log_path):
3002
+ os.remove(log_path)
3003
+ except Exception:
3004
+ logger.warning("Failed to remove temporary log file: %s", log_path, exc_info=True)
3005
+
3006
+ if completed.returncode != 0:
3007
+ err = completed.stderr.strip() or completed.stdout.strip() or str(completed.returncode)
3008
+ raise RuntimeError(f"External Stata export failed: {err}")
3009
+
3010
+ else:
3011
+ # Stata prefers forward slashes in its command parser on Windows
3012
+ filename_for_stata = user_filename.replace("\\", "/")
3013
+
3014
+ if graph_name:
3015
+ resolved = self._resolve_graph_name_for_stata(graph_name)
3016
+ # Use display + export without name() for maximum compatibility.
3017
+ # name(NAME) often fails in PyStata for non-active graphs (r(693)).
3018
+ self._exec_no_capture_silent(f'quietly graph display "{resolved}"', echo=False)
3019
+
3020
+ cmd = f'quietly graph export "{filename_for_stata}", replace as({fmt})'
3021
+
3022
+ # Avoid stdout/stderr redirection for graph export because PyStata's
3023
+ # output thread can crash on Windows when we swap stdio handles.
3024
+ resp = self._exec_no_capture_silent(cmd, echo=False)
3025
+ if not resp.success:
3026
+ # Retry once after a short pause in case Stata had a transient file handle issue
3027
+ time.sleep(0.2)
3028
+ resp_retry = self._exec_no_capture_silent(cmd, echo=False)
3029
+ if not resp_retry.success:
3030
+ msg = resp_retry.error.message if resp_retry.error else f"graph export failed (rc={resp_retry.rc})"
3031
+ raise RuntimeError(msg)
3032
+ resp = resp_retry
3033
+
3034
+ if os.path.exists(user_filename):
3035
+ try:
3036
+ size = os.path.getsize(user_filename)
3037
+ if size == 0:
3038
+ raise RuntimeError(f"Graph export failed: produced empty file {user_filename}")
3039
+ if size > self.MAX_GRAPH_BYTES:
3040
+ raise RuntimeError(
3041
+ f"Graph export failed: file too large (> {self.MAX_GRAPH_BYTES} bytes): {user_filename}"
3042
+ )
3043
+ except Exception as size_err:
3044
+ # Clean up oversized or unreadable files
3045
+ try:
3046
+ os.remove(user_filename)
3047
+ except Exception:
3048
+ pass
3049
+ raise size_err
3050
+ return user_filename
3051
+
3052
+ # If file missing, it failed. Check output for details.
3053
+ msg = resp.error.message if resp.error else "graph export failed: file missing"
3054
+ raise RuntimeError(msg)
3055
+
3056
+ def get_help(self, topic: str, plain_text: bool = False) -> str:
3057
+ """Returns help text as Markdown (default) or plain text."""
3058
+ if not self._initialized:
3059
+ self.init()
3060
+
3061
+ with self._exec_lock:
3062
+ # Try to locate the .sthlp help file
3063
+ # We use 'capture' to avoid crashing if not found
3064
+ self.stata.run(f"capture findfile {topic}.sthlp")
3065
+
3066
+ # Retrieve the found path from r(fn)
3067
+ from sfi import Macro # type: ignore[import-not-found]
3068
+ self.stata.run("global mcp_help_file `r(fn)'")
3069
+ fn = Macro.getGlobal("mcp_help_file")
3070
+
3071
+ if fn and os.path.exists(fn):
3072
+ try:
3073
+ with open(fn, 'r', encoding='utf-8', errors='replace') as f:
3074
+ smcl = f.read()
3075
+ if plain_text:
3076
+ return self._smcl_to_text(smcl)
3077
+ try:
3078
+ return smcl_to_markdown(smcl, adopath=os.path.dirname(fn), current_file=os.path.splitext(os.path.basename(fn))[0])
3079
+ except Exception as parse_err:
3080
+ logger.warning("SMCL to Markdown failed, falling back to plain text: %s", parse_err)
3081
+ return self._smcl_to_text(smcl)
3082
+ except Exception as e:
3083
+ logger.warning("Help file read failed for %s: %s", topic, e)
3084
+
3085
+ # If no help file found, return a fallback message
3086
+ return f"Help file for '{topic}' not found."
3087
+
3088
+ def get_stored_results(self, force_fresh: bool = False) -> Dict[str, Any]:
3089
+ """Returns e() and r() results using SFI for maximum reliability."""
3090
+ if not force_fresh and self._last_results is not None:
3091
+ return self._last_results
3092
+
3093
+ if not self._initialized:
3094
+ self.init()
3095
+
3096
+ with self._exec_lock:
3097
+ # We must be extremely careful not to clobber r()/e() while fetching their names.
3098
+ # We use a hold to peek at the results.
3099
+ hold_name = f"mcp_peek_{uuid.uuid4().hex[:8]}"
3100
+ self.stata.run(f"capture _return hold {hold_name}", echo=False)
3101
+
3102
+ try:
3103
+ from sfi import Scalar, Macro
3104
+ results = {"r": {}, "e": {}}
3105
+
3106
+ for rclass in ["r", "e"]:
3107
+ # Restore with 'hold' to peek at results without losing them from the hold
3108
+ # Note: Stata 18+ supports 'restore ..., hold' which is ideal.
3109
+ self.stata.run(f"capture _return restore {hold_name}, hold", echo=False)
3110
+
3111
+ # Fetch names using backtick expansion (which we verified works better than colon)
3112
+ # and avoid leading underscores which were causing syntax errors with 'global'
3113
+ self.stata.run(f"macro define mcp_scnames `: {rclass}(scalars)'", echo=False)
3114
+ self.stata.run(f"macro define mcp_macnames `: {rclass}(macros)'", echo=False)
3115
+
3116
+ # 1. Capture Scalars
3117
+ names_str = Macro.getGlobal("mcp_scnames")
3118
+ if names_str:
3119
+ for name in names_str.split():
3120
+ try:
3121
+ val = Scalar.getValue(f"{rclass}({name})")
3122
+ results[rclass][name] = val
3123
+ except Exception:
3124
+ pass
3125
+
3126
+ # 2. Capture Macros (strings)
3127
+ macros_str = Macro.getGlobal("mcp_macnames")
3128
+ if macros_str:
3129
+ for name in macros_str.split():
3130
+ try:
3131
+ # Restore/Hold again to be safe before fetching each macro
3132
+ self.stata.run(f"capture _return restore {hold_name}, hold", echo=False)
3133
+ # Capture the string value into a macro
3134
+ self.stata.run(f"macro define mcp_mval `{rclass}({name})'", echo=False)
3135
+ val = Macro.getGlobal("mcp_mval")
3136
+ results[rclass][name] = val
3137
+ except Exception:
3138
+ pass
3139
+
3140
+ # Cleanup
3141
+ self.stata.run("macro drop mcp_scnames mcp_macnames mcp_mval", echo=False)
3142
+ self.stata.run(f"capture _return restore {hold_name}", echo=False) # Restore one last time to leave Stata in correct state
3143
+
3144
+ self._last_results = results
3145
+ return results
3146
+ except Exception as e:
3147
+ logger.error(f"SFI-based get_stored_results failed: {e}")
3148
+ # Try to clean up hold if we failed
3149
+ try:
3150
+ self.stata.run(f"capture _return drop {hold_name}", echo=False)
3151
+ except Exception:
3152
+ pass
3153
+ return {"r": {}, "e": {}}
3154
+
3155
+ def invalidate_graph_cache(self, graph_name: Optional[str] = None) -> None:
3156
+ """Invalidate cache for specific graph or all graphs.
3157
+
3158
+ Args:
3159
+ graph_name: Specific graph name to invalidate. If None, clears all cache.
3160
+ """
3161
+ self._initialize_cache()
3162
+
3163
+ with self._cache_lock:
3164
+ if graph_name is None:
3165
+ # Clear all cache
3166
+ self._preemptive_cache.clear()
3167
+ else:
3168
+ # Clear specific graph cache
3169
+ if graph_name in self._preemptive_cache:
3170
+ del self._preemptive_cache[graph_name]
3171
+ # Also clear hash if present
3172
+ hash_key = f"{graph_name}_hash"
3173
+ if hash_key in self._preemptive_cache:
3174
+ del self._preemptive_cache[hash_key]
3175
+
3176
+ def _initialize_cache(self) -> None:
3177
+ """Initialize cache in a thread-safe manner."""
3178
+ import tempfile
3179
+ import threading
3180
+ import os
3181
+ import uuid
3182
+
3183
+ with StataClient._cache_init_lock: # Use class-level lock
3184
+ if not hasattr(self, '_cache_initialized'):
3185
+ self._preemptive_cache = {}
3186
+ self._cache_access_times = {} # Track access times for LRU
3187
+ self._cache_sizes = {} # Track individual cache item sizes
3188
+ self._total_cache_size = 0 # Track total cache size in bytes
3189
+ # Use unique identifier to avoid conflicts
3190
+ unique_id = f"preemptive_cache_{uuid.uuid4().hex[:8]}_{os.getpid()}"
3191
+ self._preemptive_cache_dir = tempfile.mkdtemp(prefix=unique_id)
3192
+ self._cache_lock = threading.Lock()
3193
+ self._cache_initialized = True
3194
+
3195
+ # Register cleanup function
3196
+ import atexit
3197
+ atexit.register(self._cleanup_cache)
3198
+ else:
3199
+ # Cache already initialized, but directory might have been removed.
3200
+ if (not hasattr(self, '_preemptive_cache_dir') or
3201
+ not self._preemptive_cache_dir or
3202
+ not os.path.isdir(self._preemptive_cache_dir)):
3203
+ unique_id = f"preemptive_cache_{uuid.uuid4().hex[:8]}_{os.getpid()}"
3204
+ self._preemptive_cache_dir = tempfile.mkdtemp(prefix=unique_id)
3205
+
3206
+ def _cleanup_cache(self) -> None:
3207
+ """Clean up cache directory and files."""
3208
+ import os
3209
+ import shutil
3210
+
3211
+ if hasattr(self, '_preemptive_cache_dir') and self._preemptive_cache_dir:
3212
+ try:
3213
+ shutil.rmtree(self._preemptive_cache_dir, ignore_errors=True)
3214
+ except Exception:
3215
+ pass # Best effort cleanup
3216
+
3217
+ if hasattr(self, '_preemptive_cache'):
3218
+ self._preemptive_cache.clear()
3219
+
3220
+ def _evict_cache_if_needed(self, new_item_size: int = 0) -> None:
3221
+ """
3222
+ Evict least recently used cache items if cache size limits are exceeded.
3223
+
3224
+ NOTE: The caller is responsible for holding ``self._cache_lock`` while
3225
+ invoking this method, so that eviction and subsequent cache insertion
3226
+ (if any) occur within a single critical section.
3227
+ """
3228
+ import time
3229
+
3230
+ # Check if we need to evict based on count or size
3231
+ needs_eviction = (
3232
+ len(self._preemptive_cache) > StataClient.MAX_CACHE_SIZE or
3233
+ self._total_cache_size + new_item_size > StataClient.MAX_CACHE_BYTES
3234
+ )
3235
+
3236
+ if not needs_eviction:
3237
+ return
3238
+
3239
+ # Sort by access time (oldest first)
3240
+ items_by_access = sorted(
3241
+ self._cache_access_times.items(),
3242
+ key=lambda x: x[1]
3243
+ )
3244
+
3245
+ evicted_count = 0
3246
+ for graph_name, _access_time in items_by_access:
3247
+ if (len(self._preemptive_cache) < StataClient.MAX_CACHE_SIZE and
3248
+ self._total_cache_size + new_item_size <= StataClient.MAX_CACHE_BYTES):
3249
+ break
3250
+
3251
+ # Remove from cache
3252
+ if graph_name in self._preemptive_cache:
3253
+ cache_path = self._preemptive_cache[graph_name]
3254
+
3255
+ # Remove file
3256
+ try:
3257
+ if os.path.exists(cache_path):
3258
+ os.remove(cache_path)
3259
+ except Exception:
3260
+ pass
3261
+
3262
+ # Update tracking
3263
+ item_size = self._cache_sizes.get(graph_name, 0)
3264
+ del self._preemptive_cache[graph_name]
3265
+ del self._cache_access_times[graph_name]
3266
+ if graph_name in self._cache_sizes:
3267
+ del self._cache_sizes[graph_name]
3268
+ self._total_cache_size -= item_size
3269
+ evicted_count += 1
3270
+
3271
+ # Remove hash entry if exists
3272
+ hash_key = f"{graph_name}_hash"
3273
+ if hash_key in self._preemptive_cache:
3274
+ del self._preemptive_cache[hash_key]
3275
+
3276
+ if evicted_count > 0:
3277
+ logger.debug(f"Evicted {evicted_count} items from graph cache due to size limits")
3278
+
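+ # A minimal sketch of the locking discipline the note above implies: eviction
+ # and the subsequent insertion share one critical section (`payload`, `name`,
+ # and `cache_path` are placeholders):
+ #
+ #     item_size = len(payload)
+ #     with self._cache_lock:
+ #         self._evict_cache_if_needed(item_size)
+ #         self._preemptive_cache[name] = cache_path
+ #         self._cache_access_times[name] = time.time()
+ #         self._cache_sizes[name] = item_size
+ #         self._total_cache_size += item_size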
+ def _get_content_hash(self, data: bytes) -> str:
+     """Generate a content hash for cache validation."""
+     import hashlib
+     return hashlib.md5(data).hexdigest()
+
+ def _sanitize_filename(self, name: str) -> str:
+     """Sanitize a graph name for safe file system usage."""
+     # Remove or replace problematic characters (re is a module-level import)
+     safe_name = re.sub(r'[<>:"/\\|?*]', '_', name)
+     safe_name = re.sub(r'[^\w\-_.]', '_', safe_name)
+     # Limit the length to 100 characters
+     return safe_name[:100]
+
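+ # For example, _sanitize_filename('Graph 1: a/b') returns 'Graph_1__a_b':
+ # the first pass replaces reserved filesystem characters, the second pass
+ # replaces any remaining non-word characters, and the result is truncated
+ # to 100 characters.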
+ def _validate_graph_exists(self, graph_name: str) -> bool:
+     """Validate that the graph still exists in Stata."""
+     try:
+         # First get the graph list to verify existence
+         graph_list = self.list_graphs(force_refresh=True)
+         if graph_name not in graph_list:
+             return False
+
+         # Additional validation by attempting to display the graph
+         resolved = self._resolve_graph_name_for_stata(graph_name)
+         cmd = f'quietly graph display {resolved}'
+         resp = self._exec_no_capture_silent(cmd, echo=False)
+         return resp.success
+     except Exception:
+         return False
+
+ def _is_cache_valid(self, graph_name: str, cache_path: str) -> bool:
+     """Check if cached content is still valid using internal signatures."""
+     try:
+         if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:
+             return False
+
+         current_sig = self._get_graph_signature(graph_name)
+         cached_sig = self._preemptive_cache.get(f"{graph_name}_sig")
+
+         # A signature match means the entry is valid for the current command session
+         if cached_sig and cached_sig == current_sig:
+             return True
+
+         # Otherwise it is stale (needs a refresh for the new command)
+         return False
+     except Exception:
+         return False
+
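+ # Validation flow in brief: cache_graph_on_creation() stores f"{name}_sig"
+ # next to the cached file path, and _is_cache_valid() compares that stored
+ # value against _get_graph_signature(name). Any command that changes the
+ # signature therefore invalidates the cached SVG on the next lookup, without
+ # touching the file itself.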
+ def export_graphs_all(self) -> GraphExportResponse:
+     """Export all graphs to SVG files and return their paths."""
+     exports: List[GraphExport] = []
+     graph_names = self.list_graphs(force_refresh=True)
+
+     if not graph_names:
+         return GraphExportResponse(graphs=exports)
+
+     # tempfile, os, uuid, and time are already imported at module level.
+
+     # Initialize cache in a thread-safe manner
+     self._initialize_cache()
+
+     def _cache_keyed_svg_path(name: str) -> str:
+         import hashlib
+         safe_name = self._sanitize_filename(name)
+         suffix = hashlib.md5((name or "").encode("utf-8")).hexdigest()[:8]
+         return os.path.join(self._preemptive_cache_dir, f"{safe_name}_{suffix}.svg")
+
+     def _export_svg_bytes(name: str) -> bytes:
+         resolved = self._resolve_graph_name_for_stata(name)
+
+         temp_dir = tempfile.gettempdir()
+         safe_temp_name = self._sanitize_filename(name)
+         unique_filename = f"{safe_temp_name}_{uuid.uuid4().hex[:8]}_{os.getpid()}_{int(time.time())}.svg"
+         svg_path = os.path.join(temp_dir, unique_filename)
+         svg_path_for_stata = svg_path.replace("\\", "/")
+
+         try:
+             export_cmd = f'quietly graph export "{svg_path_for_stata}", name({resolved}) replace as(svg)'
+             export_resp = self._exec_no_capture_silent(export_cmd, echo=False)
+
+             if not export_resp.success:
+                 # Fall back to displaying the graph first, then exporting the active graph
+                 display_cmd = f'quietly graph display {resolved}'
+                 display_resp = self._exec_no_capture_silent(display_cmd, echo=False)
+                 if display_resp.success:
+                     export_cmd2 = f'quietly graph export "{svg_path_for_stata}", replace as(svg)'
+                     export_resp = self._exec_no_capture_silent(export_cmd2, echo=False)
+                 else:
+                     export_resp = display_resp
+
+             if export_resp.success and os.path.exists(svg_path) and os.path.getsize(svg_path) > 0:
+                 with open(svg_path, "rb") as f:
+                     return f.read()
+             error_msg = getattr(export_resp, 'error', 'Unknown error')
+             raise RuntimeError(f"Failed to export graph {name}: {error_msg}")
+         finally:
+             if os.path.exists(svg_path):
+                 try:
+                     os.remove(svg_path)
+                 except OSError as e:
+                     logger.warning(f"Failed to clean up temp file {svg_path}: {e}")
+
+     cached_graphs = {}
+     uncached_graphs = []
+     cache_errors = []
+
+     with self._cache_lock:
+         for name in graph_names:
+             if name in self._preemptive_cache:
+                 cached_path = self._preemptive_cache[name]
+                 if os.path.exists(cached_path) and os.path.getsize(cached_path) > 0:
+                     # Additional validation: check whether the graph content has changed
+                     if self._is_cache_valid(name, cached_path):
+                         cached_graphs[name] = cached_path
+                     else:
+                         uncached_graphs.append(name)
+                         # Remove the stale cache entry
+                         del self._preemptive_cache[name]
+                 else:
+                     uncached_graphs.append(name)
+                     # Remove the invalid cache entry
+                     if name in self._preemptive_cache:
+                         del self._preemptive_cache[name]
+             else:
+                 uncached_graphs.append(name)
+
+     for name, cached_path in cached_graphs.items():
+         try:
+             exports.append(GraphExport(name=name, file_path=cached_path))
+         except Exception as e:
+             cache_errors.append(f"Failed to read cached graph {name}: {e}")
+             # Fall back to uncached processing
+             uncached_graphs.append(name)
+
+     if uncached_graphs:
+         successful_graphs = []
+         failed_graphs = []
+         memory_results = {}
+
+         for name in uncached_graphs:
+             try:
+                 svg_data = _export_svg_bytes(name)
+                 memory_results[name] = svg_data
+                 successful_graphs.append(name)
+             except Exception as e:
+                 failed_graphs.append(name)
+                 cache_errors.append(f"Failed to export graph {name}: {e}")
+
+         for name in successful_graphs:
+             result = memory_results[name]
+             cache_path = _cache_keyed_svg_path(name)
+
+             try:
+                 with open(cache_path, 'wb') as f:
+                     f.write(result)
+
+                 # Update the cache with size tracking; evict and insert under a
+                 # single lock, per _evict_cache_if_needed's documented contract
+                 item_size = len(result)
+                 with self._cache_lock:
+                     self._evict_cache_if_needed(item_size)
+                     self._preemptive_cache[name] = cache_path
+                     # Store the content hash for validation
+                     self._preemptive_cache[f"{name}_hash"] = self._get_content_hash(result)
+                     # Store the signature so _is_cache_valid() can revalidate this entry
+                     self._preemptive_cache[f"{name}_sig"] = self._get_graph_signature(name)
+                     # Update tracking
+                     self._cache_access_times[name] = time.time()
+                     self._cache_sizes[name] = item_size
+                     self._total_cache_size += item_size
+
+                 exports.append(GraphExport(name=name, file_path=cache_path))
+             except Exception as e:
+                 cache_errors.append(f"Failed to cache graph {name}: {e}")
+                 # Still return the result even if caching fails:
+                 # create a temp file for immediate use
+                 safe_name = self._sanitize_filename(name)
+                 temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{uuid.uuid4().hex[:8]}.svg")
+                 with open(temp_path, 'wb') as f:
+                     f.write(result)
+                 exports.append(GraphExport(name=name, file_path=temp_path))
+
+     # Log any errors that occurred (module-level logger)
+     if cache_errors:
+         for error in cache_errors:
+             logger.warning(error)
+
+     return GraphExportResponse(graphs=exports)
+
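+ # Usage sketch (assumes `client` is an initialized StataClient with graphs
+ # in memory):
+ #
+ #     resp = client.export_graphs_all()
+ #     for g in resp.graphs:
+ #         print(g.name, g.file_path)  # path to the exported SVG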
+ def cache_graph_on_creation(self, graph_name: str) -> bool:
+     """Cache a graph immediately after creation.
+
+     Call this method right after creating a graph to pre-emptively cache it.
+     This avoids export wait time on later access.
+
+     Args:
+         graph_name: Name of the graph to cache
+
+     Returns:
+         True if caching succeeded, False otherwise
+     """
+     # os, time, and the module-level logger are available at module scope.
+
+     # Initialize cache in a thread-safe manner
+     self._initialize_cache()
+
+     # Invalidate the list_graphs cache since a new graph was created
+     self.invalidate_list_graphs_cache()
+
+     # Check whether the graph is already cached and still valid
+     with self._cache_lock:
+         if graph_name in self._preemptive_cache:
+             cache_path = self._preemptive_cache[graph_name]
+             if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
+                 if self._is_cache_valid(graph_name, cache_path):
+                     # Update access time for LRU
+                     self._cache_access_times[graph_name] = time.time()
+                     return True
+                 else:
+                     # Remove the stale cache entry and its tracking data
+                     del self._preemptive_cache[graph_name]
+                     if graph_name in self._cache_access_times:
+                         del self._cache_access_times[graph_name]
+                     if graph_name in self._cache_sizes:
+                         self._total_cache_size -= self._cache_sizes[graph_name]
+                         del self._cache_sizes[graph_name]
+                     # Remove hash and signature entries if present
+                     for meta_key in (f"{graph_name}_hash", f"{graph_name}_sig"):
+                         if meta_key in self._preemptive_cache:
+                             del self._preemptive_cache[meta_key]
+
+     try:
+         # Include the signature in the filename to force a client-side refresh
+         import hashlib
+         sig = self._get_graph_signature(graph_name)
+         safe_name = self._sanitize_filename(sig)
+         suffix = hashlib.md5((sig or "").encode("utf-8")).hexdigest()[:8]
+         cache_path = os.path.join(self._preemptive_cache_dir, f"{safe_name}_{suffix}.svg")
+         cache_path_for_stata = cache_path.replace("\\", "/")
+
+         resolved_graph_name = self._resolve_graph_name_for_stata(graph_name)
+         stata_name = resolved_graph_name.strip()
+
+         # The most reliable and efficient strategy for capturing distinct graphs
+         # in PyStata background tasks:
+         # 1. Make the specific graph active in the Stata engine via 'graph display'.
+         # 2. Export with the explicit name() option to ensure isolation.
+         # name() is used without quotes because it is an internal Stata name.
+         self._exec_no_capture_silent(f'graph display {stata_name}', echo=False)
+         export_cmd = f'graph export "{cache_path_for_stata}", name({stata_name}) replace as(svg)'
+         resp = self._exec_no_capture_silent(export_cmd, echo=False)
+
+         if resp.success and os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
+             # Read the data back to compute its content hash
+             with open(cache_path, 'rb') as f:
+                 data = f.read()
+
+             # Update the cache with size tracking; evict and insert under a
+             # single lock, per _evict_cache_if_needed's documented contract
+             item_size = len(data)
+             with self._cache_lock:
+                 self._evict_cache_if_needed(item_size)
+
+                 # Clear any old version of this graph from the path cache
+                 # (optional, but keeps the cache directory clean)
+                 old_path = self._preemptive_cache.get(graph_name)
+                 if old_path and old_path != cache_path:
+                     try:
+                         os.remove(old_path)
+                     except Exception:
+                         pass
+
+                 self._preemptive_cache[graph_name] = cache_path
+                 # Store the content hash for validation
+                 self._preemptive_cache[f"{graph_name}_hash"] = self._get_content_hash(data)
+                 # Store the signature for fast validation
+                 self._preemptive_cache[f"{graph_name}_sig"] = self._get_graph_signature(graph_name)
+                 # Update tracking
+                 self._cache_access_times[graph_name] = time.time()
+                 self._cache_sizes[graph_name] = item_size
+                 self._total_cache_size += item_size
+
+             return True
+         else:
+             error_msg = getattr(resp, 'error', 'Unknown error')
+             logger.warning(f"Failed to cache graph {graph_name}: {error_msg}")
+
+     except Exception as e:
+         logger.warning(f"Exception while caching graph {graph_name}: {e}")
+
+     return False
+
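+ # Usage sketch; `client` and its command-execution method name are assumed
+ # here for illustration, only cache_graph_on_creation() itself is defined
+ # above:
+ #
+ #     client.run_command('scatter mpg weight, name(scatter1, replace)')
+ #     client.cache_graph_on_creation("scatter1")  # pre-export to SVG
+ #     client.export_graphs_all()                  # later call hits the cache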
+ def run_do_file(self, path: str, echo: bool = True, trace: bool = False, max_output_lines: Optional[int] = None, cwd: Optional[str] = None) -> CommandResponse:
+     effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
+     if error_response is not None:
+         return error_response
+
+     if not self._initialized:
+         self.init()
+
+     start_time = time.time()
+     exc: Optional[Exception] = None
+     smcl_content = ""
+     smcl_path = None
+
+     _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
+     base_dir = cwd or os.path.dirname(effective_path)
+     smcl_path = self._create_smcl_log_path(base_dir=base_dir)
+     smcl_log_name = self._make_smcl_log_name()
+
+     rc = -1
+     try:
+         rc, exc = self._run_streaming_blocking(
+             command=command,
+             tee=tee,
+             cwd=cwd,
+             trace=trace,
+             echo=echo,
+             smcl_path=smcl_path,
+             smcl_log_name=smcl_log_name,
+             hold_attr="_hold_name_do_sync",
+             require_smcl_log=True,
+         )
+     except Exception as e:
+         exc = e
+         rc = 1
+     finally:
+         tee.close()
+
+     # Read SMCL content as the authoritative source
+     smcl_content = self._read_smcl_file(smcl_path)
+
+     combined = self._build_combined_log(tail, log_path, rc, trace, exc)
+
+     # Use SMCL content as the primary source for RC detection if not already captured
+     if rc == -1 and not exc:
+         parsed_rc = self._parse_rc_from_smcl(smcl_content)
+         if parsed_rc is not None:
+             rc = parsed_rc
+         else:
+             # Fall back to text parsing
+             parsed_rc = self._parse_rc_from_text(combined)
+             rc = parsed_rc if parsed_rc is not None else 0
+     elif exc and rc == 1:
+         # Try to parse a more specific RC from the exception message
+         parsed_rc = self._parse_rc_from_text(str(exc))
+         if parsed_rc is not None:
+             rc = parsed_rc
+
+     success = (rc == 0 and exc is None)
+     error = None
+
+     if not success:
+         # Use SMCL as the authoritative source for error extraction
+         if smcl_content:
+             msg, context = self._extract_error_from_smcl(smcl_content, rc)
+         else:
+             # Fall back to the combined log
+             msg, context = self._extract_error_and_context(combined, rc)
+
+         error = ErrorEnvelope(
+             message=msg,
+             rc=rc,
+             snippet=context,
+             command=command,
+             log_path=log_path,
+             smcl_output=smcl_content,
+         )
+
+     duration = time.time() - start_time
+     logger.info(
+         "stata.run(do) rc=%s success=%s trace=%s duration_ms=%.2f path=%s",
+         rc,
+         success,
+         trace,
+         duration * 1000,
+         effective_path,
+     )
+
+     return CommandResponse(
+         command=command,
+         rc=rc,
+         stdout="",
+         stderr=None,
+         log_path=log_path,
+         success=success,
+         error=error,
+         smcl_output=smcl_content,
+     )
+
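+ # Usage sketch (assumes `client` is an initialized StataClient and that
+ # analysis.do exists):
+ #
+ #     resp = client.run_do_file("analysis.do")
+ #     if not resp.success:
+ #         # resp.error is an ErrorEnvelope carrying rc, message, and a snippet
+ #         print(resp.error.rc, resp.error.message)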
+ def load_data(self, source: str, clear: bool = True, max_output_lines: Optional[int] = None) -> CommandResponse:
+     src = source.strip()
+     clear_suffix = ", clear" if clear else ""
+
+     if src.startswith(("sysuse ", "webuse ", "use ")):
+         # Already a complete load command; just append the clear option
+         cmd = f"{src}{clear_suffix}"
+     elif "://" in src or src.endswith(".dta") or os.path.sep in src:
+         # Looks like a URL or a file path
+         cmd = f'use "{src}"{clear_suffix}'
+     else:
+         # Bare name: assume a shipped example dataset
+         cmd = f"sysuse {src}{clear_suffix}"
+
+     result = self._exec_with_capture(cmd, echo=True, trace=False)
+     return self._truncate_command_output(result, max_output_lines)
+
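+ # Examples of the source-to-command mapping above:
+ #     "auto"                        -> sysuse auto, clear
+ #     "use mydata"                  -> use mydata, clear
+ #     "data/panel.dta"              -> use "data/panel.dta", clear
+ #     "https://example.com/x.dta"   -> use "https://example.com/x.dta", clear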
+ def codebook(self, varname: str, trace: bool = False, max_output_lines: Optional[int] = None) -> CommandResponse:
+     result = self._exec_with_capture(f"codebook {varname}", trace=trace)
+     return self._truncate_command_output(result, max_output_lines)