mcp-stata 1.22.1__cp311-abi3-macosx_11_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4710 @@
1
+ from __future__ import annotations
2
+ import asyncio
3
+ import io
4
+ import inspect
5
+ import json
6
+ import logging
7
+ import os
8
+ import pathlib
9
+ import platform
10
+ import re
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import threading
15
+ import time
16
+ import uuid
17
+ import functools
18
+ from contextlib import contextmanager, redirect_stdout, redirect_stderr
19
+ from importlib.metadata import PackageNotFoundError, version
20
+ from io import StringIO
21
+ from typing import Any, Awaitable, Callable, Dict, Generator, List, Optional, Tuple
22
+
23
+ import anyio
24
+ from anyio import get_cancelled_exc_class
25
+
26
+ from .discovery import find_stata_candidates
27
+ from .config import MAX_LIMIT
28
+ from .models import (
29
+ CommandResponse,
30
+ ErrorEnvelope,
31
+ GraphExport,
32
+ GraphExportResponse,
33
+ GraphInfo,
34
+ GraphListResponse,
35
+ VariableInfo,
36
+ VariablesResponse,
37
+ )
38
+ from .smcl.smcl2html import smcl_to_markdown
39
+ from .streaming_io import FileTeeIO, TailBuffer
40
+ from .graph_detector import StreamingGraphCache
41
+ from .native_ops import fast_scan_log, compute_filter_indices
42
+ from .utils import get_writable_temp_dir, register_temp_file, register_temp_dir, is_windows
43
+
44
+ logger = logging.getLogger("mcp_stata")
45
+
46
+ _POLARS_AVAILABLE: Optional[bool] = None
47
+ _GRAPH_NAME_PATTERN = re.compile(r"name\(\s*(\"[^\"]+\"|'[^']+'|[^,\)\s]+)", re.IGNORECASE)
48
+
49
+ def _check_polars_available() -> bool:
50
+ """
51
+ Check if Polars can be safely imported.
52
+ Must detect problematic platforms BEFORE attempting import,
53
+ since the crash is a fatal signal, not a catchable exception.
54
+ """
55
+ if sys.platform == "win32" and platform.machine().lower() in ("arm64", "aarch64"):
56
+ return False
57
+
58
+ try:
59
+ import polars # noqa: F401
60
+ return True
61
+ except ImportError:
62
+ return False
63
+
64
+
65
def _get_polars_available() -> bool:
    """Lazily compute and memoize whether Polars can be imported."""
    global _POLARS_AVAILABLE
    if _POLARS_AVAILABLE is not None:
        return _POLARS_AVAILABLE
    _POLARS_AVAILABLE = _check_polars_available()
    return _POLARS_AVAILABLE
70
+
71
# ============================================================================
# MODULE-LEVEL DISCOVERY CACHE
# ============================================================================
# This cache ensures Stata discovery runs exactly once per process lifetime
# Guards all of the module-level discovery state below.
_discovery_lock = threading.Lock()
# Highest-priority discovered candidate, or None until discovery succeeds.
_discovery_result: Optional[Tuple[str, str]] = None  # (path, edition)
# Full ordered candidate list returned by find_stata_candidates().
_discovery_candidates: Optional[List[Tuple[str, str]]] = None
# True once a discovery attempt has been made, regardless of outcome.
_discovery_attempted = False
# Exception from the first failed attempt; re-raised (wrapped) on later calls.
_discovery_error: Optional[Exception] = None
80
+
81
+
82
def _get_discovery_candidates() -> List[Tuple[str, str]]:
    """
    Get ordered discovery candidates, running discovery only once.

    Thread-safe: all reads/writes of the module-level cache happen under
    _discovery_lock. A failed first attempt is cached and re-raised on
    every subsequent call rather than retried.

    Returns:
        List of (stata_executable_path, edition) ordered by preference.

    Raises:
        RuntimeError: If Stata discovery fails
    """
    global _discovery_result, _discovery_candidates, _discovery_attempted, _discovery_error

    with _discovery_lock:
        # If we've already successfully discovered Stata, return cached result
        if _discovery_result is not None:
            return _discovery_candidates or [_discovery_result]

        # Candidate list may exist even without a single "best" result.
        if _discovery_candidates is not None:
            return _discovery_candidates

        # If we've already attempted and failed, re-raise the cached error
        if _discovery_attempted and _discovery_error is not None:
            raise RuntimeError(f"Stata binary not found: {_discovery_error}") from _discovery_error

        # This is the first attempt - run discovery
        _discovery_attempted = True

        try:
            # Log environment state once at first discovery
            env_path = os.getenv("STATA_PATH")
            if env_path:
                logger.info("STATA_PATH env provided (raw): %s", env_path)
            else:
                logger.info("STATA_PATH env not set; attempting auto-discovery")

            # Run discovery
            candidates = find_stata_candidates()

            # Cache the successful result
            _discovery_candidates = candidates
            if candidates:
                _discovery_result = candidates[0]
                logger.info("Discovery found Stata at: %s (%s)", _discovery_result[0], _discovery_result[1])
            else:
                # Normalized into the FileNotFoundError handler below so the
                # empty-candidates case is cached like any other failure.
                raise FileNotFoundError("No Stata candidates discovered")

            return candidates

        except FileNotFoundError as e:
            _discovery_error = e
            raise RuntimeError(f"Stata binary not found: {e}") from e
        except PermissionError as e:
            _discovery_error = e
            raise RuntimeError(
                f"Stata binary is not executable: {e}. "
                "Point STATA_PATH directly to the Stata binary (e.g., .../Contents/MacOS/stata-mp)."
            ) from e
139
+
140
+
141
def _get_discovered_stata() -> Tuple[str, str]:
    """Return the highest-priority discovered (path, edition) candidate.

    Kept as a thin wrapper over _get_discovery_candidates() so existing
    callers of the original API continue to work unchanged.
    """
    found = _get_discovery_candidates()
    if found:
        return found[0]
    raise RuntimeError("Stata binary not found: no candidates discovered")
149
+
150
+
151
class StataClient:
    """Client wrapping an embedded Stata session for the mcp_stata server.

    Serializes command execution behind a lock, captures output via named
    SMCL logs, and caches exported graphs with the size limits below.
    """

    # Class-level defaults; instances overwrite most of these in __new__/__init__.
    _initialized = False
    _exec_lock: threading.RLock  # NOTE: instances assign threading.RLock() (reentrant)
    _cache_init_lock = threading.Lock()  # Class-level lock for cache initialization
    _is_executing = False  # Flag to prevent recursive Stata calls
    MAX_DATA_ROWS = MAX_LIMIT
    MAX_GRAPH_BYTES = 50 * 1024 * 1024  # Maximum graph exports (~50MB)
    MAX_CACHE_SIZE = 100  # Maximum number of graphs to cache
    MAX_CACHE_BYTES = 500 * 1024 * 1024  # Maximum cache size in bytes (~500MB)
    LIST_GRAPHS_TTL = 0.075  # TTL for list_graphs cache (75ms)
161
+
162
    def __init__(self):
        """Initialize per-instance execution, logging, and graph-cache state.

        NOTE(review): __new__ below assigns nearly the same attributes, so
        each normal construction initializes twice. Presumably defensive —
        confirm intent before consolidating either copy.
        """
        # Reentrant lock: internal helpers run Stata while already holding it.
        self._exec_lock = threading.RLock()
        self._is_executing = False
        self._command_idx = 0  # Counter for user-initiated commands
        self._initialized = False
        self._persistent_log_path = None
        self._persistent_log_name = None
        self._last_emitted_graph_signatures: Dict[str, str] = {}
        self._graph_signature_cache: Dict[str, str] = {}
        self._graph_signature_cache_cmd_idx: Optional[int] = None
        self._last_results = None
        self._list_graphs_cache = None
        self._list_graphs_cache_time = 0
        self._list_graphs_cache_lock = threading.Lock()
        self._graph_name_aliases: Dict[str, str] = {}
        self._graph_name_reverse: Dict[str, str] = {}
        # Local import; the module top level also imports from .graph_detector.
        from .graph_detector import GraphCreationDetector
        self._graph_detector = GraphCreationDetector(self)
180
+
181
    def __new__(cls):
        """Allocate the instance and pre-seed all attributes used elsewhere.

        NOTE(review): duplicates __init__'s assignments (except
        _last_emitted_graph_signatures), guaranteeing a usable object even
        when __init__ is bypassed — confirm before removing either copy.
        """
        inst = super(StataClient, cls).__new__(cls)
        inst._exec_lock = threading.RLock()
        inst._is_executing = False
        inst._command_idx = 0
        inst._initialized = False
        inst._persistent_log_path = None
        inst._persistent_log_name = None
        inst._graph_signature_cache = {}
        inst._graph_signature_cache_cmd_idx = None
        inst._last_results = None
        inst._list_graphs_cache = None
        inst._list_graphs_cache_time = 0
        inst._list_graphs_cache_lock = threading.Lock()
        inst._graph_name_aliases = {}
        inst._graph_name_reverse = {}
        # Local import; the module top level also imports from .graph_detector.
        from .graph_detector import GraphCreationDetector
        inst._graph_detector = GraphCreationDetector(inst)
        return inst
200
+
201
+ def _increment_command_idx(self) -> int:
202
+ """Increment and return the command counter."""
203
+ self._command_idx += 1
204
+ self._graph_signature_cache = {}
205
+ self._graph_signature_cache_cmd_idx = self._command_idx
206
+ return self._command_idx
207
+
208
+ @contextmanager
209
+ def _redirect_io(self, out_buf, err_buf):
210
+ """Safely redirect stdout/stderr for the duration of a Stata call."""
211
+ backup_stdout, backup_stderr = sys.stdout, sys.stderr
212
+ sys.stdout, sys.stderr = out_buf, err_buf
213
+ try:
214
+ yield
215
+ finally:
216
+ sys.stdout, sys.stderr = backup_stdout, backup_stderr
217
+
218
+
219
+ @staticmethod
220
+ def _stata_quote(value: str) -> str:
221
+ """Return a Stata double-quoted string literal for value."""
222
+ # Stata uses doubled quotes to represent a quote character inside a string.
223
+ v = (value or "")
224
+ v = v.replace('"', '""')
225
+ # Use compound double quotes to avoid tokenization issues with spaces and
226
+ # punctuation in contexts like graph names.
227
+ return f'`"{v}"\''
228
+
229
+ @contextmanager
230
+ def _redirect_io_streaming(self, out_stream, err_stream):
231
+ backup_stdout, backup_stderr = sys.stdout, sys.stderr
232
+ sys.stdout, sys.stderr = out_stream, err_stream
233
+ try:
234
+ yield
235
+ finally:
236
+ sys.stdout, sys.stderr = backup_stdout, backup_stderr
237
+
238
+ @staticmethod
239
+ def _safe_unlink(path: str) -> None:
240
+ if not path:
241
+ return
242
+ try:
243
+ if os.path.exists(path):
244
+ os.unlink(path)
245
+ except Exception:
246
+ pass
247
+
248
+ def _create_smcl_log_path(
249
+ self,
250
+ *,
251
+ prefix: str = "mcp_smcl_",
252
+ max_hex: Optional[int] = None,
253
+ base_dir: Optional[str] = None,
254
+ ) -> str:
255
+ hex_id = uuid.uuid4().hex if max_hex is None else uuid.uuid4().hex[:max_hex]
256
+ # Use provided base_dir if any, otherwise fall back to validated temp dir
257
+ base = pathlib.Path(base_dir) if base_dir else pathlib.Path(get_writable_temp_dir())
258
+ smcl_path = base / f"{prefix}{hex_id}.smcl"
259
+ register_temp_file(smcl_path)
260
+ self._safe_unlink(str(smcl_path))
261
+ return str(smcl_path)
262
+
263
+ @staticmethod
264
+ def _make_smcl_log_name() -> str:
265
+ return f"_mcp_smcl_{uuid.uuid4().hex[:8]}"
266
+
267
    def _run_internal(self, code: str, echo: bool = False) -> str:
        """Run Stata code while strictly ensuring NO output reaches stdout.

        Lazily initializes the embedded session, serializes on the execution
        lock, and routes both stdout and stderr to sys.stderr for the
        duration of the call so nothing is written to the real stdout.
        """
        if not self._initialized:
            self.init()
        with self._exec_lock:
            with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
                return self.stata.run(code, echo=echo)
274
+
275
    def _open_smcl_log(self, smcl_path: str, log_name: str, *, quiet: bool = False, append: bool = False) -> bool:
        """Open a named SMCL log at *smcl_path*; return True on success.

        Tries, in order: (1) a named log, after closing any prior log of the
        same name; (2) the same named log after closing all open logs; (3) an
        unnamed log as a last resort. Sets self._last_smcl_log_named so
        _close_smcl_log knows which form to close later.
        """
        # Stata accepts forward slashes on every platform.
        path_for_stata = smcl_path.replace("\\", "/")
        mode = "append" if append else "replace"
        base_cmd = f"log using \"{path_for_stata}\", {mode} smcl name({log_name})"

        # In multi-threaded environments (like pytest-xdist), we must be extremely
        # careful with the singleton Stata instance.
        from sfi import Scalar

        try:
            # Bundle both close and open to minimize roundtrips
            # Use a unique scalar to capture the RC of the log using command
            log_rc_scalar = f"_mcp_log_rc_{uuid.uuid4().hex[:8]}"
            bundle = (
                f"capture quietly log close {log_name}\n"
                f"capture {'quietly ' if quiet else ''}{base_cmd}\n"
                f"scalar {log_rc_scalar} = _rc"
            )
            logger.debug(f"Opening SMCL log with bundle: {bundle}")
            self._run_internal(bundle, echo=False)

            try:
                rc_val = Scalar.getValue(log_rc_scalar)
                logger.debug(f"Log RC: {rc_val}")
                # Clean up scalar
                self._run_internal(f"capture scalar drop {log_rc_scalar}", echo=False)
                if rc_val == 0:
                    self._last_smcl_log_named = True
                    return True
            except Exception as e:
                logger.debug(f"Failed to get log scalar {log_rc_scalar}: {e}")
                pass

            # If still not open, try clearing other logs and retry
            log_rc_scalar = f"_mcp_log_rc_retry_{uuid.uuid4().hex[:8]}"
            bundle = (
                "capture quietly log close\n"
                f"capture {'quietly ' if quiet else ''}{base_cmd}\n"
                f"scalar {log_rc_scalar} = _rc"
            )
            logger.debug(f"Retrying SMCL log with bundle: {bundle}")
            self._run_internal(bundle, echo=False)

            try:
                rc_val = Scalar.getValue(log_rc_scalar)
                logger.debug(f"Retry Log RC: {rc_val}")
                # Clean up scalar
                self._run_internal(f"capture scalar drop {log_rc_scalar}", echo=False)
                if rc_val == 0:
                    self._last_smcl_log_named = True
                    return True
            except Exception as e:
                logger.debug(f"Failed to get retry log scalar {log_rc_scalar}: {e}")
                pass

        except Exception as e:
            logger.warning("SMCL log open exception: %s", e)
            # NOTE(review): returning here skips the unnamed-log fallback on a
            # hard failure — confirm that matches the original nesting.
            return False

        # Fallback to unnamed log
        try:
            unnamed_cmd = f"{'quietly ' if quiet else ''}log using \"{path_for_stata}\", replace smcl"
            self._run_internal(f"capture quietly log close", echo=False)
            self._run_internal(f"capture {unnamed_cmd}", echo=False)
            try:
                # c(log) == "on" confirms some log is now active.
                if Scalar.getValue("c(log)") == "on":
                    self._last_smcl_log_named = False
                    return True
            except:
                pass
        except Exception:
            pass
        return False
349
+
350
+ def _close_smcl_log(self, log_name: str) -> None:
351
+ if log_name == "_mcp_session":
352
+ return
353
+ try:
354
+ use_named = getattr(self, "_last_smcl_log_named", None)
355
+ if use_named is False:
356
+ self._run_internal("capture quietly log close", echo=False)
357
+ else:
358
+ self._run_internal(f"capture quietly log close {log_name}", echo=False)
359
+ except Exception:
360
+ pass
361
+
362
+ def _restore_results_from_hold(self, hold_attr: str) -> None:
363
+ if not hasattr(self, hold_attr):
364
+ return
365
+ hold_name = getattr(self, hold_attr)
366
+ try:
367
+ self._run_internal(f"capture _return restore {hold_name}", echo=False)
368
+ self._last_results = None # Invalidate cache instead of fetching
369
+ except Exception:
370
+ pass
371
+ finally:
372
+ try:
373
+ delattr(self, hold_attr)
374
+ except Exception:
375
+ pass
376
+
377
+ def _create_streaming_log(self, *, trace: bool) -> tuple[tempfile.NamedTemporaryFile, str, TailBuffer, FileTeeIO]:
378
+ log_file = tempfile.NamedTemporaryFile(
379
+ prefix="mcp_stata_",
380
+ suffix=".log",
381
+ dir=get_writable_temp_dir(),
382
+ delete=False,
383
+ mode="w",
384
+ encoding="utf-8",
385
+ errors="replace",
386
+ buffering=1,
387
+ )
388
+ log_path = log_file.name
389
+ register_temp_file(log_path)
390
+ tail = TailBuffer(max_chars=200000 if trace else 20000)
391
+ tee = FileTeeIO(log_file, tail)
392
+ return log_file, log_path, tail, tee
393
+
394
+ def _init_streaming_graph_cache(
395
+ self,
396
+ auto_cache_graphs: bool,
397
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]],
398
+ notify_log: Callable[[str], Awaitable[None]],
399
+ ) -> Optional[StreamingGraphCache]:
400
+ if not auto_cache_graphs:
401
+ return None
402
+ graph_cache = StreamingGraphCache(self, auto_cache=True)
403
+ graph_cache_callback = self._create_graph_cache_callback(on_graph_cached, notify_log)
404
+ graph_cache.add_cache_callback(graph_cache_callback)
405
+ return graph_cache
406
+
407
    def _capture_graph_state(
        self,
        graph_cache: Optional[StreamingGraphCache],
        emit_graph_ready: bool,
    ) -> Optional[dict[str, str]]:
        """Snapshot existing graphs before a command runs.

        Resets the per-command signature cache, records the pre-existing
        graph names on *graph_cache* (so only new/changed graphs get cached
        afterwards), and, when graph-ready events are requested, returns a
        {name: signature} baseline. Returns None when emit_graph_ready is
        False.
        """
        # Capture initial graph state BEFORE execution starts
        self._graph_signature_cache = {}
        self._graph_signature_cache_cmd_idx = None
        graph_names: List[str] = []
        if graph_cache or emit_graph_ready:
            try:
                graph_names = list(self.list_graphs(force_refresh=True))
            except Exception as e:
                logger.debug("Failed to capture initial graph state: %s", e)
                graph_names = []

        if graph_cache:
            # Clear detection state for the new command (detected/removed sets)
            # but preserve _last_graph_state signatures for modification detection.
            graph_cache.detector.clear_detection_state()
            graph_cache._initial_graphs = set(graph_names)
            logger.debug(f"Initial graph state captured: {graph_cache._initial_graphs}")

        graph_ready_initial = None
        if emit_graph_ready:
            graph_ready_initial = {name: self._get_graph_signature(name) for name in graph_names}
            logger.debug("Graph-ready initial state captured: %s", set(graph_ready_initial))
        return graph_ready_initial
435
+
436
    async def _cache_new_graphs(
        self,
        graph_cache: Optional[StreamingGraphCache],
        *,
        notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]],
        total_lines: int,
        completed_label: str,
    ) -> None:
        """Detect and cache graphs created/modified by the command just run.

        Merges pystata-side detection with any names queued on the cache,
        caches each uncached graph off-thread, fires the cache callbacks
        (awaiting awaitable results), and emits a final progress message when
        anything was cached. All failures are logged, never raised.
        """
        if not graph_cache or not graph_cache.auto_cache:
            return
        try:
            cached_graphs = []
            # Use detector to find new OR modified graphs
            pystata_detected = await anyio.to_thread.run_sync(graph_cache.detector._detect_graphs_via_pystata)

            # Combine with any pending graphs in queue
            with graph_cache._lock:
                to_process = set(pystata_detected) | set(graph_cache._graphs_to_cache)
                graph_cache._graphs_to_cache.clear()

            if to_process:
                logger.info(f"Detected {len(to_process)} new or modified graph(s): {sorted(to_process)}")

            for graph_name in to_process:
                # Skip anything already cached this session.
                if graph_name in graph_cache._cached_graphs:
                    continue

                try:
                    # Export runs blocking Stata work; keep it off the event loop.
                    cache_result = await anyio.to_thread.run_sync(
                        self.cache_graph_on_creation,
                        graph_name,
                    )
                    if cache_result:
                        cached_graphs.append(graph_name)
                        graph_cache._cached_graphs.add(graph_name)

                        # Notify observers; callbacks may be sync or async.
                        for callback in graph_cache._cache_callbacks:
                            try:
                                result = callback(graph_name, cache_result)
                                if inspect.isawaitable(result):
                                    await result
                            except Exception:
                                pass
                except Exception as e:
                    logger.error(f"Error caching graph {graph_name}: {e}")

            if cached_graphs and notify_progress:
                await notify_progress(
                    float(total_lines) if total_lines > 0 else 1,
                    float(total_lines) if total_lines > 0 else 1,
                    f"{completed_label} completed. Cached {len(cached_graphs)} graph(s): {', '.join(cached_graphs)}",
                )
        except Exception as e:
            logger.error(f"Post-execution graph detection failed: {e}")
490
+
491
+ def _emit_graph_ready_task(
492
+ self,
493
+ *,
494
+ emit_graph_ready: bool,
495
+ graph_ready_initial: Optional[dict[str, str]],
496
+ notify_log: Callable[[str], Awaitable[None]],
497
+ graph_ready_task_id: Optional[str],
498
+ graph_ready_format: str,
499
+ ) -> None:
500
+ if emit_graph_ready and graph_ready_initial is not None:
501
+ try:
502
+ asyncio.create_task(
503
+ self._emit_graph_ready_events(
504
+ graph_ready_initial,
505
+ notify_log,
506
+ graph_ready_task_id,
507
+ graph_ready_format,
508
+ )
509
+ )
510
+ except Exception as e:
511
+ logger.warning("graph_ready emission failed to start: %s", e)
512
+
513
    async def _stream_smcl_log(
        self,
        *,
        smcl_path: str,
        notify_log: Callable[[str], Awaitable[None]],
        done: anyio.Event,
        on_chunk: Optional[Callable[[str], Awaitable[None]]] = None,
        start_offset: int = 0,
        tee: Optional[FileTeeIO] = None,
    ) -> None:
        """Poll *smcl_path* while a command runs and stream new content.

        Every ~50ms, newly appended bytes are read (off-thread), cleaned of
        internal maintenance output, pushed to notify_log and optionally to
        *tee*; raw chunks also go to *on_chunk*. After *done* is set, one
        final read drains remaining content and on_chunk is invoked once
        more even with an empty chunk. Never raises.
        """
        last_pos = start_offset
        emitted_debug_chunks = 0
        # Controls strip_leading_boilerplate: only the first emitted chunk
        # has the log header stripped.
        has_written = False
        # Wait for Stata to create the SMCL file
        while not done.is_set() and not os.path.exists(smcl_path):
            await anyio.sleep(0.05)

        try:
            def _read_content() -> tuple[str, int]:
                # Reads from the closure's current last_pos; returns
                # (decoded_text, byte_count) so the caller can advance it.
                try:
                    with open(smcl_path, "rb") as f:
                        f.seek(last_pos)
                        data = f.read()
                        if not data:
                            return "", 0
                        return data.decode("utf-8", errors="replace"), len(data)
                except PermissionError:
                    if is_windows():
                        try:
                            # Use 'type' on Windows to bypass exclusive lock
                            res = subprocess.run(f'type "{smcl_path}"', shell=True, capture_output=True)
                            full_content = res.stdout
                            if len(full_content) > last_pos:
                                data = full_content[last_pos:]
                                return data.decode("utf-8", errors="replace"), len(data)
                            return "", 0
                        except Exception:
                            return "", 0
                    return "", 0
                except FileNotFoundError:
                    return "", 0

            while not done.is_set():
                chunk, chunk_bytes = await anyio.to_thread.run_sync(_read_content)
                if chunk:
                    last_pos += chunk_bytes
                    # Clean chunk before sending to log channel to suppress maintenance leakage
                    cleaned_chunk = self._clean_internal_smcl(
                        chunk,
                        strip_output=False,
                        strip_leading_boilerplate=not has_written,
                    )
                    if cleaned_chunk:
                        try:
                            await notify_log(cleaned_chunk)
                        except Exception as exc:
                            logger.debug("notify_log failed: %s", exc)

                        if tee:
                            try:
                                # Write cleaned SMCL to tee to satisfy requirements
                                # for clean logs with preserved markup.
                                tee.write(cleaned_chunk)
                            except Exception:
                                pass
                        has_written = True

                    if on_chunk is not None:
                        try:
                            await on_chunk(chunk)
                        except Exception as exc:
                            logger.debug("on_chunk callback failed: %s", exc)
                await anyio.sleep(0.05)

            # Final check for any remaining content
            chunk, chunk_bytes = await anyio.to_thread.run_sync(_read_content)
            if chunk:
                last_pos += chunk_bytes
                cleaned_chunk = self._clean_internal_smcl(
                    chunk,
                    strip_output=False,
                    strip_leading_boilerplate=not has_written,
                )
                if cleaned_chunk:
                    try:
                        await notify_log(cleaned_chunk)
                    except Exception as exc:
                        logger.debug("final notify_log failed: %s", exc)

                    if tee:
                        try:
                            # Write cleaned SMCL to tee
                            tee.write(cleaned_chunk)
                        except Exception:
                            pass
                    has_written = True

            if on_chunk is not None:
                # Final check even if last chunk is empty, to ensure
                # graphs created at the very end are detected.
                try:
                    await on_chunk(chunk or "")
                except Exception as exc:
                    logger.debug("final on_chunk check failed: %s", exc)

        except Exception as e:
            logger.warning(f"Log streaming failed: {e}")
620
+
621
    def _run_streaming_blocking(
        self,
        *,
        command: str,
        tee: FileTeeIO,
        cwd: Optional[str],
        trace: bool,
        echo: bool,
        smcl_path: str,
        smcl_log_name: str,
        hold_attr: str,
        require_smcl_log: bool = False,
    ) -> tuple[int, Optional[Exception]]:
        """Execute *command* synchronously under the exec lock with SMCL logging.

        Opens (or resumes) the SMCL log, runs the command — natively for
        single-line commands (best echo fidelity) or via a capture bundle
        for multi-line scripts — then restores held r()/e() results and
        replays the Stata return code into the session via `capture error`.

        Returns (rc, exc): rc is the Stata return code (0 on success, 1 or a
        parsed code on Python-level failure) and exc is any exception raised
        by stata.run or by a required-but-failed log open.
        """
        rc = -1
        exc: Optional[Exception] = None
        with self._exec_lock:
            self._is_executing = True
            self._last_results = None  # Invalidate results cache
            try:
                from sfi import Scalar, SFIToolkit  # Import SFI tools
                with self._temp_cwd(cwd):
                    logger.debug(
                        "opening SMCL log name=%s path=%s cwd=%s",
                        smcl_log_name,
                        smcl_path,
                        os.getcwd(),
                    )
                    try:
                        if self._persistent_log_path and smcl_path == self._persistent_log_path:
                            # Re-open or resume global session log in append mode to ensure it's active
                            log_opened = self._open_smcl_log(smcl_path, smcl_log_name, quiet=True, append=True)
                        else:
                            log_opened = self._open_smcl_log(smcl_path, smcl_log_name, quiet=True)
                    except Exception as e:
                        log_opened = False
                        logger.warning("_open_smcl_log raised: %r", e)
                    logger.info("SMCL log_opened=%s path=%s", log_opened, smcl_path)
                    if require_smcl_log and not log_opened:
                        exc = RuntimeError("Failed to open SMCL log")
                        logger.error("SMCL log open failed for %s", smcl_path)
                        rc = 1
                    if exc is None:
                        try:
                            # Use an internal buffer to capture the direct output of pystata
                            # rather than writing it raw to the 'tee' (and log_path).
                            # We rely on _stream_smcl_log to populate the 'tee' with
                            # cleaned content from the SMCL log.
                            direct_buf = io.StringIO()
                            with self._redirect_io_streaming(direct_buf, direct_buf):
                                try:
                                    if trace:
                                        self.stata.run("set trace on")

                                    # Hybrid execution: Single-line commands run natively for perfect echoing.
                                    # Multi-line commands use the bundle for error handling and stability.
                                    is_multi_line = "\n" in command.strip()

                                    if not is_multi_line:
                                        logger.debug("running Stata natively echo=%s", echo)
                                        self._hold_name_stream = f"mcp_hold_{uuid.uuid4().hex[:8]}"
                                        # Reset RC to 0 before running
                                        self._run_internal("scalar _mcp_rc = 0", echo=False)
                                        ret = self.stata.run(command, echo=echo)
                                        # Use _rc if we were in a capture, but here we are native.
                                        # Stata sets c(rc) to the return code of the last command.
                                        self._run_internal(f"scalar _mcp_rc = c(rc)", echo=False)
                                        self._run_internal(f"capture _return hold {self._hold_name_stream}", echo=False)
                                        self._run_internal(f"capture quietly log flush {smcl_log_name}", echo=False)

                                        # Retrieve RC via SFI
                                        try:
                                            rc_val = Scalar.getValue("_mcp_rc")
                                            rc = int(float(rc_val)) if rc_val is not None else 0
                                        except:
                                            rc = 0
                                    else:
                                        # Optimization: Combined bundle for streaming too.
                                        # Consolidates hold and potentially flush into one call.
                                        self._hold_name_stream = f"mcp_hold_{uuid.uuid4().hex[:8]}"

                                        # Initialization logic for locals can be sensitive.
                                        # Since each run() in pystata starts a new context for locals unless it's a file,
                                        # we use a global scalar for the return code.
                                        # We use noisily inside the capture block to force echo of commands if requested.
                                        bundle = (
                                            f"capture noisily {{\n"
                                            f"{'noisily {' if echo else ''}\n"
                                            f"{command}\n"
                                            f"{'}' if echo else ''}\n"
                                            f"}}\n"
                                            f"scalar _mcp_rc = _rc\n"
                                            f"capture _return hold {self._hold_name_stream}\n"
                                            f"capture quietly log flush {smcl_log_name}"
                                        )

                                        logger.debug("running Stata bundle echo=%s", echo)
                                        # Using direct stata.run because tee redirection is already active
                                        ret = self.stata.run(bundle, echo=echo)

                                        # Retrieve RC via SFI for accuracy
                                        try:
                                            rc_val = Scalar.getValue("_mcp_rc")
                                            rc = int(float(rc_val)) if rc_val is not None else 0
                                        except:
                                            rc = 0

                                    if isinstance(ret, str) and ret:
                                        # If for some reason SMCL log wasn't working, we can
                                        # fall back to the raw output, but otherwise we
                                        # avoid writing raw data to the tee.
                                        pass
                                except Exception as e:
                                    exc = e
                                    logger.error("stata.run bundle failed: %r", e)
                                    # Recover a numeric rc from the exception text or
                                    # the captured direct output; default to 1.
                                    if rc in (-1, 0):
                                        parsed_rc = self._parse_rc_from_text(str(e))
                                        if parsed_rc is None:
                                            try:
                                                parsed_rc = self._parse_rc_from_text(direct_buf.getvalue())
                                            except Exception:
                                                parsed_rc = None
                                        rc = parsed_rc if parsed_rc is not None else 1
                                finally:
                                    if trace:
                                        try:
                                            self._run_internal("set trace off")
                                        except Exception:
                                            pass
                        finally:
                            # Only close if it's NOT the persistent session log
                            if not self._persistent_log_name or smcl_log_name != self._persistent_log_name:
                                self._close_smcl_log(smcl_log_name)

                            self._restore_results_from_hold(hold_attr)

                            # Final state restoration (invisibility)
                            try:
                                # Set c(rc) for the environment
                                self._run_internal(f"capture error {rc}" if rc > 0 else "capture", echo=False)
                            except Exception:
                                pass
                        return rc, exc
                    # If we get here, SMCL log failed and we're required to stop.
                    return rc, exc
            finally:
                self._is_executing = False
        # NOTE(review): defensive — both branches above return before reaching
        # here; kept so every path yields a tuple. Confirm original nesting.
        return rc, exc
768
+
769
+ def _resolve_do_file_path(
770
+ self,
771
+ path: str,
772
+ cwd: Optional[str],
773
+ ) -> tuple[Optional[str], Optional[str], Optional[CommandResponse]]:
774
+ if cwd is not None and not os.path.isdir(cwd):
775
+ return None, None, CommandResponse(
776
+ command=f'do "{path}"',
777
+ rc=601,
778
+ stdout="",
779
+ stderr=None,
780
+ success=False,
781
+ error=ErrorEnvelope(
782
+ message=f"cwd not found: {cwd}",
783
+ rc=601,
784
+ command=path,
785
+ ),
786
+ )
787
+
788
+ effective_path = path
789
+ if not os.path.isabs(path):
790
+ effective_path = os.path.abspath(os.path.join(cwd or os.getcwd(), path))
791
+
792
+ if not os.path.exists(effective_path):
793
+ return None, None, CommandResponse(
794
+ command=f'do "{effective_path}"',
795
+ rc=601,
796
+ stdout="",
797
+ stderr=None,
798
+ success=False,
799
+ error=ErrorEnvelope(
800
+ message=f"Do-file not found: {effective_path}",
801
+ rc=601,
802
+ command=effective_path,
803
+ ),
804
+ )
805
+
806
+ path_for_stata = effective_path.replace("\\", "/")
807
+ command = f'do "{path_for_stata}"'
808
+ return effective_path, command, None
809
+
810
    @contextmanager
    def _smcl_log_capture(self) -> "Generator[Tuple[str, str], None, None]":
        """
        Context manager that wraps command execution in a named SMCL log.

        This runs alongside any user logs (named logs can coexist).
        Yields (log_name, log_path) tuple for use within the context.
        The SMCL file is NOT deleted automatically - caller should clean up.
        On exit the persistent session log, if configured, is re-opened so
        this capture never leaves it deactivated.

        Usage:
            with self._smcl_log_capture() as (log_name, smcl_path):
                self.stata.run(cmd)
            # After context, read smcl_path for raw SMCL output
        """
        # Use a unique name but DO NOT join start with mkstemp to avoid existing file locks.
        # Stata will create the file.
        smcl_path = self._create_smcl_log_path()
        # Unique log name to avoid collisions with user logs
        log_name = self._make_smcl_log_name()

        try:
            # Open named SMCL log (quietly to avoid polluting output)
            log_opened = self._open_smcl_log(smcl_path, log_name, quiet=True)
            if not log_opened:
                # Still yield, consumer might see empty file or handle error,
                # but we can't do much if Stata refuses to log.
                pass

            yield log_name, smcl_path
        finally:
            # Always close our named log
            self._close_smcl_log(log_name)
            # Ensure the persistent session log is still active after our capture.
            if self._persistent_log_path and self._persistent_log_name:
                try:
                    path_for_stata = self._persistent_log_path.replace("\\", "/")
                    # Append when the file already exists so prior session
                    # output is preserved.
                    mode = "append" if os.path.exists(self._persistent_log_path) else "replace"
                    reopen_cmd = (
                        f"capture quietly log using \"{path_for_stata}\", {mode} smcl name({self._persistent_log_name})"
                    )
                    self._run_internal(reopen_cmd, echo=False)
                except Exception:
                    pass
853
+
854
    def _read_smcl_file(self, path: str, start_offset: int = 0) -> str:
        """Read SMCL file contents, handling encoding issues, offsets and Windows file locks.

        Returns the decoded text from *start_offset* onward, or "" on any
        failure (missing file, lock that cannot be bypassed, etc.).
        """
        try:
            with open(path, 'rb') as f:
                if start_offset > 0:
                    f.seek(start_offset)
                data = f.read()
                return data.decode('utf-8', errors='replace')
        except PermissionError:
            if is_windows():
                # Windows Fallback: Try to use 'type' command to bypass exclusive lock
                try:
                    # 'type' is a cmd.exe builtin, so shell=True is required here.
                    res = subprocess.run(f'type "{path}"', shell=True, capture_output=True)
                    if res.returncode == 0:
                        content = res.stdout
                        # Offset is applied to the raw bytes, mirroring the seek above.
                        if start_offset > 0 and len(content) > start_offset:
                            return content[start_offset:].decode('utf-8', errors='replace')
                        return content.decode('utf-8', errors='replace')
                except Exception as e:
                    logger.debug(f"Combined fallback read failed: {e}")
            logger.warning(f"Failed to read SMCL file {path} due to lock")
            return ""
        except Exception as e:
            logger.warning(f"Failed to read SMCL file {path}: {e}")
            return ""
879
+
880
    def _read_persistent_log_chunk(self, start_offset: int) -> str:
        """Read fresh chunk from persistent SMCL log starting at offset.

        Returns cleaned text (internal headers/maintenance stripped) or ""
        when no persistent log is configured, nothing new is available, or
        the file cannot be read. The Windows fallback returns raw,
        uncleaned text — NOTE(review): confirm whether that asymmetry with
        the primary path is intentional.
        """
        if not self._persistent_log_path:
            return ""
        try:
            with open(self._persistent_log_path, 'rb') as f:
                f.seek(start_offset)
                data = f.read()

            if not data:
                return ""

            content = data.decode('utf-8', errors='replace')
            # Use refined cleaning logic to strip internal headers and maintenance
            return self._clean_internal_smcl(content)
        except PermissionError:
            if is_windows():
                try:
                    # Windows fallback for locked persistent log
                    res = subprocess.run(f'type "{self._persistent_log_path}"', shell=True, capture_output=True)
                    if res.returncode == 0:
                        full_content = res.stdout
                        if len(full_content) > start_offset:
                            return full_content[start_offset:].decode('utf-8', errors='replace')
                        return ""
                except Exception:
                    pass
            return ""
        except Exception:
            return ""
910
+
911
    def _extract_error_from_smcl(self, smcl_content: str, rc: int) -> Tuple[str, str]:
        """
        Extract error message and context from raw SMCL output.

        Uses {err} tags as the authoritative source for error detection.

        Strategy, in order:
          1. Native (Rust) scan via ``fast_scan_log``; accepted only when it
             yields a message more specific than the generic "Stata error r(N)".
          2. Backward scan for the last run of consecutive ``{err}``-tagged
             lines, skipping known internal cleanup noise at the tail.
          3. Plain-text fallback matching common Stata error phrasings.

        Returns:
            Tuple of (error_message, context_string)
        """
        if not smcl_content:
            return f"Stata error r({rc})", ""

        # Try Rust optimization
        native_res = fast_scan_log(smcl_content, rc)
        if native_res:
            error_msg, context, _ = native_res
            # If native result is specific, return it. Otherwise fall through to recover
            # a more descriptive error message from SMCL/text.
            if error_msg and error_msg != f"Stata error r({rc})":
                return error_msg, context

        lines = smcl_content.splitlines()

        # Search backwards for {err} tags - they indicate error lines
        error_lines = []
        error_start_idx = -1

        # Skip the very last few lines if they contain our cleanup noise
        # like "capture error 111" or "log flush invalid"
        internal_noise_patterns = [
            "flush invalid",
            "capture error",
            "search r(",
            "r(198);",
            "r(111);"
        ]

        for i in range(len(lines) - 1, -1, -1):
            line = lines[i]
            if '{err}' in line:
                # Is this internal noise?
                is_noise = any(p in line.lower() for p in internal_noise_patterns)
                if is_noise and error_start_idx == -1:
                    # If we only have noise at the very end, we should keep looking back
                    continue

                if error_start_idx == -1:
                    error_start_idx = i
                    # Walk backwards to find consecutive {err} lines
                    j = i
                    while j >= 0 and '{err}' in lines[j]:
                        error_lines.insert(0, lines[j])
                        j -= 1
                    break

        if error_lines:
            # Clean SMCL tags from error message
            clean_lines = []
            for line in error_lines:
                # Remove SMCL tags but keep the text content
                cleaned = re.sub(r'\{[^}]*\}', '', line).strip()
                if cleaned:
                    clean_lines.append(cleaned)

            error_msg = " ".join(clean_lines) or f"Stata error r({rc})"

            # Context is everything from error start to end
            context_start = max(0, error_start_idx - 5)  # Include 5 lines before error
            context = "\n".join(lines[context_start:])

            return error_msg, context

        # Fallback: no {err} found, try to extract a meaningful message from text
        # (some Stata errors do not emit {err} tags in SMCL).
        try:
            text_lines = self._smcl_to_text(smcl_content).splitlines()
        except Exception:
            text_lines = []

        def _find_error_line() -> Optional[str]:
            # Known plain-text phrasings of common Stata errors; scan from the
            # end so the most recent occurrence wins.
            patterns = [
                r"no variables defined",
                r"not found",
                r"variable .* not found",
                r"no observations",
            ]
            for line in reversed(text_lines):
                lowered = line.lower()
                for pat in patterns:
                    if re.search(pat, lowered):
                        return line.strip()
            return None

        extracted = _find_error_line()
        if extracted:
            error_msg = extracted
        else:
            error_msg = f"Stata error r({rc})"

        # Context: last 30 lines of SMCL
        context_start = max(0, len(lines) - 30)
        context = "\n".join(lines[context_start:])

        return error_msg, context
1015
+
1016
+ def _parse_rc_from_smcl(self, smcl_content: str) -> Optional[int]:
1017
+ """Parse return code from SMCL content using specific structural patterns."""
1018
+ if not smcl_content:
1019
+ return None
1020
+
1021
+ # Try Rust optimization
1022
+ native_res = fast_scan_log(smcl_content, 0)
1023
+ if native_res:
1024
+ _, _, rc = native_res
1025
+ if rc is not None:
1026
+ return rc
1027
+
1028
+ # 1. Primary check: SMCL search tag {search r(N), ...}
1029
+ # This is the most authoritative interactive indicator
1030
+ matches = list(re.finditer(r'\{search r\((\d+)\)', smcl_content))
1031
+ if matches:
1032
+ try:
1033
+ return int(matches[-1].group(1))
1034
+ except Exception:
1035
+ pass
1036
+
1037
+ # 2. Secondary check: Standalone r(N); pattern
1038
+ # This appears at the end of command blocks
1039
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', smcl_content))
1040
+ if matches:
1041
+ try:
1042
+ return int(matches[-1].group(1))
1043
+ except Exception:
1044
+ pass
1045
+
1046
+ return None
1047
+
1048
+ @staticmethod
1049
+ def _create_graph_cache_callback(on_graph_cached, notify_log):
1050
+ """Create a standardized graph cache callback with proper error handling."""
1051
+ async def graph_cache_callback(graph_name: str, success: bool) -> None:
1052
+ try:
1053
+ if on_graph_cached:
1054
+ await on_graph_cached(graph_name, success)
1055
+ except Exception as e:
1056
+ logger.error(f"Graph cache callback failed: {e}")
1057
+
1058
+ try:
1059
+ # Also notify via log channel
1060
+ await notify_log(json.dumps({
1061
+ "event": "graph_cached",
1062
+ "graph": graph_name,
1063
+ "success": success
1064
+ }))
1065
+ except Exception as e:
1066
+ logger.error(f"Failed to notify about graph cache: {e}")
1067
+
1068
+ return graph_cache_callback
1069
+
1070
+ def _get_cached_graph_path(self, graph_name: str) -> Optional[str]:
1071
+ if not hasattr(self, "_cache_lock") or not hasattr(self, "_preemptive_cache"):
1072
+ return None
1073
+ try:
1074
+ with self._cache_lock:
1075
+ cache_path = self._preemptive_cache.get(graph_name)
1076
+ if not cache_path:
1077
+ return None
1078
+
1079
+ # Double-check validity (e.g. signature match for current command)
1080
+ if not self._is_cache_valid(graph_name, cache_path):
1081
+ return None
1082
+
1083
+ return cache_path
1084
+ except Exception:
1085
+ return None
1086
+
1087
    async def _emit_graph_ready_for_graphs(
        self,
        graph_names: List[str],
        *,
        notify_log: Callable[[str], Awaitable[None]],
        task_id: Optional[str],
        export_format: str,
        graph_ready_initial: Optional[dict[str, str]],
    ) -> int:
        """Export each named graph and emit one ``graph_ready`` event per graph.

        Args:
            graph_names: Candidate graph names; duplicates dropped, order kept.
            notify_log: Async sink receiving the JSON event payload.
            task_id: Task identifier echoed back in the event payload.
            export_format: Export format (defaults to "svg" when empty).
            graph_ready_initial: When not None, acts both as a filter (only
                graphs tied to the current command are emitted) and as a
                collector of pre-emit signatures keyed by graph name.

        Returns:
            Number of graph_ready events actually emitted.
        """
        if not graph_names:
            return 0
        # Deduplicate requested names while preserving order
        graph_names = list(dict.fromkeys(graph_names))
        fmt = (export_format or "svg").strip().lower()
        emitted = 0

        # Heuristic: Find active graph to help decide which existing graphs were touched.
        # NOTE(review): active_graph is assigned but not read below — confirm intent.
        active_graph = None
        try:
            from sfi import Scalar
            active_graph = Scalar.getValue("c(curgraph)")
        except Exception:
            pass
        code = getattr(self, "_current_command_code", "")
        named_graphs = set(self._extract_named_graphs(code))

        for graph_name in graph_names:
            # Try to determine a stable signature before exporting; prefer cached path if present
            cached_path = self._get_cached_graph_path(graph_name) if fmt == "svg" else None
            pre_signature = self._get_graph_signature(graph_name)
            emit_key = f"{graph_name}:{self._command_idx}:{fmt}"

            # If we already emitted this EXACT signature in THIS command, skip.
            if self._last_emitted_graph_signatures.get(graph_name) == emit_key:
                continue

            # Emit only when the command matches the graph command or explicitly names it.
            if graph_ready_initial is not None:
                graph_cmd = self._get_graph_command_line(graph_name)
                if not self._command_contains_graph_command(code, graph_cmd or ""):
                    if graph_name not in named_graphs:
                        continue

            try:
                export_path = cached_path
                if not export_path:
                    # Retry the export a few times: the graph may still be
                    # materializing right after the command finished.
                    last_exc = None
                    for attempt in range(6):
                        try:
                            export_path = await anyio.to_thread.run_sync(
                                lambda: self.export_graph(graph_name, format=fmt)
                            )
                            break
                        except Exception as exc:
                            last_exc = exc
                            if attempt < 5:
                                await anyio.sleep(0.05)
                                continue
                            raise last_exc
                # Re-check: another coroutine may have emitted while we exported.
                if self._last_emitted_graph_signatures.get(graph_name) == emit_key:
                    continue
                payload = {
                    "event": "graph_ready",
                    "task_id": task_id,
                    "graph": {
                        "name": graph_name,
                        "path": export_path,
                        "label": graph_name,
                    },
                }
                await notify_log(json.dumps(payload))
                emitted += 1
                self._last_emitted_graph_signatures[graph_name] = emit_key
                if graph_ready_initial is not None:
                    graph_ready_initial[graph_name] = pre_signature
            except Exception as e:
                logger.warning("graph_ready export failed for %s: %s", graph_name, e)
        return emitted
1165
+
1166
+ @staticmethod
1167
+ def _extract_named_graphs(text: str) -> List[str]:
1168
+ if not text:
1169
+ return []
1170
+ matches = _GRAPH_NAME_PATTERN.findall(text)
1171
+ if not matches:
1172
+ return []
1173
+ out = []
1174
+ for raw in matches:
1175
+ name = raw.strip().strip("\"").strip("'").strip()
1176
+ if name:
1177
+ out.append(name)
1178
+ return out
1179
+
1180
    async def _maybe_cache_graphs_on_chunk(
        self,
        *,
        graph_cache: Optional[StreamingGraphCache],
        emit_graph_ready: bool,
        notify_log: Callable[[str], Awaitable[None]],
        graph_ready_task_id: Optional[str],
        graph_ready_format: str,
        graph_ready_initial: Optional[dict[str, str]],
        last_check: List[float],
        force: bool = False,
    ) -> int:
        """Opportunistically cache newly-detected graphs while output streams in.

        Called on each streamed output chunk. Throttled to at most one poll per
        0.75s via the single-element ``last_check`` timestamp holder, unless
        ``force`` is set (the final post-command check).

        Returns:
            Number of graph_ready events emitted (0 when caching is disabled,
            Stata is busy, the throttle suppressed the poll, or polling failed).
        """
        if not graph_cache or not graph_cache.auto_cache:
            return 0
        if self._is_executing and not force:
            # Skip polling if Stata is busy; it will block on _exec_lock anyway.
            # During final check (force=True), we know it's safe because _run_streaming_blocking has finished.
            return 0
        now = time.monotonic()
        if not force and last_check and now - last_check[0] < 0.75:
            return 0
        if last_check:
            last_check[0] = now
        try:
            cached_names = await graph_cache.cache_detected_graphs_with_pystata()
        except Exception as e:
            logger.debug("graph_ready polling failed: %s", e)
            return 0
        if emit_graph_ready and cached_names:
            # Serialize with other emitters so each graph is announced once.
            async with self._ensure_graph_ready_lock():
                return await self._emit_graph_ready_for_graphs(
                    cached_names,
                    notify_log=notify_log,
                    task_id=graph_ready_task_id,
                    export_format=graph_ready_format,
                    graph_ready_initial=graph_ready_initial,
                )
        return 0
1218
+
1219
+ def _ensure_graph_ready_lock(self) -> asyncio.Lock:
1220
+ lock = getattr(self, "_graph_ready_lock", None)
1221
+ if lock is None:
1222
+ lock = asyncio.Lock()
1223
+ self._graph_ready_lock = lock
1224
+ return lock
1225
+
1226
+ async def _emit_graph_ready_events(
1227
+ self,
1228
+ initial_graphs: dict[str, str],
1229
+ notify_log: Callable[[str], Awaitable[None]],
1230
+ task_id: Optional[str],
1231
+ export_format: str,
1232
+ ) -> int:
1233
+ if initial_graphs is None:
1234
+ return 0
1235
+ lock = self._ensure_graph_ready_lock()
1236
+
1237
+ fmt = (export_format or "svg").strip().lower()
1238
+ emitted = 0
1239
+
1240
+ # Poll briefly for new graphs after command completion; emit once per batch.
1241
+ for _ in range(5):
1242
+ try:
1243
+ current_graphs = list(self.list_graphs(force_refresh=True))
1244
+ except Exception as exc:
1245
+ logger.debug("graph_ready list_graphs failed: %s", exc)
1246
+ current_graphs = []
1247
+
1248
+ if current_graphs:
1249
+ async with lock:
1250
+ emitted += await self._emit_graph_ready_for_graphs(
1251
+ current_graphs,
1252
+ notify_log=notify_log,
1253
+ task_id=task_id,
1254
+ export_format=fmt,
1255
+ graph_ready_initial=initial_graphs,
1256
+ )
1257
+ break
1258
+
1259
+ await anyio.sleep(0.05)
1260
+
1261
+ return emitted
1262
+
1263
    def _get_graph_signature(self, graph_name: str) -> str:
        """Return a stable signature for a graph name based on graph metadata.

        The signature is ``"<name>_<created-timestamp>"`` when a creation
        timestamp can be found, otherwise just the name. Results are memoized
        per command: the memo is reset whenever ``_command_idx`` advances.
        """
        if self._graph_signature_cache_cmd_idx != self._command_idx:
            # New command: invalidate the per-command memo.
            self._graph_signature_cache = {}
            self._graph_signature_cache_cmd_idx = self._command_idx

        cached = self._graph_signature_cache.get(graph_name)
        if cached:
            return cached

        signature = graph_name

        # Refresh graph metadata if we don't have created timestamps yet.
        try:
            self.list_graphs(force_refresh=True)
        except Exception:
            pass

        try:
            # Use cached graph metadata when available (created timestamp is stable).
            # Snapshot the list under the lock; iterate lock-free afterwards.
            with self._list_graphs_cache_lock:
                cached_graphs = list(self._list_graphs_cache or [])
            for g in cached_graphs:
                if hasattr(g, "name") and g.name == graph_name and getattr(g, "created", None):
                    signature = f"{graph_name}_{g.created}"
                    break
        except Exception:
            pass

        # If still missing, attempt a targeted timestamp lookup via the graph detector.
        if signature == graph_name:
            try:
                detector = getattr(self, "_graph_detector", None)
                if detector is not None:
                    timestamps = detector._get_graph_timestamps([graph_name])
                    ts = timestamps.get(graph_name)
                    if ts:
                        signature = f"{graph_name}_{ts}"
            except Exception:
                pass

        self._graph_signature_cache[graph_name] = signature
        return signature
1306
+
1307
+ @staticmethod
1308
+ def _normalize_command_text(text: str) -> str:
1309
+ return " ".join((text or "").strip().split()).lower()
1310
+
1311
+ def _command_contains_graph_command(self, code: str, graph_cmd: str) -> bool:
1312
+ if not code or not graph_cmd:
1313
+ return False
1314
+ graph_norm = self._normalize_command_text(graph_cmd)
1315
+ if not graph_norm:
1316
+ return False
1317
+ graph_prefixed = f"graph {graph_norm}" if not graph_norm.startswith("graph ") else graph_norm
1318
+ def matches(candidate: str) -> bool:
1319
+ cand_norm = self._normalize_command_text(candidate)
1320
+ if not cand_norm:
1321
+ return False
1322
+ return (
1323
+ cand_norm == graph_norm
1324
+ or graph_norm.startswith(cand_norm)
1325
+ or cand_norm.startswith(graph_norm)
1326
+ or cand_norm == graph_prefixed
1327
+ or graph_prefixed.startswith(cand_norm)
1328
+ or cand_norm.startswith(graph_prefixed)
1329
+ )
1330
+
1331
+ if "\n" in code:
1332
+ for line in code.splitlines():
1333
+ if matches(line):
1334
+ return True
1335
+ return False
1336
+ return matches(code)
1337
+
1338
    def _get_graph_command_line(self, graph_name: str) -> Optional[str]:
        """Fetch the Stata command line used to create the graph, if available.

        Runs ``graph describe`` under the exec lock, preserving the caller's
        r() results with ``_return hold`` / ``_return restore``. If describing
        the named graph fails, falls back to describing the current graph and
        validating it against c(curgraph). Returns None when the command
        cannot be recovered (including when sfi is unavailable).
        """
        try:
            from sfi import Macro
        except Exception:
            return None

        resolved = self._resolve_graph_name_for_stata(graph_name)
        # Unique hold name so nested/concurrent calls cannot collide.
        hold_name = f"_mcp_gcmd_hold_{uuid.uuid4().hex[:8]}"
        cmd = None
        cur_graph = None

        with self._exec_lock:
            try:
                bundle = (
                    f"capture _return hold {hold_name}\n"
                    f"capture quietly graph describe {resolved}\n"
                    "macro define mcp_gcmd \"`r(command)'\"\n"
                    "macro define mcp_curgraph \"`c(curgraph)'\"\n"
                    f"capture _return restore {hold_name}"
                )
                self.stata.run(bundle, echo=False)
                cmd = Macro.getGlobal("mcp_gcmd")
                cur_graph = Macro.getGlobal("mcp_curgraph")
                self.stata.run("macro drop mcp_gcmd", echo=False)
                self.stata.run("macro drop mcp_curgraph", echo=False)
            except Exception:
                try:
                    # Best effort: make sure held r() results are restored.
                    self.stata.run(f"capture _return restore {hold_name}", echo=False)
                except Exception:
                    pass
                cmd = None

        if cmd:
            return cmd

        # Fallback: describe current graph without a name and validate against c(curgraph).
        with self._exec_lock:
            try:
                bundle = (
                    f"capture _return hold {hold_name}\n"
                    "capture quietly graph describe\n"
                    "macro define mcp_gcmd \"`r(command)'\"\n"
                    "macro define mcp_curgraph \"`c(curgraph)'\"\n"
                    f"capture _return restore {hold_name}"
                )
                self.stata.run(bundle, echo=False)
                cmd = Macro.getGlobal("mcp_gcmd")
                cur_graph = Macro.getGlobal("mcp_curgraph")
                self.stata.run("macro drop mcp_gcmd", echo=False)
                self.stata.run("macro drop mcp_curgraph", echo=False)
            except Exception:
                try:
                    self.stata.run(f"capture _return restore {hold_name}", echo=False)
                except Exception:
                    pass
                cmd = None

        if cmd and cur_graph:
            # Only trust the fallback when the current graph is the one asked for.
            if cur_graph == resolved or cur_graph == graph_name:
                return cmd

        return cmd or None
1401
+
1402
+ def _request_break_in(self) -> None:
1403
+ """
1404
+ Attempt to interrupt a running Stata command when cancellation is requested.
1405
+
1406
+ Uses the Stata sfi.breakIn hook when available; errors are swallowed because
1407
+ cancellation should never crash the host process.
1408
+ """
1409
+ try:
1410
+ import sfi # type: ignore[import-not-found]
1411
+
1412
+ break_fn = getattr(sfi, "breakIn", None) or getattr(sfi, "break_in", None)
1413
+ if callable(break_fn):
1414
+ try:
1415
+ break_fn()
1416
+ logger.info("Sent breakIn() to Stata for cancellation")
1417
+ except Exception as e: # pragma: no cover - best-effort
1418
+ logger.warning(f"Failed to send breakIn() to Stata: {e}")
1419
+ else: # pragma: no cover - environment without Stata runtime
1420
+ logger.debug("sfi.breakIn not available; cannot interrupt Stata")
1421
+ except Exception as e: # pragma: no cover - import failure or other
1422
+ logger.debug(f"Unable to import sfi for cancellation: {e}")
1423
+
1424
    async def _wait_for_stata_stop(self, timeout: float = 2.0) -> bool:
        """
        After requesting a break, poll the Stata interface so it can surface BreakError
        and return control. This is best-effort and time-bounded.

        Args:
            timeout: Maximum time in seconds to keep polling.

        Returns:
            True when Stata acknowledged the break with a BreakError; False on
            timeout, when sfi/polling is unavailable, or on any other outcome.
        """
        deadline = time.monotonic() + timeout
        try:
            import sfi  # type: ignore[import-not-found]

            toolkit = getattr(sfi, "SFIToolkit", None)
            # Prefer pollnow; fall back to pollstd. getattr on a None toolkit
            # simply yields None, which the callable() check below rejects.
            poll = getattr(toolkit, "pollnow", None) or getattr(toolkit, "pollstd", None)
            BreakError = getattr(sfi, "BreakError", None)
        except Exception:  # pragma: no cover
            return False

        if not callable(poll):
            return False

        last_exc: Optional[Exception] = None
        while time.monotonic() < deadline:
            try:
                poll()
            except Exception as e:  # pragma: no cover - depends on Stata runtime
                last_exc = e
                if BreakError is not None and isinstance(e, BreakError):
                    logger.info("Stata BreakError detected; cancellation acknowledged by Stata")
                    return True
                # If Stata already stopped, break on any other exception.
                break
            await anyio.sleep(0.05)

        if last_exc:
            logger.debug(f"Cancellation poll exited with {last_exc}")
        return False
1458
+
1459
+ @contextmanager
1460
+ def _temp_cwd(self, cwd: Optional[str]):
1461
+ if cwd is None:
1462
+ yield
1463
+ return
1464
+ prev = os.getcwd()
1465
+ os.chdir(cwd)
1466
+ try:
1467
+ yield
1468
+ finally:
1469
+ os.chdir(prev)
1470
+
1471
    @contextmanager
    def _safe_redirect_fds(self):
        """Redirects fd 1 (stdout) to fd 2 (stderr) at the OS level.

        Used while initializing Stata so that banner/splash output written
        directly to the stdout file descriptor (bypassing sys.stdout) cannot
        corrupt the MCP stdio transport. The original stdout fd is restored on
        exit; if the initial dup() fails, the context degrades to a no-op.
        """
        # Save original stdout fd
        try:
            stdout_fd = os.dup(1)
        except Exception:
            # Fallback if we can't dup (e.g. strange environment)
            yield
            return

        try:
            # Redirect OS-level stdout to stderr
            os.dup2(2, 1)
            yield
        finally:
            # Restore stdout
            try:
                os.dup2(stdout_fd, 1)
                os.close(stdout_fd)
            except Exception:
                pass
1493
+
1494
    def init(self):
        """Initializes usage of pystata using cached discovery results.

        Steps:
          1. For each discovered Stata binary, derive candidate install
             directories (parent of the .app bundle, the bundle itself, the
             binary directory) and try ``stata_setup.config`` on each, with an
             optional subprocess pre-flight to catch hard crashes/exits.
          2. Import pystata, disable streamout, and warm up the engine.
          3. Open the persistent session SMCL log and set up caches/aliases.

        Raises:
            RuntimeError: when no candidate initializes, or when
                stata_setup/pystata cannot be imported.
        """
        if self._initialized:
            return

        # Suppress any non-UTF8 banner output from PyStata on stdout, which breaks MCP stdio transport
        from contextlib import redirect_stdout, redirect_stderr

        try:
            import stata_setup

            # Get discovered Stata paths (cached from first call)
            discovery_candidates = _get_discovery_candidates()
            if not discovery_candidates:
                raise RuntimeError("No Stata candidates found during discovery")

            logger.info("Initializing Stata engine (attempting up to %d candidate binaries)...", len(discovery_candidates))

            # Diagnostic: force faulthandler to output to stderr for C crashes
            import faulthandler
            faulthandler.enable(file=sys.stderr)
            import subprocess

            success = False
            last_error = None
            chosen_exec: Optional[Tuple[str, str]] = None

            for stata_exec_path, edition in discovery_candidates:
                candidates = []
                # Prefer the binary directory first (documented input for stata_setup)
                bin_dir = os.path.dirname(stata_exec_path)

                # 2. App Bundle: .../StataMP.app (macOS only)
                # Walk up from the binary directory looking for a .app ancestor.
                curr = bin_dir
                app_bundle = None
                while len(curr) > 1:
                    if curr.endswith(".app"):
                        app_bundle = curr
                        break
                    parent = os.path.dirname(curr)
                    if parent == curr:
                        break
                    curr = parent

                ordered_candidates = []
                if app_bundle:
                    # On macOS, the parent of the .app is often the correct install path
                    # (e.g., /Applications/StataNow containing StataMP.app)
                    parent_dir = os.path.dirname(app_bundle)
                    if parent_dir and parent_dir != "/":
                        ordered_candidates.append(parent_dir)
                    ordered_candidates.append(app_bundle)

                if bin_dir:
                    ordered_candidates.append(bin_dir)

                # Deduplicate preserving order
                seen = set()
                candidates = []
                for c in ordered_candidates:
                    if c not in seen:
                        seen.add(c)
                        candidates.append(c)

                for path in candidates:
                    try:
                        # 1. Pre-flight check in a subprocess to capture hard exits/crashes
                        skip_preflight = os.environ.get("MCP_STATA_SKIP_PREFLIGHT") == "1"
                        if not skip_preflight:
                            sys.stderr.write(f"[mcp_stata] DEBUG: Pre-flight check for path '{path}'\n")
                            sys.stderr.flush()

                            # Code executed via `python -c`: configures Stata and
                            # runs a trivial command to verify engine health.
                            preflight_code = f"""
import sys
import stata_setup
from contextlib import redirect_stdout, redirect_stderr
with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
    try:
        stata_setup.config({repr(path)}, {repr(edition)})
        from pystata import stata
        # Minimal verification of engine health
        stata.run('display 1', echo=False)
        print('PREFLIGHT_OK')
    except Exception as e:
        print(f'PREFLIGHT_FAIL: {{e}}', file=sys.stderr)
        sys.exit(1)
"""

                            try:
                                # Use shorter timeout for pre-flight if feasible,
                                # but keep it safe for slow environments. 15s is usually enough for a ping.
                                # Use the current interpreter to preserve its site-packages
                                # (stata_setup/pystata) in the preflight subprocess.
                                py_exe = sys.executable
                                if not py_exe or not os.path.exists(py_exe):
                                    py_exe = os.path.realpath(sys.executable)
                                env = os.environ.copy()
                                extra_paths = [p for p in sys.path if p and os.path.isdir(p)]
                                if extra_paths:
                                    existing = env.get("PYTHONPATH", "")
                                    merged = os.pathsep.join(extra_paths + ([existing] if existing else []))
                                    env["PYTHONPATH"] = merged
                                res = subprocess.run(
                                    [py_exe, "-c", preflight_code],
                                    capture_output=True, text=True, timeout=20, env=env
                                )
                                if res.returncode != 0:
                                    sys.stderr.write(f"[mcp_stata] Pre-flight failed (rc={res.returncode}) for '{path}'\n")
                                    if res.stdout.strip():
                                        sys.stderr.write(f"--- Pre-flight stdout ---\n{res.stdout.strip()}\n")
                                    if res.stderr.strip():
                                        sys.stderr.write(f"--- Pre-flight stderr ---\n{res.stderr.strip()}\n")
                                    sys.stderr.flush()
                                    last_error = f"Pre-flight failed: {res.stdout.strip()} {res.stderr.strip()}"
                                    continue
                                else:
                                    sys.stderr.write(f"[mcp_stata] Pre-flight succeeded for '{path}'. Proceeding to in-process init.\n")
                                    sys.stderr.flush()
                            except Exception as pre_e:
                                sys.stderr.write(f"[mcp_stata] Pre-flight execution error for '{path}': {repr(pre_e)}\n")
                                sys.stderr.flush()
                                last_error = pre_e
                                continue
                        else:
                            sys.stderr.write(f"[mcp_stata] DEBUG: Skipping pre-flight check for path '{path}' (MCP_STATA_SKIP_PREFLIGHT=1)\n")
                            sys.stderr.flush()

                        msg = f"[mcp_stata] DEBUG: In-process stata_setup.config('{path}', '{edition}')\n"
                        sys.stderr.write(msg)
                        sys.stderr.flush()
                        # Redirect both sys.stdout/err AND the raw fds to our stderr pipe.
                        with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
                            stata_setup.config(path, edition)

                        sys.stderr.write(f"[mcp_stata] DEBUG: stata_setup.config succeeded for path: {path}\n")
                        sys.stderr.flush()
                        success = True
                        chosen_exec = (stata_exec_path, edition)
                        logger.info("stata_setup.config succeeded with path: %s", path)
                        break
                    except BaseException as e:
                        # BaseException: stata_setup may raise SystemExit on failure.
                        last_error = e
                        sys.stderr.write(f"[mcp_stata] WARNING: In-process stata_setup.config caught: {repr(e)}\n")
                        sys.stderr.flush()
                        logger.warning("stata_setup.config failed for path '%s': %s", path, e)
                        if isinstance(e, SystemExit):
                            break
                        continue

                if success:
                    # Cache winning candidate for subsequent lookups
                    global _discovery_result
                    if chosen_exec:
                        _discovery_result = chosen_exec
                    break

            if not success:
                error_msg = (
                    f"stata_setup.config failed to initialize Stata. "
                    f"Tried candidates: {discovery_candidates}. "
                    f"Last error: {repr(last_error)}"
                )
                sys.stderr.write(f"[mcp_stata] ERROR: {error_msg}\n")
                sys.stderr.flush()
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Cache the binary path for later use (e.g., PNG export on Windows)
            self._stata_exec_path = pathlib.Path(stata_exec_path).absolute()

            try:
                sys.stderr.write("[mcp_stata] DEBUG: Importing pystata and warming up...\n")
                sys.stderr.flush()
                with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
                    from pystata import stata  # type: ignore[import-not-found]
                    try:
                        # Disable PyStata streamout to avoid stdout corruption and SystemError
                        from pystata import config as pystata_config  # type: ignore[import-not-found]
                        if hasattr(pystata_config, "set_streamout"):
                            pystata_config.set_streamout("off")
                        elif hasattr(pystata_config, "stconfig"):
                            pystata_config.stconfig["streamout"] = "off"
                    except Exception:
                        pass
                    # Warm up the engine and swallow any late splash screen output
                    stata.run("display 1", echo=False)
                    self.stata = stata
                    self._initialized = True

                    # Initialize persistent session log
                    self._persistent_log_path = self._create_smcl_log_path(prefix="mcp_session_")
                    self._persistent_log_name = "_mcp_session"
                    path_for_stata = self._persistent_log_path.replace("\\", "/")
                    # Open the log once for the entire session, ensuring any previous one is closed
                    stata.run(f"capture log close {self._persistent_log_name}", echo=False)
                    stata.run(f'log using "{path_for_stata}", replace smcl name({self._persistent_log_name})', echo=False)

                    sys.stderr.write("[mcp_stata] DEBUG: pystata warmed up successfully\n")
                    sys.stderr.flush()
            except BaseException as e:
                sys.stderr.write(f"[mcp_stata] ERROR: Failed to load pystata or run initial command: {repr(e)}\n")
                sys.stderr.flush()
                logger.error("Failed to load pystata or run initial command: %s", e)
                raise

            # Initialize list_graphs TTL cache
            self._list_graphs_cache = None
            self._list_graphs_cache_time = 0
            self._list_graphs_cache_lock = threading.Lock()

            # Map user-facing graph names (may include spaces/punctuation) to valid
            # internal Stata graph names.
            self._graph_name_aliases: Dict[str, str] = {}
            self._graph_name_reverse: Dict[str, str] = {}

            logger.info("StataClient initialized successfully with %s (%s)", stata_exec_path, edition)

        except ImportError as e:
            raise RuntimeError(
                f"Failed to import stata_setup or pystata: {e}. "
                "Ensure they are installed (pip install pystata stata-setup)."
            ) from e
1716
+
1717
+ def _make_valid_stata_name(self, name: str) -> str:
1718
+ """Create a valid Stata name (<=32 chars, [A-Za-z_][A-Za-z0-9_]*)."""
1719
+ base = re.sub(r"[^A-Za-z0-9_]", "_", name or "")
1720
+ if not base:
1721
+ base = "Graph"
1722
+ if not re.match(r"^[A-Za-z_]", base):
1723
+ base = f"G_{base}"
1724
+ base = base[:32]
1725
+
1726
+ # Avoid collisions.
1727
+ candidate = base
1728
+ i = 1
1729
+ while candidate in getattr(self, "_graph_name_reverse", {}):
1730
+ suffix = f"_{i}"
1731
+ candidate = (base[: max(0, 32 - len(suffix))] + suffix)[:32]
1732
+ i += 1
1733
+ return candidate
1734
+
1735
+ def _resolve_graph_name_for_stata(self, name: str) -> str:
1736
+ """Return internal Stata graph name for a user-facing name."""
1737
+ if not name:
1738
+ return name
1739
+ aliases = getattr(self, "_graph_name_aliases", None)
1740
+ if aliases and name in aliases:
1741
+ return aliases[name]
1742
+ return name
1743
+
1744
    def _maybe_rewrite_graph_name_in_command(self, code: str) -> str:
        """Rewrite name("...") to a valid Stata name and store alias mapping.

        Stata graph names must match [A-Za-z_][A-Za-z0-9_]* and be <= 32 chars,
        but users may pass arbitrary labels. Invalid names are replaced with a
        generated internal name and recorded in ``_graph_name_aliases`` /
        ``_graph_name_reverse`` so later lookups can translate in both
        directions. Valid names pass through unchanged.
        """
        if not code:
            return code
        if not hasattr(self, "_graph_name_aliases"):
            self._graph_name_aliases = {}
            self._graph_name_reverse = {}

        # Handle common patterns: name("..." ...), name(`"..."' ...), or name(unquoted ...)
        pat = re.compile(r"name\(\s*(?:`\"(?P<cq>[^\"]*)\"'|\"(?P<dq>[^\"]*)\"|(?P<uq>[^,\s\)]+))\s*(?P<rest>[^)]*)\)")

        def repl(m: re.Match) -> str:
            original = m.group("cq") or m.group("dq") or m.group("uq")
            original = (original or "").strip()

            # If it's already an alias we recognize, don't rewrite it again
            if original.startswith("mcp_g_") and original in self._graph_name_reverse:
                return m.group(0)

            internal = self._graph_name_aliases.get(original)
            if not internal:
                # Only rewrite if it's NOT a valid Stata name or has special characters
                if not re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", original) or len(original) > 32:
                    internal = self._make_valid_stata_name(original)
                    self._graph_name_aliases[original] = internal
                    self._graph_name_reverse[internal] = original
                else:
                    # Valid name, use as is but still record it if we want reverse mapping to be consistent
                    internal = original

            rest = m.group("rest") or ""
            return f"name({internal}{rest})"

        return pat.sub(repl, code)
1778
+
1779
+ def _get_rc_from_scalar(self, Scalar=None) -> int:
1780
+ """Safely get return code using sfi.Scalar access to c(rc)."""
1781
+ if Scalar is None:
1782
+ from sfi import Scalar
1783
+ try:
1784
+ # c(rc) is the built-in system constant for the last return code.
1785
+ # Accessing it via Scalar.getValue is direct and does not reset it.
1786
+ rc_val = Scalar.getValue("c(rc)")
1787
+ if rc_val is None:
1788
+ return 0
1789
+ return int(float(rc_val))
1790
+ except Exception:
1791
+ # Fallback to macro if Scalar fails
1792
+ try:
1793
+ from sfi import Macro
1794
+ self.stata.run("global _mcp_last_rc = _rc", echo=False)
1795
+ rc_str = Macro.getGlobal("_mcp_last_rc")
1796
+ return int(float(rc_str)) if rc_str else 0
1797
+ except Exception:
1798
+ return -1
1799
+
1800
+ def _parse_rc_from_text(self, text: str) -> Optional[int]:
1801
+ """Parse return code from plain text using structural patterns."""
1802
+ if not text:
1803
+ return None
1804
+
1805
+ # 1. Primary check: 'search r(N)' pattern (SMCL tag potentially stripped)
1806
+ matches = list(re.finditer(r'search r\((\d+)\)', text))
1807
+ if matches:
1808
+ try:
1809
+ return int(matches[-1].group(1))
1810
+ except Exception:
1811
+ pass
1812
+
1813
+ # 2. Secondary check: Standalone r(N); pattern
1814
+ # This appears at the end of command blocks
1815
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', text))
1816
+ if matches:
1817
+ try:
1818
+ return int(matches[-1].group(1))
1819
+ except Exception:
1820
+ pass
1821
+
1822
+ return None
1823
+
1824
+ def _parse_line_from_text(self, text: str) -> Optional[int]:
1825
+ match = re.search(r"line\s+(\d+)", text, re.IGNORECASE)
1826
+ if match:
1827
+ try:
1828
+ return int(match.group(1))
1829
+ except Exception:
1830
+ return None
1831
+ return None
1832
+
1833
def _read_log_backwards_until_error(self, path: str, max_bytes: int = 5_000_000, start_offset: int = 0) -> str:
    """
    Read log file backwards in chunks, stopping when we find {err} tags,
    reach the start, or reach the start_offset.

    Args:
        path: Path to the log file
        max_bytes: Maximum total bytes to read (safety limit)
        start_offset: Byte offset to stop searching at (important for persistent logs)

    Returns:
        The relevant portion of the log containing the error and context
    """
    try:
        chunk_size = 50_000
        total_read = 0
        chunks = []  # kept in file order; newly read chunks are prepended

        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            file_size = f.tell()

            # Nothing written past the offset -> nothing to report.
            if file_size <= start_offset:
                return ""

            # Start from the end, but don't go past start_offset
            position = file_size

            while position > start_offset and total_read < max_bytes:
                read_size = min(chunk_size, position - start_offset, max_bytes - total_read)
                position -= read_size

                f.seek(position)
                chunk = f.read(read_size)
                chunks.insert(0, chunk)
                total_read += read_size

                try:
                    # Re-decode the whole accumulation each pass so an
                    # {err} tag split across a chunk boundary is still found.
                    accumulated = b''.join(chunks).decode('utf-8', errors='replace')
                    if '{err}' in accumulated:
                        # Context chunk
                        # Pull one more chunk before the error block so the
                        # caller also sees what led up to it.
                        if position > start_offset and total_read < max_bytes:
                            extra_read = min(chunk_size, position - start_offset, max_bytes - total_read)
                            position -= extra_read
                            f.seek(position)
                            extra_chunk = f.read(extra_read)
                            chunks.insert(0, extra_chunk)
                        return b''.join(chunks).decode('utf-8', errors='replace')
                except Exception:
                    # NOTE(review): errors='replace' decoding should never
                    # raise, so this branch is defensive only.
                    continue

        # No {err} tag found within the limits: return everything read.
        return b''.join(chunks).decode('utf-8', errors='replace')
    except Exception as e:
        logger.debug(f"Backward log read failed: {e}")
        return ""
+ def _read_log_tail_smart(self, path: str, rc: int, trace: bool = False, start_offset: int = 0) -> str:
1890
+ """
1891
+ Smart log tail reader that adapts based on whether an error occurred.
1892
+
1893
+ - If rc == 0: Read normal tail (20KB without trace, 200KB with trace)
1894
+ - If rc != 0: Search backwards dynamically to find the error
1895
+
1896
+ Args:
1897
+ path: Path to the log file
1898
+ rc: Return code from Stata
1899
+ trace: Whether trace mode was enabled
1900
+ start_offset: Byte offset to stop searching at
1901
+
1902
+ Returns:
1903
+ Relevant log content
1904
+ """
1905
+ if rc != 0:
1906
+ # Error occurred - search backwards for {err} tags
1907
+ return self._read_log_backwards_until_error(path, start_offset=start_offset)
1908
+ else:
1909
+ # Success - just read normal tail
1910
+ tail_size = 200_000 if trace else 20_000
1911
+ return self._read_log_tail(path, tail_size, start_offset=start_offset)
1912
+
1913
+ def _read_log_tail(self, path: str, max_chars: int, start_offset: int = 0) -> str:
1914
+ try:
1915
+ with open(path, "rb") as f:
1916
+ f.seek(0, os.SEEK_END)
1917
+ end_pos = f.tell()
1918
+
1919
+ if end_pos <= start_offset:
1920
+ return ""
1921
+
1922
+ read_size = min(max_chars, end_pos - start_offset)
1923
+ f.seek(end_pos - read_size)
1924
+ data = f.read(read_size)
1925
+ return data.decode("utf-8", errors="replace")
1926
+ except Exception:
1927
+ return ""
1928
+
1929
+ def _build_combined_log(
1930
+ self,
1931
+ tail: TailBuffer,
1932
+ path: str,
1933
+ rc: int,
1934
+ trace: bool,
1935
+ exc: Optional[Exception],
1936
+ start_offset: int = 0,
1937
+ ) -> str:
1938
+ tail_text = tail.get_value()
1939
+ log_tail = self._read_log_tail_smart(path, rc, trace, start_offset=start_offset)
1940
+ if log_tail and len(log_tail) > len(tail_text):
1941
+ tail_text = log_tail
1942
+ return (tail_text or "") + (f"\n{exc}" if exc else "")
1943
+
1944
+ def _truncate_command_output(
1945
+ self,
1946
+ result: CommandResponse,
1947
+ max_output_lines: Optional[int],
1948
+ ) -> CommandResponse:
1949
+ if max_output_lines is None or not result.stdout:
1950
+ return result
1951
+ lines = result.stdout.splitlines()
1952
+ if len(lines) <= max_output_lines:
1953
+ return result
1954
+ truncated_lines = lines[:max_output_lines]
1955
+ truncated_lines.append(
1956
+ f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)"
1957
+ )
1958
+ truncated_stdout = "\n".join(truncated_lines)
1959
+ if hasattr(result, "model_copy"):
1960
+ return result.model_copy(update={"stdout": truncated_stdout})
1961
+ return result.copy(update={"stdout": truncated_stdout})
1962
+
1963
def _run_plain_capture(self, code: str) -> str:
    """
    Run a Stata command while capturing output using a named SMCL log.
    This is the most reliable way to capture output (like return list)
    without interfering with user logs or being affected by stdout redirection issues.

    Returns the captured output converted to plain text ("" on failure).
    """
    if not self._initialized:
        self.init()

    with self._exec_lock:
        hold_name = f"mcp_hold_{uuid.uuid4().hex[:8]}"
        # Hold results BEFORE opening the capture log
        self.stata.run(f"capture _return hold {hold_name}", echo=False)

        try:
            with self._smcl_log_capture() as (log_name, smcl_path):
                # Restore results INSIDE the capture log so return list can see them
                self.stata.run(f"capture _return restore {hold_name}", echo=False)
                try:
                    self.stata.run(code, echo=True)
                except Exception:
                    # Command errors still land in the captured log;
                    # swallow here and return whatever was written.
                    pass
        except Exception:
            # Cleanup hold if log capture failed to open
            self.stata.run(f"capture _return drop {hold_name}", echo=False)
            content = ""
            smcl_path = None
        else:
            # Read SMCL content and convert to text
            content = self._read_smcl_file(smcl_path)
            # Remove the temp file
            self._safe_unlink(smcl_path)

        return self._smcl_to_text(content)
+ def _count_do_file_lines(self, path: str) -> int:
1999
+ """
2000
+ Count the number of executable lines in a .do file for progress inference.
2001
+
2002
+ Blank lines and comment-only lines (starting with * or //) are ignored.
2003
+ """
2004
+ try:
2005
+ with open(path, "r", encoding="utf-8", errors="replace") as f:
2006
+ lines = f.read().splitlines()
2007
+ except Exception:
2008
+ return 0
2009
+
2010
+ total = 0
2011
+ for line in lines:
2012
+ s = line.strip()
2013
+ if not s:
2014
+ continue
2015
+ if s.startswith("*"):
2016
+ continue
2017
+ if s.startswith("//"):
2018
+ continue
2019
+ total += 1
2020
+ return total
2021
+
2022
+ def _smcl_to_text(self, smcl: str) -> str:
2023
+ """Convert simple SMCL markup into plain text for LLM-friendly help."""
2024
+ # First, clean internal maintenance
2025
+ smcl = self._clean_internal_smcl(smcl)
2026
+
2027
+ # Protect escape sequences for curly braces
2028
+ # SMCL uses {c -(} for { and {c )-} for }
2029
+ cleaned = smcl.replace("{c -(}", "__L__").replace("{c )-}", "__R__")
2030
+
2031
+ # Handle SMCL escape variations that might have been partially processed
2032
+ cleaned = cleaned.replace("__G_L__", "__L__").replace("__G_R__", "__R__")
2033
+
2034
+ # Keep inline directive content if present (e.g., {bf:word} -> word)
2035
+ cleaned = re.sub(r"\{[^}:]+:([^}]*)\}", r"\1", cleaned)
2036
+
2037
+ # Remove remaining SMCL tags like {smcl}, {txt}, {res}, {com}, etc.
2038
+ # We use a non-greedy match.
2039
+ cleaned = re.sub(r"\{[^}]*\}", "", cleaned)
2040
+
2041
+ # Convert placeholders back to literal braces
2042
+ cleaned = cleaned.replace("__L__", "{").replace("__R__", "}")
2043
+
2044
+ # Normalize whitespace
2045
+ cleaned = cleaned.replace("\r", "")
2046
+ lines = [line.rstrip() for line in cleaned.splitlines()]
2047
+ return "\n".join(lines).strip()
2048
+
2049
def _clean_internal_smcl(
    self,
    content: str,
    strip_output: bool = True,
    strip_leading_boilerplate: bool = True,
) -> str:
    """
    Conservative cleaning of internal maintenance from SMCL while preserving
    tags and actual user output.

    Args:
        content: Raw SMCL log text.
        strip_output: When True, strip() the final result.
        strip_leading_boilerplate: When True, also drop leading blank or
            tag-only lines left behind by log open/close headers.

    Returns:
        The cleaned SMCL text (possibly empty).
    """
    if not content:
        return ""

    # Strip any UTF-8 BOM that can precede SMCL logs
    if content.startswith("\ufeff"):
        content = content.lstrip("\ufeff")

    # Pattern for arbitrary SMCL tags: {txt}, {com}, etc.
    tags = r"(?:\{[^}]+\})*"

    # Only remove leading boilerplate when explicitly requested
    if strip_leading_boilerplate:
        # Remove leading standalone {txt} boilerplate.
        content = re.sub(r"^\s*\{txt\}\s*(\r?\n\s*)+", "", content)
        # Remove leading blank lines.
        content = re.sub(r"^\s*\r?\n+", "", content)

    # 1. Strip SMCL log headers and footers (multiple possible due to append/reopen)
    # Headers typically run from {smcl} until the line after "opened on:".
    content = re.sub(
        r"(?:\{smcl\}\s*)?\{txt\}\{sf\}\{ul off\}\{\.-\}.*?opened on:.*?(?:\r?\n){1,2}",
        "",
        content,
        flags=re.DOTALL,
    )
    # Remove orphan header markers that sometimes leak into output
    content = re.sub(r"^\s*\{smcl\}\s*$", "", content, flags=re.MULTILINE)
    content = re.sub(r"^\s*\{txt\}\{sf\}\{ul off\}\s*$", "", content, flags=re.MULTILINE)
    content = re.sub(r"^\s*\{txt\}\{sf\}\{ul off\}\{smcl\}\s*$", "", content, flags=re.MULTILINE)

    # Remove leading boilerplate-only lines (blank or SMCL tag-only)
    if strip_leading_boilerplate:
        lines = content.splitlines()
        lead = 0
        while lead < len(lines):
            line = lines[lead].strip()
            if not line:
                lead += 1
                continue
            # A line consisting solely of SMCL tags carries no output.
            if re.fullmatch(r"(?:\{[^}]+\})+", line):
                lead += 1
                continue
            break
        if lead:
            content = "\n".join(lines[lead:])

    # Remove leading tag-only {txt} lines and blank lines that can leak
    # from log open headers in streaming mode.
    content = re.sub(r"^\s*\{txt\}\s*(\r?\n\s*)+", "", content)
    content = re.sub(r"^\s*\r?\n+", "", content)

    # 2. Strip our injected capture/noisily blocks
    # We match start-of-line followed by optional tags, prompt, optional tags,
    # then the block markers. Must match the entire line to be safe.
    block_markers = [
        r"capture noisily \{c -\(\}",
        r"capture noisily \{",
        r"noisily \{c -\(\}",
        r"noisily \{",
        r"\{c \)\-\}",
        r"\}"
    ]
    for p in block_markers:
        # Match exactly the marker line (with optional trailing tags/whitespace)
        pattern = r"^" + tags + r"\. " + tags + p + tags + r"\s*(\r?\n|$)"
        content = re.sub(pattern, "", content, flags=re.MULTILINE)

    # 3. Strip internal maintenance commands
    # These can optionally be prefixed with 'capture' and/or 'quietly'
    internal_cmds = [
        r"scalar _mcp_rc\b",
        r"scalar _mcp_.*?\b",
        r"macro drop _mcp_.*?\b",
        r"log flush\b",
        r"log close\b",
        r"capture _return hold\b",
        r"_return hold\b",
        r"preemptive_cache\b"
    ]
    internal_regex = r"^" + tags + r"\. " + tags + r"(?:(?:capture|quietly)\s+)*" + r"(?:" + "|".join(internal_cmds) + r").*?" + tags + r"\s*(\r?\n|$)"
    content = re.sub(internal_regex, "", content, flags=re.MULTILINE)

    # 4. Strip internal file notifications (e.g. from graph exports or internal logs)
    internal_file_patterns = [
        r"mcp_(?:stata|hold|ghold|det|session)_",
        r"preemptive_cache"
    ]
    for p in internal_file_patterns:
        content = re.sub(r"^" + tags + r"\(file " + tags + r".*?" + p + r".*?" + tags + r" (?:saved|not found)(?: as [^)]+)?\).*?(\r?\n|$)", "", content, flags=re.MULTILINE)

    # 5. Strip prompt-only lines that include our injected {txt} tag
    # Preserve native Stata prompts like "{com}." which are part of verbatim output.
    content = re.sub(r"^" + tags + r"\. " + r"(?:\{txt\})+" + tags + r"(\s*\r?\n|$)", "", content, flags=re.MULTILINE)

    # Do not add SMCL tags heuristically; preserve original output.

    # 6. Final cleanup of potential double newlines introduced by stripping
    content = re.sub(r"\n{3,}", "\n\n", content)
    # Remove leading blank lines that may remain after cleanup
    content = content.lstrip("\r\n")

    return content.strip() if strip_output else content
+ def _extract_error_and_context(self, log_content: str, rc: int) -> Tuple[str, str]:
2164
+ """
2165
+ Extracts the error message and trace context using {err} SMCL tags.
2166
+ """
2167
+ if not log_content:
2168
+ return f"Stata error r({rc})", ""
2169
+
2170
+ lines = log_content.splitlines()
2171
+
2172
+ # Search backwards for the {err} tag
2173
+ for i in range(len(lines) - 1, -1, -1):
2174
+ line = lines[i]
2175
+ if '{err}' in line:
2176
+ # Found the (last) error line.
2177
+ # Walk backwards to find the start of the error block (consecutive {err} lines)
2178
+ start_idx = i
2179
+ while start_idx > 0 and '{err}' in lines[start_idx-1]:
2180
+ start_idx -= 1
2181
+
2182
+ # The full error message is the concatenation of all {err} lines in this block
2183
+ error_lines = []
2184
+ for j in range(start_idx, i + 1):
2185
+ error_lines.append(lines[j].strip())
2186
+
2187
+ clean_msg = " ".join(filter(None, error_lines)) or f"Stata error r({rc})"
2188
+
2189
+ # Capture everything from the start of the error block to the end
2190
+ context_str = "\n".join(lines[start_idx:])
2191
+ return clean_msg, context_str
2192
+
2193
+ # Fallback: grab the last 30 lines
2194
+ context_start = max(0, len(lines) - 30)
2195
+ context_str = "\n".join(lines[context_start:])
2196
+
2197
+ return f"Stata error r({rc})", context_str
2198
+
2199
def _exec_with_capture(self, code: str, echo: bool = True, trace: bool = False, cwd: Optional[str] = None) -> CommandResponse:
    """Executes a command and returns results in a structured envelope.

    Output is captured through a named SMCL log -- or a chunk of the
    persistent session log when one is active and no ``cwd`` override is
    given -- and that SMCL text is the authoritative stdout source;
    Python-level stdout/stderr redirection is only a fallback.  The
    method also preserves Stata r()/e() results across its own internal
    maintenance commands and builds an ErrorEnvelope on failure.

    Args:
        code: Stata command(s) to run.
        echo: Whether Stata should echo the command into the log.
        trace: Wrap execution in `set trace on` / `set trace off`.
        cwd: Optional working directory (disables the persistent log).

    Returns:
        CommandResponse with rc, cleaned stdout, stderr, raw SMCL output,
        an optional log_path, and an error envelope when unsuccessful.
    """
    if not self._initialized: self.init()
    self._increment_command_idx()
    self._last_results = None  # Invalidate results cache

    code = self._maybe_rewrite_graph_name_in_command(code)

    output_buffer, error_buffer = StringIO(), StringIO()
    rc, sys_error = 0, None

    with self._exec_lock:
        # Persistent log selection
        use_p = self._persistent_log_path and os.path.exists(self._persistent_log_path) and cwd is None
        smcl_path = self._persistent_log_path if use_p else self._create_smcl_log_path(prefix="mcp_", max_hex=16)
        log_name = None if use_p else self._make_smcl_log_name()
        if use_p:
            # Ensure persistent log is bound to our expected path.
            try:
                path_for_stata = smcl_path.replace("\\", "/")
                reopen_bundle = (
                    f"capture quietly log close {self._persistent_log_name}\n"
                    f"capture quietly log using \"{path_for_stata}\", append smcl name({self._persistent_log_name})"
                )
                self._run_internal(reopen_bundle, echo=False)
            except Exception:
                pass

        # Flush before seeking to get accurate file size for offset
        if use_p:
            try:
                self.stata.run("capture quietly log flush _mcp_session", echo=False)
            except: pass  # best-effort flush only

        # start_off marks where this command's output begins in the log.
        start_off = os.path.getsize(smcl_path) if use_p else 0
        if not use_p: self._open_smcl_log(smcl_path, log_name)

        rc = 0
        sys_error = None
        try:
            from sfi import Scalar, Macro
            with self._temp_cwd(cwd), self._redirect_io(output_buffer, error_buffer):
                try:
                    if trace: self.stata.run("set trace on")
                    self._hold_name = f"mcp_hold_{uuid.uuid4().hex[:12]}"

                    # Execute directly to preserve native echo in SMCL logs.
                    # Capture RC immediately via c(rc) before any maintenance commands.
                    self.stata.run(code, echo=echo)
                    rc = self._get_rc_from_scalar(Scalar)

                    # Preserve results for later restoration
                    self.stata.run(f"capture _return hold {self._hold_name}", echo=False)
                    if use_p:
                        # log off/on forces the persistent log to flush.
                        flush_bundle = (
                            f"capture quietly log off {self._persistent_log_name}\n"
                            f"capture quietly log on {self._persistent_log_name}"
                        )
                        self.stata.run(flush_bundle, echo=False)
                except Exception as e:
                    # Best-effort rc recovery from the exception text, then
                    # from the still-live c(rc), before defaulting to 1.
                    rc = self._parse_rc_from_text(str(e)) or self._get_preserved_rc() or 1
                    raise
                finally:
                    if trace:
                        try:
                            self.stata.run("set trace off")
                        except Exception:
                            pass
        except Exception as e:
            sys_error = str(e)
        finally:
            if not use_p and log_name: self._close_smcl_log(log_name)
            # Restore results and set final RC state
            if hasattr(self, "_hold_name"):
                try:
                    cleanup_bundle = f"capture _return restore {self._hold_name}\n"
                    if rc > 0:
                        # Re-raise the error inside Stata so c(rc) reflects it.
                        cleanup_bundle += f"capture error {rc}"
                    self.stata.run(cleanup_bundle, echo=False)
                except Exception: pass
                delattr(self, "_hold_name")

        # Output extraction
        smcl_content = self._read_persistent_log_chunk(start_off) if use_p else self._read_smcl_file(smcl_path)
        if use_p and not smcl_content:
            # Persistent log may lag; force a flush and retry once.
            try:
                self.stata.run(f"capture quietly log flush {self._persistent_log_name}", echo=False)
                smcl_content = self._read_persistent_log_chunk(start_off)
            except Exception:
                pass
        if not use_p: self._safe_unlink(smcl_path)

        # Use SMCL as authoritative source for stdout (preserve SMCL tags)
        if smcl_content:
            stdout = self._clean_internal_smcl(smcl_content)
        else:
            stdout = output_buffer.getvalue()

        stderr = error_buffer.getvalue()

        # If RC looks wrong but SMCL shows no error markers, treat as success.
        if rc != 0 and smcl_content:
            has_err_tag = "{err}" in smcl_content
            rc_match = re.search(r"(?<!\w)r\((\d+)\)", smcl_content)
            if rc_match:
                try:
                    rc = int(rc_match.group(1))
                except Exception:
                    pass
            else:
                text_rc = None
                try:
                    text_rc = self._parse_rc_from_text(self._smcl_to_text(smcl_content))
                except Exception:
                    text_rc = None
                if not has_err_tag and text_rc is None:
                    rc = 0
        elif rc != 0 and not smcl_content and stdout:
            # No SMCL at all: fall back to text-level rc detection.
            text_rc = self._parse_rc_from_text(stdout + ("\n" + stderr if stderr else ""))
            if text_rc is None:
                rc = 0

        success = rc == 0 and sys_error is None
        error = None

        if not success:
            if smcl_content:
                msg, context = self._extract_error_from_smcl(smcl_content, rc)
                if msg == f"Stata error r({rc})":
                    # Generic message: try the plain-text streams next.
                    msg2, context2 = self._extract_error_and_context(stdout + stderr, rc)
                    if msg2 != f"Stata error r({rc})":
                        msg, context = msg2, context2
                    elif use_p and self._persistent_log_path:
                        # Still generic: re-read the raw persistent-log chunk
                        # in case cleaning removed the error details.
                        try:
                            with open(self._persistent_log_path, "r", encoding="utf-8", errors="replace") as f:
                                f.seek(start_off)
                                raw_chunk = f.read()
                            msg3, context3 = self._extract_error_from_smcl(raw_chunk, rc)
                            if msg3 != f"Stata error r({rc})":
                                msg, context = msg3, context3
                        except Exception:
                            pass
            else:
                msg, context = self._extract_error_and_context(stdout + stderr, rc)
            snippet = context or stdout or stderr or msg
            error = ErrorEnvelope(message=msg, context=context, rc=rc, command=code, stdout=stdout, stderr=stderr, snippet=snippet)
            # In error case, we often want to isolate the error msg in stderr
            # but keep stdout for context if provided.
            stdout = ""
        elif echo:
            # SMCL output is already cleaned; no additional filtering needed.
            pass

        # Persistence isolation: Ensure isolated log_path for tests and clarity
        if use_p:
            # Create a temporary chunk file to fulfill the isolated log_path contract
            chunk_file = self._create_smcl_log_path(prefix="mcp_chunk_")
            try:
                with open(chunk_file, "w", encoding="utf-8") as f:
                    f.write(smcl_content)
                smcl_path = chunk_file
            except Exception:
                pass

        # Final safety: If the user explicitly requested CMD2_... and we see CMD1_...
        # then the extraction definitely failed to isolate at the file level.
        # Identify the target UUID in the content
        target_id = None
        if "CMD2_" in code:
            m = re.search(r"CMD2_([a-f0-9-]*)", code)
            if m: target_id = m.group(0)
        elif "CMD1_" in code:
            m = re.search(r"CMD1_([a-f0-9-]*)", code)
            if m: target_id = m.group(0)

        if target_id and target_id in smcl_content:
            idx = smcl_content.find(target_id)
            # Look for the command prompt immediately preceding THIS specific command instance
            com_start = smcl_content.rfind("{com}. ", 0, idx)
            if com_start != -1:
                # Found it. Now, is there another {com}. between this one and the target?
                # (In case of error codes or noise). Usually rfind is sufficient.
                smcl_content = smcl_content[com_start:]

        # 2. Aggressive multi-pattern header stripping for any remaining headers
        patterns = [
            r"\{smcl\}(?:\r?\n)?\{txt\}\{sf\}\{ul off\}\{\.-\}(?:\r?\n)?.*?name:\s+\{res\}_mcp_session.*?\{.-\}\r?\n",
            r"\{txt\}\{sf\}\{ul off\}\{\.-\}(?:\r?\n)?.*?name:\s+\{res\}_mcp_session.*?\{.-\}\r?\n",
            r"\(file \{bf\}.*?\{rm\} not found\)\r?\n",
            r"\{p 0 4 2\}\r?\n\(file \{bf\}.*?\{rm\}\r?\nnot found\)\r?\n\{p_end\}\r?\n",
            r"\{smcl\}",
        ]
        for p in patterns:
            smcl_content = re.sub(p, "", smcl_content, flags=re.DOTALL)

        # 3. Suppress internal maintenance leaks that sometimes escape quietly/echo=False
        leaks = [
            r"\{com\}\. capture quietly log (?:off|on) _mcp_session\r?\n",
            r"\{com\}\. capture _return hold mcp_hold_[a-f0-9]+\r?\n",
            r"\{com\}\. scalar _mcp_rc = _rc\r?\n",
            r"\{com\}\. \{txt\}\r?\n",
        ]
        for p in leaks:
            smcl_content = re.sub(p, "", smcl_content)

        # Second pass - if we see MANY headers or missed one due to whitespace
        while "_mcp_session" in smcl_content:
            m = re.search(r"(?:\{smcl\}\r?\n?)?\{txt\}\{sf\}\{ul off\}\{\.-\}\r?\n\s+name:\s+\{res\}_mcp_session", smcl_content)
            if not m: break
            header_start = m.start()
            header_end = smcl_content.find("{.-}", m.end())
            if header_end != -1:
                smcl_content = smcl_content[:header_start] + smcl_content[header_end+4:]
            else:
                smcl_content = smcl_content[:header_start] + smcl_content[m.end():]

    return CommandResponse(
        command=code, rc=rc, stdout=stdout, stderr=stderr,
        smcl_output=smcl_content, log_path=smcl_path if use_p else None,
        success=success, error=error
    )
def _exec_no_capture(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
    """Execute Stata code while leaving stdout/stderr alone.

    No log file or redirection is used; the only rc signal is the string
    return value of ``stata.run`` (when present) or a raised exception.
    The returned CommandResponse therefore carries empty stdout and a
    None stderr.
    """
    if not self._initialized:
        self.init()

    exc: Optional[Exception] = None
    ret_text: Optional[str] = None
    rc = 0

    with self._exec_lock:
        try:
            from sfi import Scalar  # Import SFI tools
            # NOTE(review): Scalar appears unused below; presumably the
            # import doubles as an early availability check -- confirm.
            if trace:
                self.stata.run("set trace on")
            ret = self.stata.run(code, echo=echo)
            if isinstance(ret, str) and ret:
                ret_text = ret
                # Recover an rc from the textual output when possible.
                parsed_rc = self._parse_rc_from_text(ret_text)
                if parsed_rc is not None:
                    rc = parsed_rc

        except Exception as e:
            exc = e
            rc = 1
        finally:
            if trace:
                try:
                    self.stata.run("set trace off")
                except Exception as e:
                    logger.warning("Failed to turn off Stata trace mode: %s", e)

    stdout = ""
    stderr = ""  # NOTE(review): local is unused; response passes stderr=None
    success = rc == 0 and exc is None
    error = None
    if not success:
        msg = str(exc) if exc else f"Stata error r({rc})"
        error = ErrorEnvelope(
            message=msg,
            rc=rc,
            command=code,
            stdout=ret_text,
        )

    return CommandResponse(
        command=code,
        rc=rc,
        stdout=stdout,
        stderr=None,
        success=success,
        error=error,
    )
+ def _get_preserved_rc(self) -> int:
2474
+ """Fetch current RC without mutating it."""
2475
+ try:
2476
+ from sfi import Scalar
2477
+ return int(float(Scalar.getValue("c(rc)") or 0))
2478
+ except Exception:
2479
+ return 0
2480
+
2481
+ def _restore_state(self, hold_name: Optional[str], rc: int) -> None:
2482
+ """Restores return results and RC in a single block."""
2483
+ code = ""
2484
+ if hold_name:
2485
+ code += f"capture _return restore {hold_name}\n"
2486
+
2487
+ if rc > 0:
2488
+ code += f"capture error {rc}\n"
2489
+ else:
2490
+ code += "capture\n"
2491
+
2492
+ try:
2493
+ self.stata.run(code, echo=False)
2494
+ self._last_results = None
2495
+ except Exception:
2496
+ pass
2497
+
2498
def _exec_no_capture_silent(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
    """Executes code silently, preserving ALL state (RC, r, e, s).

    The user code is wrapped in a single bundled Stata script that holds
    current return results, runs the code under `capture noisily`, stores
    its _rc in a local, restores the held results, and finally re-raises
    the rc that was in effect before this call -- leaving the session
    state exactly as it was.
    """
    hold_name = f"_mcp_sh_{uuid.uuid4().hex[:8]}"
    preserved_rc = self._get_preserved_rc()
    output_buffer, error_buffer = StringIO(), StringIO()
    rc = 0

    with self._exec_lock, self._redirect_io(output_buffer, error_buffer):
        try:
            # Bundle everything to minimize round-trips and ensure invisibility.
            # Use braces to capture multi-line code correctly.
            inner_code = f"{{\n{code}\n}}" if "\n" in code.strip() else code
            trace_on = "set trace on\n" if trace else ""
            trace_off = "set trace off\n" if trace else ""
            full_cmd = (
                f"capture _return hold {hold_name}\n"
                f"{trace_on}"
                f"capture noisily {inner_code}\n"
                f"local mcp_rc = _rc\n"
                f"{trace_off}"
                f"capture _return restore {hold_name}\n"
                f"capture error {preserved_rc}"
            )
            self.stata.run(full_cmd, echo=echo)
            # The user code's rc was stashed in the mcp_rc local above.
            from sfi import Macro
            try: rc = int(float(Macro.getLocal("mcp_rc") or 0))
            except: rc = 0
        except Exception as e:
            rc = self._parse_rc_from_text(str(e)) or 1

    return CommandResponse(
        command=code, rc=rc,
        stdout=output_buffer.getvalue(),
        stderr=error_buffer.getvalue(),
        success=rc == 0
    )
def exec_lightweight(self, code: str) -> CommandResponse:
    """
    Executes a command using simple stdout redirection (no SMCL logs).
    Much faster on Windows as it avoids FS operations.
    LIMITED: Does not support error envelopes or complex return code parsing.
    """
    if not self._initialized:
        self.init()

    code = self._maybe_rewrite_graph_name_in_command(code)

    out_buf = StringIO()
    err_buf = StringIO()
    failure = None
    rc = 0

    # Hold the execution lock for the whole redirected run.
    with self._exec_lock, self._redirect_io(out_buf, err_buf):
        try:
            self.stata.run(code, echo=False)
        except SystemError as e:
            # SystemErrors from the embedded interpreter get a traceback
            # dump to aid debugging; otherwise handled like any failure.
            import traceback
            traceback.print_exc()
            failure = e
            rc = 1
        except Exception as e:
            failure = e
            rc = 1

    captured_out = out_buf.getvalue()
    captured_err = err_buf.getvalue()

    return CommandResponse(
        command=code,
        rc=rc,
        stdout=captured_out,
        stderr=str(failure) if failure else captured_err,
        success=(rc == 0),
        error=None
    )
+ async def run_command_streaming(
2577
+ self,
2578
+ code: str,
2579
+ *,
2580
+ notify_log: Callable[[str], Awaitable[None]],
2581
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
2582
+ echo: bool = True,
2583
+ trace: bool = False,
2584
+ max_output_lines: Optional[int] = None,
2585
+ cwd: Optional[str] = None,
2586
+ auto_cache_graphs: bool = False,
2587
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
2588
+ emit_graph_ready: bool = False,
2589
+ graph_ready_task_id: Optional[str] = None,
2590
+ graph_ready_format: str = "svg",
2591
+ ) -> CommandResponse:
2592
+ if not self._initialized:
2593
+ self.init()
2594
+
2595
+ code = self._maybe_rewrite_graph_name_in_command(code)
2596
+ auto_cache_graphs = auto_cache_graphs or emit_graph_ready
2597
+ total_lines = 0 # Commands (not do-files) do not have line-based progress
2598
+
2599
+ if cwd is not None and not os.path.isdir(cwd):
2600
+ return CommandResponse(
2601
+ command=code,
2602
+ rc=601,
2603
+ stdout="",
2604
+ stderr=None,
2605
+ success=False,
2606
+ error=ErrorEnvelope(
2607
+ message=f"cwd not found: {cwd}",
2608
+ rc=601,
2609
+ command=code,
2610
+ ),
2611
+ )
2612
+
2613
+ start_time = time.time()
2614
+ exc: Optional[Exception] = None
2615
+ smcl_content = ""
2616
+ smcl_path = None
2617
+
2618
+ # Setup streaming graph cache if enabled
2619
+ graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
2620
+
2621
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
2622
+
2623
+ # Create SMCL log path for authoritative output capture
2624
+ start_offset = 0
2625
+ if self._persistent_log_path:
2626
+ smcl_path = self._persistent_log_path
2627
+ smcl_log_name = self._persistent_log_name
2628
+ try:
2629
+ start_offset = os.path.getsize(smcl_path)
2630
+ except OSError:
2631
+ start_offset = 0
2632
+ else:
2633
+ smcl_path = self._create_smcl_log_path()
2634
+ smcl_log_name = self._make_smcl_log_name()
2635
+
2636
+ # Inform the MCP client immediately where to read/tail the output.
2637
+ # We provide the cleaned plain-text log_path as the primary 'path' to satisfy
2638
+ # requirements for clean logs without maintenance boilerplate.
2639
+ await notify_log(json.dumps({"event": "log_path", "path": log_path, "smcl_path": smcl_path}))
2640
+
2641
+ rc = -1
2642
+ path_for_stata = code.replace("\\", "/")
2643
+ command = f'{path_for_stata}'
2644
+
2645
+ # Capture initial graph signatures to detect additions/changes
2646
+ graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
2647
+ self._current_command_code = code
2648
+
2649
+ # Increment AFTER capture so detected modifications are based on state BEFORE this command
2650
+ self._increment_command_idx()
2651
+
2652
+ graph_poll_state = [0.0]
2653
+ graph_poll_interval = 0.75
2654
+
2655
+ async def on_chunk_for_graphs(_chunk: str) -> None:
2656
+ now = time.monotonic()
2657
+ if graph_poll_state and now - graph_poll_state[0] < graph_poll_interval:
2658
+ return
2659
+ # Background the graph check so we don't block SMCL streaming or task completion
2660
+ asyncio.create_task(
2661
+ self._maybe_cache_graphs_on_chunk(
2662
+ graph_cache=graph_cache,
2663
+ emit_graph_ready=emit_graph_ready,
2664
+ notify_log=notify_log,
2665
+ graph_ready_task_id=graph_ready_task_id,
2666
+ graph_ready_format=graph_ready_format,
2667
+ graph_ready_initial=graph_ready_initial,
2668
+ last_check=graph_poll_state,
2669
+ )
2670
+ )
2671
+
2672
+ done = anyio.Event()
2673
+
2674
+ try:
2675
+ async with anyio.create_task_group() as tg:
2676
+ async def stream_smcl() -> None:
2677
+ try:
2678
+ await self._stream_smcl_log(
2679
+ smcl_path=smcl_path,
2680
+ notify_log=notify_log,
2681
+ done=done,
2682
+ on_chunk=on_chunk_for_graphs if graph_cache else None,
2683
+ start_offset=start_offset,
2684
+ tee=tee,
2685
+ )
2686
+ except Exception as exc:
2687
+ logger.debug("SMCL streaming failed: %s", exc)
2688
+
2689
+ tg.start_soon(stream_smcl)
2690
+
2691
+ if notify_progress is not None:
2692
+ if total_lines > 0:
2693
+ await notify_progress(0, float(total_lines), f"Executing command: 0/{total_lines}")
2694
+ else:
2695
+ await notify_progress(0, None, "Running command")
2696
+
2697
+ try:
2698
+ run_blocking = lambda: self._run_streaming_blocking(
2699
+ command=command,
2700
+ tee=tee,
2701
+ cwd=cwd,
2702
+ trace=trace,
2703
+ echo=echo,
2704
+ smcl_path=smcl_path,
2705
+ smcl_log_name=smcl_log_name,
2706
+ hold_attr="_hold_name_stream",
2707
+ require_smcl_log=True,
2708
+ )
2709
+ try:
2710
+ rc, exc = await anyio.to_thread.run_sync(
2711
+ run_blocking,
2712
+ abandon_on_cancel=True,
2713
+ )
2714
+ except TypeError:
2715
+ rc, exc = await anyio.to_thread.run_sync(run_blocking)
2716
+ except Exception as e:
2717
+ exc = e
2718
+ if rc in (-1, 0):
2719
+ rc = 1
2720
+ except get_cancelled_exc_class():
2721
+ self._request_break_in()
2722
+ await self._wait_for_stata_stop()
2723
+ raise
2724
+ finally:
2725
+ done.set()
2726
+ except* Exception as exc_group:
2727
+ logger.debug("SMCL streaming task group failed: %s", exc_group)
2728
+ finally:
2729
+ tee.close()
2730
+
2731
+ # Read SMCL content as the authoritative source
2732
+ smcl_content = self._read_smcl_file(smcl_path, start_offset=start_offset)
2733
+ # Clean internal maintenance immediately
2734
+ smcl_content = self._clean_internal_smcl(smcl_content, strip_output=False)
2735
+
2736
+
2737
+ graph_ready_emitted = 0
2738
+ if graph_cache:
2739
+ asyncio.create_task(
2740
+ self._cache_new_graphs(
2741
+ graph_cache,
2742
+ notify_progress=notify_progress,
2743
+ total_lines=total_lines,
2744
+ completed_label="Command",
2745
+ )
2746
+ )
2747
+ if emit_graph_ready:
2748
+ graph_ready_emitted = await self._maybe_cache_graphs_on_chunk(
2749
+ graph_cache=graph_cache,
2750
+ emit_graph_ready=emit_graph_ready,
2751
+ notify_log=notify_log,
2752
+ graph_ready_task_id=graph_ready_task_id,
2753
+ graph_ready_format=graph_ready_format,
2754
+ graph_ready_initial=graph_ready_initial,
2755
+ last_check=graph_poll_state,
2756
+ force=True,
2757
+ )
2758
+ if emit_graph_ready and not graph_ready_emitted and graph_ready_initial is not None:
2759
+ try:
2760
+ graph_ready_emitted = await self._emit_graph_ready_events(
2761
+ graph_ready_initial,
2762
+ notify_log,
2763
+ graph_ready_task_id,
2764
+ graph_ready_format,
2765
+ )
2766
+ except Exception as exc:
2767
+ logger.debug("graph_ready fallback emission failed: %s", exc)
2768
+ if emit_graph_ready and not graph_ready_emitted:
2769
+ try:
2770
+ fallback_names = self._extract_named_graphs(code)
2771
+ if fallback_names:
2772
+ async with self._ensure_graph_ready_lock():
2773
+ await self._emit_graph_ready_for_graphs(
2774
+ list(dict.fromkeys(fallback_names)),
2775
+ notify_log=notify_log,
2776
+ task_id=graph_ready_task_id,
2777
+ export_format=graph_ready_format,
2778
+ graph_ready_initial=graph_ready_initial,
2779
+ )
2780
+ except Exception as exc:
2781
+ logger.debug("graph_ready fallback emission failed: %s", exc)
2782
+
2783
+ combined = self._build_combined_log(tail, smcl_path, rc, trace, exc, start_offset=start_offset)
2784
+
2785
+ # Use SMCL content as primary source for RC detection only when RC is ambiguous
2786
+ if exc is not None or rc in (-1, 1):
2787
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
2788
+ if parsed_rc is not None and parsed_rc != 0:
2789
+ rc = parsed_rc
2790
+ elif rc in (-1, 1): # Also check text if rc is generic 1 or unset
2791
+ parsed_rc_text = self._parse_rc_from_text(combined)
2792
+ if parsed_rc_text is not None:
2793
+ rc = parsed_rc_text
2794
+ elif rc == -1:
2795
+ rc = 0 # Default to success if no error trace found
2796
+
2797
+ # If RC looks wrong but SMCL shows no error markers, treat as success.
2798
+ if rc != 0 and smcl_content:
2799
+ has_err_tag = "{err}" in smcl_content
2800
+ rc_match = re.search(r"(?<!\w)r\((\d+)\)", smcl_content)
2801
+ if rc_match:
2802
+ try:
2803
+ rc = int(rc_match.group(1))
2804
+ except Exception:
2805
+ pass
2806
+ else:
2807
+ text_rc = None
2808
+ try:
2809
+ text_rc = self._parse_rc_from_text(self._smcl_to_text(smcl_content))
2810
+ except Exception:
2811
+ text_rc = None
2812
+ if not has_err_tag and text_rc is None:
2813
+ rc = 0
2814
+
2815
+ # If RC looks wrong but SMCL shows no error markers, treat as success.
2816
+ if rc != 0 and smcl_content:
2817
+ has_err_tag = "{err}" in smcl_content
2818
+ rc_match = re.search(r"(?<!\w)r\((\d+)\)", smcl_content)
2819
+ if rc_match:
2820
+ try:
2821
+ rc = int(rc_match.group(1))
2822
+ except Exception:
2823
+ pass
2824
+ else:
2825
+ text_rc = None
2826
+ try:
2827
+ text_rc = self._parse_rc_from_text(self._smcl_to_text(smcl_content))
2828
+ except Exception:
2829
+ text_rc = None
2830
+ if not has_err_tag and text_rc is None:
2831
+ rc = 0
2832
+
2833
+ success = (rc == 0 and exc is None)
2834
+ stderr_final = None
2835
+ error = None
2836
+
2837
+ # authoritative output (Preserve SMCL tags as requested by user)
2838
+ stdout_final = smcl_content if smcl_content else combined
2839
+ # Clean the final output of internal maintenance artifacts
2840
+ stdout_final = self._clean_internal_smcl(stdout_final)
2841
+
2842
+ # NOTE: We keep stdout_final populated even if log_path is set,
2843
+ # so the user gets the exact SMCL result in the tool output.
2844
+ # server.py may still clear it for token efficiency.
2845
+
2846
+ if not success:
2847
+ # Use SMCL as authoritative source for error extraction
2848
+ if smcl_content:
2849
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
2850
+ else:
2851
+ # Fallback to combined log
2852
+ msg, context = self._extract_error_and_context(combined, rc)
2853
+
2854
+ error = ErrorEnvelope(
2855
+ message=msg,
2856
+ context=context,
2857
+ rc=rc,
2858
+ command=command,
2859
+ log_path=log_path,
2860
+ snippet=smcl_content[-800:] if smcl_content else combined[-800:],
2861
+ smcl_output=smcl_content,
2862
+ )
2863
+ # Put summary in stderr
2864
+ stderr_final = context
2865
+
2866
+ duration = time.time() - start_time
2867
+ logger.info(
2868
+ "stata.run(stream) rc=%s success=%s trace=%s duration_ms=%.2f code_preview=%s",
2869
+ rc,
2870
+ success,
2871
+ trace,
2872
+ duration * 1000,
2873
+ code.replace("\n", "\\n")[:120],
2874
+ )
2875
+
2876
+ result = CommandResponse(
2877
+ command=code,
2878
+ rc=rc,
2879
+ stdout=stdout_final,
2880
+ stderr=stderr_final,
2881
+ log_path=log_path,
2882
+ success=success,
2883
+ error=error,
2884
+ smcl_output=smcl_content,
2885
+ )
2886
+
2887
+ if notify_progress is not None:
2888
+ await notify_progress(1, 1, "Finished")
2889
+
2890
+ return result
2891
+
2892
    async def run_do_file_streaming(
        self,
        path: str,
        *,
        notify_log: Callable[[str], Awaitable[None]],
        notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
        echo: bool = True,
        trace: bool = False,
        max_output_lines: Optional[int] = None,
        cwd: Optional[str] = None,
        auto_cache_graphs: bool = False,
        on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
        emit_graph_ready: bool = False,
        graph_ready_task_id: Optional[str] = None,
        graph_ready_format: str = "svg",
    ) -> CommandResponse:
        """Execute a do-file while streaming its SMCL log to the MCP client.

        The do-file is run in a worker thread while an anyio task group tails
        the SMCL log, forwards chunks via ``notify_log``, tracks progress from
        echoed ``. command`` prompts, and (optionally) detects newly created
        graphs as output arrives.

        Args:
            path: Path to the do-file (resolved against ``cwd`` if given).
            notify_log: Awaitable callback receiving streamed log chunks and
                JSON control events (e.g. the initial ``log_path`` event).
            notify_progress: Optional progress callback (current, total, message).
            echo: If True, commands are echoed in the output.
            trace: If True, Stata trace mode is enabled for the run.
            max_output_lines: Accepted for interface parity; not applied here.
            cwd: Optional working directory for the run.
            auto_cache_graphs: Enable the streaming graph cache.
            on_graph_cached: Callback invoked when a graph is cached.
            emit_graph_ready: Emit ``graph_ready`` events (implies graph caching).
            graph_ready_task_id: Task id attached to ``graph_ready`` events.
            graph_ready_format: Export format used for ``graph_ready`` events.

        Returns:
            CommandResponse with the cleaned SMCL output as authoritative
            stdout, the derived return code, and an ErrorEnvelope on failure.
        """
        effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
        if error_response is not None:
            # Path resolution already produced a structured failure envelope.
            return error_response

        total_lines = self._count_do_file_lines(effective_path)
        dofile_text = ""
        try:
            dofile_text = pathlib.Path(effective_path).read_text(encoding="utf-8", errors="replace")
        except Exception:
            # Source text is only needed for graph-name fallback; tolerate read failures.
            dofile_text = ""
        executed_lines = 0
        last_progress_time = 0.0
        # A line starting with ". <cmd>" marks one executed do-file line in the log.
        dot_prompt = re.compile(r"^\.\s+\S")

        async def on_chunk_for_progress(chunk: str) -> None:
            # Count echoed command prompts in this chunk and report progress,
            # throttled to at most one notification every 0.25s.
            nonlocal executed_lines, last_progress_time
            if total_lines <= 0 or notify_progress is None:
                return
            for line in chunk.splitlines():
                if dot_prompt.match(line):
                    executed_lines += 1
            if executed_lines > total_lines:
                executed_lines = total_lines

            now = time.monotonic()
            if executed_lines > 0 and (now - last_progress_time) >= 0.25:
                last_progress_time = now
                await notify_progress(
                    float(executed_lines),
                    float(total_lines),
                    f"Executing do-file: {executed_lines}/{total_lines}",
                )

        if not self._initialized:
            self.init()

        # graph_ready events require the graph cache machinery.
        auto_cache_graphs = auto_cache_graphs or emit_graph_ready

        start_time = time.time()
        exc: Optional[Exception] = None
        smcl_content = ""
        smcl_path = None

        graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
        _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)

        smcl_path = self._create_smcl_log_path()
        smcl_log_name = self._make_smcl_log_name()
        start_offset = 0
        if self._persistent_log_path:
            # Reuse the persistent log: flush it, then remember its current size so
            # only output produced by THIS run is read back.
            smcl_path = self._persistent_log_path
            smcl_log_name = self._persistent_log_name
            try:
                self.stata.run(f"capture quietly log flush {smcl_log_name}", echo=False)
                start_offset = os.path.getsize(smcl_path)
            except OSError:
                start_offset = 0

        # Inform the MCP client immediately where to read/tail the output.
        # We provide the cleaned plain-text log_path as the primary 'path' to satisfy
        # requirements for clean logs without maintenance boilerplate.
        await notify_log(json.dumps({"event": "log_path", "path": log_path, "smcl_path": smcl_path}))

        rc = -1
        graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
        self._current_command_code = dofile_text if dofile_text else command

        # Increment AFTER capture
        self._increment_command_idx()

        # Single-element list so nested closures can mutate the last-check timestamp.
        graph_poll_state = [0.0]

        done = anyio.Event()

        try:
            async with anyio.create_task_group() as tg:
                async def on_chunk_for_graphs(_chunk: str) -> None:
                    # Background the graph check so we don't block SMCL streaming or task completion.
                    # Use tg.start_soon instead of asyncio.create_task to ensure all checks
                    # finish before the command is considered complete.
                    tg.start_soon(
                        functools.partial(
                            self._maybe_cache_graphs_on_chunk,
                            graph_cache=graph_cache,
                            emit_graph_ready=emit_graph_ready,
                            notify_log=notify_log,
                            graph_ready_task_id=graph_ready_task_id,
                            graph_ready_format=graph_ready_format,
                            graph_ready_initial=graph_ready_initial,
                            last_check=graph_poll_state,
                        )
                    )

                async def actual_on_chunk(chunk: str) -> None:
                    # Inform progress tracker
                    await on_chunk_for_progress(chunk)

                    # Background graph detection
                    if graph_cache:
                        await on_chunk_for_graphs(chunk)

                async def stream_smcl() -> None:
                    # Tail the SMCL log until 'done' is set; streaming failures are
                    # non-fatal because the log file is re-read after the run.
                    try:
                        await self._stream_smcl_log(
                            smcl_path=smcl_path,
                            notify_log=notify_log,
                            done=done,
                            on_chunk=actual_on_chunk,
                            start_offset=start_offset,
                            tee=tee,
                        )
                    except Exception as exc:
                        logger.debug("SMCL streaming failed: %s", exc)

                tg.start_soon(stream_smcl)

                if notify_progress is not None:
                    if total_lines > 0:
                        await notify_progress(0, float(total_lines), f"Executing do-file: 0/{total_lines}")
                    else:
                        await notify_progress(0, None, "Running do-file")

                try:
                    run_blocking = lambda: self._run_streaming_blocking(
                        command=command,
                        tee=tee,
                        cwd=cwd,
                        trace=trace,
                        echo=echo,
                        smcl_path=smcl_path,
                        smcl_log_name=smcl_log_name,
                        hold_attr="_hold_name_do",
                        require_smcl_log=True,
                    )
                    try:
                        rc, exc = await anyio.to_thread.run_sync(
                            run_blocking,
                            abandon_on_cancel=True,
                        )
                    except TypeError:
                        # Older anyio versions do not accept abandon_on_cancel.
                        rc, exc = await anyio.to_thread.run_sync(run_blocking)
                except Exception as e:
                    exc = e
                    if rc in (-1, 0):
                        rc = 1
                except get_cancelled_exc_class():
                    # Client cancelled: ask Stata to break, wait for it to stop,
                    # then propagate the cancellation.
                    self._request_break_in()
                    await self._wait_for_stata_stop()
                    raise
                finally:
                    done.set()
        except* Exception as exc_group:
            logger.debug("SMCL streaming task group failed: %s", exc_group)
        finally:
            tee.close()

        # Read SMCL content as the authoritative source
        smcl_content = self._read_smcl_file(smcl_path, start_offset=start_offset)
        # Clean internal maintenance immediately
        smcl_content = self._clean_internal_smcl(smcl_content, strip_output=False)


        graph_ready_emitted = 0
        if graph_cache:
            # Fire-and-forget: cache any graphs created during the run.
            asyncio.create_task(
                self._cache_new_graphs(
                    graph_cache,
                    notify_progress=notify_progress,
                    total_lines=total_lines,
                    completed_label="Do-file",
                )
            )
            if emit_graph_ready:
                # Final, forced check so late graphs still emit graph_ready.
                graph_ready_emitted = await self._maybe_cache_graphs_on_chunk(
                    graph_cache=graph_cache,
                    emit_graph_ready=emit_graph_ready,
                    notify_log=notify_log,
                    graph_ready_task_id=graph_ready_task_id,
                    graph_ready_format=graph_ready_format,
                    graph_ready_initial=graph_ready_initial,
                    last_check=graph_poll_state,
                    force=True,
                )
        if emit_graph_ready and not graph_ready_emitted and graph_ready_initial is not None:
            # Fallback 1: diff against the pre-run graph snapshot.
            try:
                graph_ready_emitted = await self._emit_graph_ready_events(
                    graph_ready_initial,
                    notify_log,
                    graph_ready_task_id,
                    graph_ready_format,
                )
            except Exception as exc:
                logger.debug("graph_ready fallback emission failed: %s", exc)
        if emit_graph_ready and not graph_ready_emitted:
            # Fallback 2: scan the do-file text for name(...) options.
            try:
                fallback_names = self._extract_named_graphs(dofile_text)
                if fallback_names:
                    async with self._ensure_graph_ready_lock():
                        await self._emit_graph_ready_for_graphs(
                            list(dict.fromkeys(fallback_names)),
                            notify_log=notify_log,
                            task_id=graph_ready_task_id,
                            export_format=graph_ready_format,
                            graph_ready_initial=graph_ready_initial,
                        )
            except Exception as exc:
                logger.debug("graph_ready fallback emission failed: %s", exc)

        combined = self._build_combined_log(tail, smcl_path, rc, trace, exc, start_offset=start_offset)

        # Use SMCL content as primary source for RC detection only when RC is ambiguous
        if exc is not None or rc in (-1, 1):
            parsed_rc = self._parse_rc_from_smcl(smcl_content)
            if parsed_rc is not None and parsed_rc != 0:
                rc = parsed_rc
            elif rc in (-1, 1):
                parsed_rc_text = self._parse_rc_from_text(combined)
                if parsed_rc_text is not None:
                    rc = parsed_rc_text
                elif rc == -1:
                    rc = 0  # Default to success if no error found

        # If RC looks wrong but SMCL shows no error markers, treat as success.
        if rc != 0 and smcl_content:
            has_err_tag = "{err}" in smcl_content
            rc_match = re.search(r"(?<!\w)r\((\d+)\)", smcl_content)
            if rc_match:
                try:
                    rc = int(rc_match.group(1))
                except Exception:
                    pass
            else:
                text_rc = None
                try:
                    text_rc = self._parse_rc_from_text(self._smcl_to_text(smcl_content))
                except Exception:
                    text_rc = None
                if not has_err_tag and text_rc is None:
                    rc = 0

        success = (rc == 0 and exc is None)
        stderr_final = None
        error = None

        # authoritative output (Preserve SMCL tags as requested by user)
        stdout_final = smcl_content if smcl_content else combined
        # Clean the final output of internal maintenance artifacts
        stdout_final = self._clean_internal_smcl(stdout_final)

        # NOTE: We keep stdout_final populated even if log_path is set,
        # so the user gets the exact SMCL result in the tool output.
        # server.py may still clear it for token efficiency.

        if not success:
            # Use SMCL as authoritative source for error extraction
            if smcl_content:
                msg, context = self._extract_error_from_smcl(smcl_content, rc)
            else:
                # Fallback to combined log
                msg, context = self._extract_error_and_context(combined, rc)

            error = ErrorEnvelope(
                message=msg,
                context=context,
                rc=rc,
                command=command,
                log_path=log_path,
                snippet=smcl_content[-800:] if smcl_content else combined[-800:],
                smcl_output=smcl_content,
            )
            # Put summary in stderr
            stderr_final = context
            # Token Efficiency optimization: we keep stdout for local users/tests
            # but if it's very large, we might truncate it later

        duration = time.time() - start_time
        logger.info(
            "stata.run(do stream) rc=%s success=%s trace=%s duration_ms=%.2f path=%s",
            rc,
            success,
            trace,
            duration * 1000,
            effective_path,
        )

        result = CommandResponse(
            command=command,
            rc=rc,
            stdout=stdout_final,
            stderr=stderr_final,
            log_path=log_path,
            success=success,
            error=error,
            smcl_output=smcl_content,
        )

        if notify_progress is not None:
            if total_lines > 0:
                await notify_progress(float(total_lines), float(total_lines), f"Executing do-file: {total_lines}/{total_lines}")
            else:
                await notify_progress(1, 1, "Finished")

        return result
3211
+
3212
+ def run_command_structured(self, code: str, echo: bool = True, trace: bool = False, max_output_lines: Optional[int] = None, cwd: Optional[str] = None) -> CommandResponse:
3213
+ """Runs a Stata command and returns a structured envelope.
3214
+
3215
+ Args:
3216
+ code: The Stata command to execute.
3217
+ echo: If True, the command itself is included in the output.
3218
+ trace: If True, enables trace mode for debugging.
3219
+ max_output_lines: If set, truncates stdout to this many lines (token efficiency).
3220
+ """
3221
+ result = self._exec_with_capture(code, echo=echo, trace=trace, cwd=cwd)
3222
+
3223
+ return self._truncate_command_output(result, max_output_lines)
3224
+
3225
+ def get_data(self, start: int = 0, count: int = 50) -> List[Dict[str, Any]]:
3226
+ """Returns valid JSON-serializable data."""
3227
+ if not self._initialized:
3228
+ self.init()
3229
+
3230
+ if count > self.MAX_DATA_ROWS:
3231
+ count = self.MAX_DATA_ROWS
3232
+
3233
+ with self._exec_lock:
3234
+ try:
3235
+ # Use pystata integration to retrieve data
3236
+ df = self.stata.pdataframe_from_data()
3237
+
3238
+ # Slice
3239
+ sliced = df.iloc[start : start + count]
3240
+
3241
+ # Convert to dict
3242
+ return sliced.to_dict(orient="records")
3243
+ except Exception as e:
3244
+ return [{"error": f"Failed to retrieve data: {e}"}]
3245
+
3246
+ def list_variables(self) -> List[Dict[str, str]]:
3247
+ """Returns list of variables with labels."""
3248
+ if not self._initialized:
3249
+ self.init()
3250
+
3251
+ # We can use sfi to be efficient
3252
+ from sfi import Data # type: ignore[import-not-found]
3253
+ vars_info = []
3254
+ with self._exec_lock:
3255
+ for i in range(Data.getVarCount()):
3256
+ var_index = i # 0-based
3257
+ name = Data.getVarName(var_index)
3258
+ label = Data.getVarLabel(var_index)
3259
+ type_str = Data.getVarType(var_index) # Returns int
3260
+
3261
+ vars_info.append({
3262
+ "name": name,
3263
+ "label": label,
3264
+ "type": str(type_str),
3265
+ })
3266
+ return vars_info
3267
+
3268
+ def get_dataset_state(self) -> Dict[str, Any]:
3269
+ """Return basic dataset state without mutating the dataset."""
3270
+ if not self._initialized:
3271
+ self.init()
3272
+
3273
+ from sfi import Data, Macro # type: ignore[import-not-found]
3274
+
3275
+ with self._exec_lock:
3276
+ n = int(Data.getObsTotal())
3277
+ k = int(Data.getVarCount())
3278
+
3279
+ frame = "default"
3280
+ sortlist = ""
3281
+ changed = False
3282
+ # Use a combined fetch for dataset state to minimize roundtrips
3283
+ try:
3284
+ state_bundle = (
3285
+ "macro define mcp_frame \"`c(frame)'\"\n"
3286
+ "macro define mcp_sortlist \"`c(sortlist)'\"\n"
3287
+ "macro define mcp_changed \"`c(changed)'\""
3288
+ )
3289
+ self.stata.run(state_bundle, echo=False)
3290
+ frame = str(Macro.getGlobal("mcp_frame") or "default")
3291
+ sortlist = str(Macro.getGlobal("mcp_sortlist") or "")
3292
+ changed = bool(int(float(Macro.getGlobal("mcp_changed") or "0")))
3293
+ self.stata.run("macro drop mcp_frame mcp_sortlist mcp_changed", echo=False)
3294
+ except Exception:
3295
+ logger.debug("Failed to get dataset state macros", exc_info=True)
3296
+
3297
+ return {"frame": frame, "n": n, "k": k, "sortlist": sortlist, "changed": changed}
3298
+
3299
+ def _require_data_in_memory(self) -> None:
3300
+ state = self.get_dataset_state()
3301
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
3302
+ # Stata empty dataset could still have k>0 n==0; treat that as ok.
3303
+ raise RuntimeError("No data in memory")
3304
+
3305
+ def _get_var_index_map(self) -> Dict[str, int]:
3306
+ from sfi import Data # type: ignore[import-not-found]
3307
+
3308
+ out: Dict[str, int] = {}
3309
+ with self._exec_lock:
3310
+ for i in range(int(Data.getVarCount())):
3311
+ try:
3312
+ out[str(Data.getVarName(i))] = i
3313
+ except Exception:
3314
+ continue
3315
+ return out
3316
+
3317
+ def list_variables_rich(self) -> List[Dict[str, Any]]:
3318
+ """Return variable metadata (name/type/label/format/valueLabel) without modifying the dataset."""
3319
+ if not self._initialized:
3320
+ self.init()
3321
+
3322
+ from sfi import Data # type: ignore[import-not-found]
3323
+
3324
+ vars_info: List[Dict[str, Any]] = []
3325
+ for i in range(int(Data.getVarCount())):
3326
+ name = str(Data.getVarName(i))
3327
+ label = None
3328
+ fmt = None
3329
+ vtype = None
3330
+ value_label = None
3331
+ try:
3332
+ label = Data.getVarLabel(i)
3333
+ except Exception:
3334
+ label = None
3335
+ try:
3336
+ fmt = Data.getVarFormat(i)
3337
+ except Exception:
3338
+ fmt = None
3339
+ try:
3340
+ vtype = Data.getVarType(i)
3341
+ except Exception:
3342
+ vtype = None
3343
+
3344
+ vars_info.append(
3345
+ {
3346
+ "name": name,
3347
+ "type": str(vtype) if vtype is not None else None,
3348
+ "label": label if label else None,
3349
+ "format": fmt if fmt else None,
3350
+ "valueLabel": value_label,
3351
+ }
3352
+ )
3353
+ return vars_info
3354
+
3355
+ @staticmethod
3356
+ def _is_stata_missing(value: Any) -> bool:
3357
+ if value is None:
3358
+ return True
3359
+ if isinstance(value, float):
3360
+ # Stata missing values typically show up as very large floats via sfi.Data.get
3361
+ return value > 8.0e307
3362
+ return False
3363
+
3364
+ def _normalize_cell(self, value: Any, *, max_chars: int) -> tuple[Any, bool]:
3365
+ if self._is_stata_missing(value):
3366
+ return ".", False
3367
+ if isinstance(value, str):
3368
+ if len(value) > max_chars:
3369
+ return value[:max_chars], True
3370
+ return value, False
3371
+ return value, False
3372
+
3373
+ def get_page(
3374
+ self,
3375
+ *,
3376
+ offset: int,
3377
+ limit: int,
3378
+ vars: List[str],
3379
+ include_obs_no: bool,
3380
+ max_chars: int,
3381
+ obs_indices: Optional[List[int]] = None,
3382
+ ) -> Dict[str, Any]:
3383
+ if not self._initialized:
3384
+ self.init()
3385
+
3386
+ from sfi import Data # type: ignore[import-not-found]
3387
+
3388
+ state = self.get_dataset_state()
3389
+ n = int(state.get("n", 0) or 0)
3390
+ k = int(state.get("k", 0) or 0)
3391
+ if k == 0 and n == 0:
3392
+ raise RuntimeError("No data in memory")
3393
+
3394
+ var_map = self._get_var_index_map()
3395
+ for v in vars:
3396
+ if v not in var_map:
3397
+ raise ValueError(f"Invalid variable: {v}")
3398
+
3399
+ if obs_indices is None:
3400
+ start = offset
3401
+ end = min(offset + limit, n)
3402
+ if start >= n:
3403
+ rows: list[list[Any]] = []
3404
+ returned = 0
3405
+ obs_list: list[int] = []
3406
+ else:
3407
+ obs_list = list(range(start, end))
3408
+ raw_rows = Data.get(var=vars, obs=obs_list)
3409
+ rows = raw_rows
3410
+ returned = len(rows)
3411
+ else:
3412
+ start = offset
3413
+ end = min(offset + limit, len(obs_indices))
3414
+ obs_list = obs_indices[start:end]
3415
+ raw_rows = Data.get(var=vars, obs=obs_list) if obs_list else []
3416
+ rows = raw_rows
3417
+ returned = len(rows)
3418
+
3419
+ out_vars = list(vars)
3420
+ out_rows: list[list[Any]] = []
3421
+ truncated_cells = 0
3422
+
3423
+ if include_obs_no:
3424
+ out_vars = ["_n"] + out_vars
3425
+
3426
+ for idx, raw in enumerate(rows):
3427
+ norm_row: list[Any] = []
3428
+ if include_obs_no:
3429
+ norm_row.append(int(obs_list[idx]) + 1)
3430
+ for cell in raw:
3431
+ norm, truncated = self._normalize_cell(cell, max_chars=max_chars)
3432
+ if truncated:
3433
+ truncated_cells += 1
3434
+ norm_row.append(norm)
3435
+ out_rows.append(norm_row)
3436
+
3437
+ return {
3438
+ "vars": out_vars,
3439
+ "rows": out_rows,
3440
+ "returned": returned,
3441
+ "truncated_cells": truncated_cells,
3442
+ }
3443
+
3444
+ def get_arrow_stream(
3445
+ self,
3446
+ *,
3447
+ offset: int,
3448
+ limit: int,
3449
+ vars: List[str],
3450
+ include_obs_no: bool,
3451
+ obs_indices: Optional[List[int]] = None,
3452
+ ) -> bytes:
3453
+ """
3454
+ Returns an Apache Arrow IPC stream (as bytes) for the requested data page.
3455
+ Uses Polars if available (faster), falls back to Pandas.
3456
+ """
3457
+ if not self._initialized:
3458
+ self.init()
3459
+
3460
+ import pyarrow as pa
3461
+ from sfi import Data # type: ignore[import-not-found]
3462
+
3463
+ use_polars = _get_polars_available()
3464
+ if use_polars:
3465
+ import polars as pl
3466
+ else:
3467
+ import pandas as pd
3468
+
3469
+ state = self.get_dataset_state()
3470
+ n = int(state.get("n", 0) or 0)
3471
+ k = int(state.get("k", 0) or 0)
3472
+ if k == 0 and n == 0:
3473
+ raise RuntimeError("No data in memory")
3474
+
3475
+ var_map = self._get_var_index_map()
3476
+ for v in vars:
3477
+ if v not in var_map:
3478
+ raise ValueError(f"Invalid variable: {v}")
3479
+
3480
+ # Determine observations to fetch
3481
+ if obs_indices is None:
3482
+ start = offset
3483
+ end = min(offset + limit, n)
3484
+ obs_list = list(range(start, end)) if start < n else []
3485
+ else:
3486
+ start = offset
3487
+ end = min(offset + limit, len(obs_indices))
3488
+ obs_list = obs_indices[start:end]
3489
+
3490
+ try:
3491
+ if not obs_list:
3492
+ # Empty schema-only table
3493
+ if use_polars:
3494
+ schema_cols = {}
3495
+ if include_obs_no:
3496
+ schema_cols["_n"] = pl.Int64
3497
+ for v in vars:
3498
+ schema_cols[v] = pl.Utf8
3499
+ table = pl.DataFrame(schema=schema_cols).to_arrow()
3500
+ else:
3501
+ columns = {}
3502
+ if include_obs_no:
3503
+ columns["_n"] = pa.array([], type=pa.int64())
3504
+ for v in vars:
3505
+ columns[v] = pa.array([], type=pa.string())
3506
+ table = pa.table(columns)
3507
+ else:
3508
+ # Fetch all data in one C-call
3509
+ raw_data = Data.get(var=vars, obs=obs_list, valuelabel=False)
3510
+
3511
+ if use_polars:
3512
+ df = pl.DataFrame(raw_data, schema=vars, orient="row")
3513
+ if include_obs_no:
3514
+ obs_nums = [i + 1 for i in obs_list]
3515
+ df = df.with_columns(pl.Series("_n", obs_nums, dtype=pl.Int64))
3516
+ df = df.select(["_n"] + vars)
3517
+ table = df.to_arrow()
3518
+ else:
3519
+ df = pd.DataFrame(raw_data, columns=vars)
3520
+ if include_obs_no:
3521
+ df.insert(0, "_n", [i + 1 for i in obs_list])
3522
+ table = pa.Table.from_pandas(df, preserve_index=False)
3523
+
3524
+ # Serialize to IPC Stream
3525
+ sink = pa.BufferOutputStream()
3526
+ with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
3527
+ writer.write_table(table)
3528
+
3529
+ return sink.getvalue().to_pybytes()
3530
+
3531
+ except Exception as e:
3532
+ raise RuntimeError(f"Failed to generate Arrow stream: {e}")
3533
+
3534
    # Candidate identifier tokens inside a user-supplied filter expression:
    # a letter/underscore followed by alphanumerics/underscores.
    _FILTER_IDENT = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\b")
3535
+
3536
+ def _extract_filter_vars(self, filter_expr: str) -> List[str]:
3537
+ tokens = set(self._FILTER_IDENT.findall(filter_expr or ""))
3538
+ # Exclude python keywords we might inject.
3539
+ exclude = {"and", "or", "not", "True", "False", "None"}
3540
+ var_map = self._get_var_index_map()
3541
+ vars_used = [t for t in tokens if t not in exclude and t in var_map]
3542
+ return sorted(vars_used)
3543
+
3544
+ def _compile_filter_expr(self, filter_expr: str) -> Any:
3545
+ expr = (filter_expr or "").strip()
3546
+ if not expr:
3547
+ raise ValueError("Empty filter")
3548
+
3549
+ # Stata boolean operators.
3550
+ expr = expr.replace("&", " and ").replace("|", " or ")
3551
+
3552
+ # Replace missing literal '.' (but not numeric decimals like 0.5).
3553
+ expr = re.sub(r"(?<![0-9])\.(?![0-9A-Za-z_])", "None", expr)
3554
+
3555
+ try:
3556
+ return compile(expr, "<filterExpr>", "eval")
3557
+ except Exception as e:
3558
+ raise ValueError(f"Invalid filter expression: {e}")
3559
+
3560
+ def validate_filter_expr(self, filter_expr: str) -> None:
3561
+ if not self._initialized:
3562
+ self.init()
3563
+ state = self.get_dataset_state()
3564
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
3565
+ raise RuntimeError("No data in memory")
3566
+
3567
+ vars_used = self._extract_filter_vars(filter_expr)
3568
+ if not vars_used:
3569
+ # still allow constant expressions like "1" or "True"
3570
+ self._compile_filter_expr(filter_expr)
3571
+ return
3572
+ self._compile_filter_expr(filter_expr)
3573
+
3574
    def compute_view_indices(self, filter_expr: str, *, chunk_size: int = 5000) -> List[int]:
        """Return the 0-based observation indices matching a filter expression.

        The dataset is scanned in chunks of ``chunk_size`` observations. Each
        chunk is first offered to the native (Rust) evaluator via
        ``compute_filter_indices``; when that declines (returns None), the
        compiled expression is evaluated per row in Python with builtins
        disabled.

        Args:
            filter_expr: Stata-style filter expression (see _compile_filter_expr).
            chunk_size: Number of observations fetched per sfi call.

        Raises:
            RuntimeError: If no data is in memory.
            ValueError: If the expression is invalid or fails during evaluation.
        """
        if not self._initialized:
            self.init()

        from sfi import Data  # type: ignore[import-not-found]

        state = self.get_dataset_state()
        n = int(state.get("n", 0) or 0)
        k = int(state.get("k", 0) or 0)
        if k == 0 and n == 0:
            raise RuntimeError("No data in memory")

        vars_used = self._extract_filter_vars(filter_expr)
        code = self._compile_filter_expr(filter_expr)
        # NOTE(review): result unused — presumably called for its validation /
        # cache side effect; confirm before removing.
        _ = self._get_var_index_map()

        # Per-variable string/numeric flags, probed across sfi API generations.
        is_string_vars = []
        if vars_used:
            try:
                from sfi import Variable  # type: ignore
                is_string_vars = [Variable.isString(v) for v in vars_used]
            except (ImportError, AttributeError):
                try:
                    is_string_vars = [Data.isVarTypeStr(v) or Data.isVarTypeStrL(v) for v in vars_used]
                except AttributeError:
                    # Stata 19+ compatibility
                    is_string_vars = [Data.isVarTypeString(v) for v in vars_used]

        indices: List[int] = []
        for start in range(0, n, chunk_size):
            end = min(start + chunk_size, n)
            obs_list = list(range(start, end))
            # With no referenced variables (constant filter), synthesize dummy
            # one-cell rows so the per-row loop below still runs once per obs.
            raw_rows = Data.get(var=vars_used, obs=obs_list) if vars_used else [[None] for _ in obs_list]

            # Try Rust optimization for the chunk
            if vars_used and raw_rows:
                # Transpose rows to columns for Rust
                cols = []
                # Extract columns; numeric columns go over as float64 arrays.
                for j in range(len(vars_used)):
                    col_data_list = [row[j] for row in raw_rows]
                    if not is_string_vars[j]:
                        import numpy as np
                        col_data = np.array(col_data_list, dtype=np.float64)
                    else:
                        col_data = col_data_list
                    cols.append(col_data)

                rust_indices = compute_filter_indices(filter_expr, vars_used, cols, is_string_vars)
                if rust_indices is not None:
                    # Rust returns chunk-relative positions; map back to absolute obs.
                    indices.extend([int(obs_list[i]) for i in rust_indices])
                    continue

            # Pure-Python fallback: evaluate the compiled filter row by row.
            for row_i, obs in enumerate(obs_list):
                env: Dict[str, Any] = {}
                if vars_used:
                    for j, v in enumerate(vars_used):
                        val = raw_rows[row_i][j]
                        # Stata missings are exposed to the expression as None.
                        env[v] = None if self._is_stata_missing(val) else val

                ok = False
                try:
                    # SECURITY: eval of a user-supplied expression; builtins are
                    # disabled and only the row's variable values are in scope.
                    ok = bool(eval(code, {"__builtins__": {}}, env))
                except NameError as e:
                    raise ValueError(f"Invalid filter: {e}")
                except Exception as e:
                    raise ValueError(f"Invalid filter: {e}")

                if ok:
                    indices.append(int(obs))

        return indices
3646
+
3647
+ def apply_sort(self, sort_spec: List[str]) -> None:
3648
+ """
3649
+ Apply sorting to the dataset using gsort.
3650
+
3651
+ Args:
3652
+ sort_spec: List of variables to sort by, with optional +/- prefix.
3653
+ e.g., ["-price", "+mpg"] sorts by price descending, then mpg ascending.
3654
+ No prefix is treated as ascending (+).
3655
+
3656
+ Raises:
3657
+ ValueError: If sort_spec is invalid or contains invalid variables
3658
+ RuntimeError: If no data in memory or sort command fails
3659
+ """
3660
+ if not self._initialized:
3661
+ self.init()
3662
+
3663
+ state = self.get_dataset_state()
3664
+ if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
3665
+ raise RuntimeError("No data in memory")
3666
+
3667
+ if not sort_spec or not isinstance(sort_spec, list):
3668
+ raise ValueError("sort_spec must be a non-empty list")
3669
+
3670
+ # Validate all variables exist
3671
+ var_map = self._get_var_index_map()
3672
+ for spec in sort_spec:
3673
+ if not isinstance(spec, str) or not spec:
3674
+ raise ValueError(f"Invalid sort specification: {spec!r}")
3675
+ # Extract variable name (remove +/- prefix if present)
3676
+ varname = spec.lstrip("+-")
3677
+ if not varname:
3678
+ raise ValueError(f"Invalid sort specification: {spec!r}")
3679
+
3680
+ if varname not in var_map:
3681
+ raise ValueError(f"Variable not found: {varname}")
3682
+
3683
+ # Build gsort command
3684
+ # gsort uses - for descending, + or nothing for ascending
3685
+ gsort_args = []
3686
+ for spec in sort_spec:
3687
+ if spec.startswith("-") or spec.startswith("+"):
3688
+ gsort_args.append(spec)
3689
+ else:
3690
+ # No prefix means ascending, add + explicitly for clarity
3691
+ gsort_args.append(f"+{spec}")
3692
+
3693
+ cmd = f"gsort {' '.join(gsort_args)}"
3694
+
3695
+ try:
3696
+ # Sorting is hot-path for UI paging; use lightweight execution.
3697
+ result = self.exec_lightweight(cmd)
3698
+ if not result.success:
3699
+ error_msg = result.stderr or "Sort failed"
3700
+ raise RuntimeError(f"Failed to sort dataset: {error_msg}")
3701
+ except Exception as e:
3702
+ if isinstance(e, RuntimeError):
3703
+ raise
3704
+ raise RuntimeError(f"Failed to sort dataset: {e}")
3705
+
3706
+ def get_variable_details(self, varname: str) -> str:
3707
+ """Returns codebook/summary for a specific variable while preserving state."""
3708
+ # Use _exec_no_capture_silent to preserve r()/e() results
3709
+ resp = self._exec_no_capture_silent(f"codebook {varname}", echo=False)
3710
+ if resp.success:
3711
+ # _exec_no_capture_silent captures output in resp.error.stdout if it fails,
3712
+ # but wait, it doesn't return stdout in CommandResponse for success?
3713
+ # Let me check CommandResponse creation in _exec_no_capture_silent.
3714
+ pass
3715
+ return resp.stdout or ""
3716
+
3717
+ def list_variables_structured(self) -> VariablesResponse:
3718
+ vars_info: List[VariableInfo] = []
3719
+ for item in self.list_variables():
3720
+ vars_info.append(
3721
+ VariableInfo(
3722
+ name=item.get("name", ""),
3723
+ label=item.get("label"),
3724
+ type=item.get("type"),
3725
+ )
3726
+ )
3727
+ return VariablesResponse(variables=vars_info)
3728
+
3729
    def list_graphs(self, *, force_refresh: bool = False) -> List[str]:
        """Returns list of graphs in memory with TTL caching.

        Args:
            force_refresh: When True, bypass the TTL cache and query Stata.

        Returns:
            Graph names (user-facing aliases where known). On failure the
            last cached list is returned, or [] when no cache exists.
        """
        if not self._initialized:
            self.init()

        import time

        # Prevent recursive Stata calls - if we're already executing, return cached or empty
        if self._is_executing:
            with self._list_graphs_cache_lock:
                if self._list_graphs_cache is not None:
                    logger.debug("Recursive list_graphs call prevented, returning cached value")
                    # Cache entries may be GraphInfo objects or plain strings
                    # (older cache format); handle both shapes.
                    if self._list_graphs_cache and hasattr(self._list_graphs_cache[0], "name"):
                        return [g.name for g in self._list_graphs_cache]
                    return self._list_graphs_cache
                else:
                    logger.debug("Recursive list_graphs call prevented, returning empty list")
                    return []

        # Check if cache is valid
        current_time = time.time()
        with self._list_graphs_cache_lock:
            if (not force_refresh and self._list_graphs_cache is not None and
                    current_time - self._list_graphs_cache_time < self.LIST_GRAPHS_TTL):
                if self._list_graphs_cache and hasattr(self._list_graphs_cache[0], "name"):
                    return [g.name for g in self._list_graphs_cache]
                return self._list_graphs_cache

        # Cache miss or expired, fetch fresh data
        with self._exec_lock:
            try:
                # Preservation of r() results is critical because this can be called
                # automatically after every user command (e.g., during streaming).
                import time
                hold_name = f"_mcp_ghold_{int(time.time() * 1000 % 1000000)}"
                try:
                    self.stata.run(f"capture _return hold {hold_name}", echo=False)
                except SystemError:
                    import traceback
                    sys.stderr.write(traceback.format_exc())
                    sys.stderr.flush()
                    raise

                try:
                    # Bundle name listing and metadata retrieval into one Stata call for efficiency
                    bundle = (
                        "macro define mcp_graph_list \"\"\n"
                        "global mcp_graph_details \"\"\n"
                        "quietly graph dir, memory\n"
                        "macro define mcp_graph_list \"`r(list)'\"\n"
                        "if \"`r(list)'\" != \"\" {\n"
                        " foreach g in `r(list)' {\n"
                        " quietly graph describe `g'\n"
                        " global mcp_graph_details \"$mcp_graph_details `g'|`r(command_date)' `r(command_time)';\"\n"
                        " }\n"
                        "}"
                    )
                    self.stata.run(bundle, echo=False)
                    from sfi import Macro  # type: ignore[import-not-found]
                    graph_list_str = Macro.getGlobal("mcp_graph_list")
                    details_str = Macro.getGlobal("mcp_graph_details")
                    # Cleanup global to keep Stata environment tidy
                    self.stata.run("macro drop mcp_graph_details", echo=False)
                finally:
                    # Always restore the held r() results, even on failure.
                    try:
                        self.stata.run(f"capture _return restore {hold_name}", echo=False)
                    except SystemError:
                        import traceback
                        sys.stderr.write(traceback.format_exc())
                        sys.stderr.flush()
                        raise

                import shlex
                raw_list = shlex.split(graph_list_str or "")

                # Parse details: "name1|date time; name2|date time;"
                details_map = {}
                if details_str:
                    for item in details_str.split(';'):
                        item = item.strip()
                        if not item or '|' not in item:
                            continue
                        gname, ts = item.split('|', 1)
                        details_map[gname.strip()] = ts.strip()

                # Map internal Stata names back to user-facing names when we have an alias.
                reverse = getattr(self, "_graph_name_reverse", {})

                graph_infos = []
                for n in raw_list:
                    graph_infos.append(GraphInfo(
                        name=reverse.get(n, n),
                        active=False,
                        created=details_map.get(n)
                    ))

                # Update cache
                with self._list_graphs_cache_lock:
                    self._list_graphs_cache = graph_infos
                    self._list_graphs_cache_time = time.time()

                return [g.name for g in graph_infos]

            except Exception as e:
                # On error, return cached result if available, otherwise empty list
                with self._list_graphs_cache_lock:
                    if self._list_graphs_cache is not None:
                        logger.warning(f"list_graphs failed, returning cached result: {e}")
                        if self._list_graphs_cache and hasattr(self._list_graphs_cache[0], "name"):
                            return [g.name for g in self._list_graphs_cache]
                        return self._list_graphs_cache
                    logger.warning(f"list_graphs failed, no cache available: {e}")
                    return []
3842
+
3843
+ def list_graphs_structured(self) -> GraphListResponse:
3844
+ self.list_graphs()
3845
+
3846
+ with self._list_graphs_cache_lock:
3847
+ if not self._list_graphs_cache:
3848
+ return GraphListResponse(graphs=[])
3849
+
3850
+ # The cache now contains GraphInfo objects
3851
+ graphs = [g.model_copy() for g in self._list_graphs_cache]
3852
+
3853
+ if graphs:
3854
+ # Most recently created/displayed graph is active in Stata
3855
+ graphs[-1].active = True
3856
+
3857
+ return GraphListResponse(graphs=graphs)
3858
+
3859
    def invalidate_list_graphs_cache(self) -> None:
        """Invalidate the list_graphs cache to force fresh data on next call."""
        with self._list_graphs_cache_lock:
            # Dropping the entry and zeroing the timestamp guarantees the TTL
            # check in list_graphs() misses on the next call.
            self._list_graphs_cache = None
            self._list_graphs_cache_time = 0
3864
+
3865
    def export_graph(self, graph_name: Optional[str] = None, filename: Optional[str] = None, format: str = "pdf") -> str:
        """Exports graph to a temp file (pdf or png) and returns the path.

        On Windows, PyStata can crash when exporting PNGs directly. For PNG on
        Windows, we save the graph to .gph and invoke the Stata executable in
        batch mode to export the PNG out-of-process.

        Args:
            graph_name: Graph to export; when omitted the active graph is used.
            filename: Target path; a fresh temp file is created when omitted.
            format: One of "pdf", "png", "svg" (case-insensitive).

        Returns:
            Absolute path of the exported file, as a string.

        Raises:
            ValueError: On an unsupported format.
            RuntimeError: When any export step fails or the output file is
                empty, too large, or missing.
        """
        import tempfile

        fmt = (format or "pdf").strip().lower()
        if fmt not in {"pdf", "png", "svg"}:
            raise ValueError(f"Unsupported graph export format: {format}. Allowed: pdf, png, svg.")


        if not filename:
            suffix = f".{fmt}"
            # Use validated temp dir to avoid Windows write permission errors
            with tempfile.NamedTemporaryFile(prefix="mcp_stata_", suffix=suffix, dir=get_writable_temp_dir(), delete=False) as tmp:
                filename = tmp.name
            register_temp_file(filename)
        else:
            # Ensure fresh start
            p_filename = pathlib.Path(filename)
            if p_filename.exists():
                try:
                    p_filename.unlink()
                except Exception:
                    pass

        # Keep the user-facing path as a normal absolute path
        user_filename = pathlib.Path(filename).absolute()

        if fmt == "png" and is_windows():
            # 1) Save graph to a .gph file from the embedded session
            with tempfile.NamedTemporaryFile(prefix="mcp_stata_graph_", suffix=".gph", dir=get_writable_temp_dir(), delete=False) as gph_tmp:
                gph_path = pathlib.Path(gph_tmp.name)
            register_temp_file(gph_path)
            gph_path_for_stata = gph_path.as_posix()
            # Make the target graph current, then save without name() (which isn't accepted there)
            if graph_name:
                self._exec_no_capture_silent(f'quietly graph display {graph_name}', echo=False)
            save_cmd = f'quietly graph save "{gph_path_for_stata}", replace'
            save_resp = self._exec_no_capture_silent(save_cmd, echo=False)
            if not save_resp.success:
                msg = save_resp.error.message if save_resp.error else f"graph save failed (rc={save_resp.rc})"
                raise RuntimeError(msg)

            # 2) Prepare a do-file to export PNG externally
            user_filename_fwd = user_filename.as_posix()
            do_lines = [
                f'quietly graph use "{gph_path_for_stata}"',
                f'quietly graph export "{user_filename_fwd}", replace as(png)',
                "exit",
            ]
            with tempfile.NamedTemporaryFile(prefix="mcp_stata_export_", suffix=".do", dir=get_writable_temp_dir(), delete=False, mode="w", encoding="ascii") as do_tmp:
                do_tmp.write("\n".join(do_lines))
                do_path = pathlib.Path(do_tmp.name)
            register_temp_file(do_path)

            stata_exe = getattr(self, "_stata_exec_path", None)
            if not stata_exe or not pathlib.Path(stata_exe).exists():
                raise RuntimeError("Stata executable path unavailable for PNG export")

            workdir = do_path.parent
            log_path = do_path.with_suffix(".log")
            register_temp_file(log_path)

            # /e runs Stata in batch mode without a console window.
            cmd = [str(stata_exe), "/e", "do", str(do_path)]
            try:
                completed = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=30,
                    cwd=workdir,
                )
            except subprocess.TimeoutExpired:
                raise RuntimeError("External Stata export timed out")
            finally:
                try:
                    do_path.unlink()
                except Exception:
                    # Ignore errors during temporary do-file cleanup (file may not exist or be locked)
                    logger.warning("Failed to remove temporary do-file: %s", do_path, exc_info=True)

                try:
                    gph_path.unlink()
                except Exception:
                    logger.warning("Failed to remove temporary graph file: %s", gph_path, exc_info=True)

                try:
                    if log_path.exists():
                        log_path.unlink()
                except Exception:
                    logger.warning("Failed to remove temporary log file: %s", log_path, exc_info=True)

            if completed.returncode != 0:
                err = completed.stderr.strip() or completed.stdout.strip() or str(completed.returncode)
                raise RuntimeError(f"External Stata export failed: {err}")

        else:
            # Stata prefers forward slashes in its command parser on Windows
            filename_for_stata = user_filename.as_posix()

            if graph_name:
                resolved = self._resolve_graph_name_for_stata(graph_name)
                # Use display + export without name() for maximum compatibility.
                # name(NAME) often fails in PyStata for non-active graphs (r(693)).
                # Graph identifiers must NOT be quoted in 'graph display'.
                disp_resp = self._exec_no_capture_silent(f'quietly graph display {resolved}', echo=False)
                if not disp_resp.success:
                    # graph display failed, likely rc=111 or 693
                    msg = disp_resp.error.message if disp_resp.error else f"Graph display failed (rc={disp_resp.rc})"
                    # Normalize for test expectations
                    if disp_resp.rc == 111:
                        msg = f"graph {resolved} not found r(111);"
                    raise RuntimeError(msg)

            cmd = f'quietly graph export "{filename_for_stata}", replace as({fmt})'

            # Avoid stdout/stderr redirection for graph export because PyStata's
            # output thread can crash on Windows when we swap stdio handles.
            resp = self._exec_no_capture_silent(cmd, echo=False)
            if not resp.success:
                # Retry once after a short pause in case Stata had a transient file handle issue
                time.sleep(0.2)
                resp_retry = self._exec_no_capture_silent(cmd, echo=False)
                if not resp_retry.success:
                    msg = resp_retry.error.message if resp_retry.error else f"graph export failed (rc={resp_retry.rc})"
                    raise RuntimeError(msg)
                resp = resp_retry

        if user_filename.exists():
            try:
                size = user_filename.stat().st_size
                if size == 0:
                    raise RuntimeError(f"Graph export failed: produced empty file {user_filename}")
                if size > self.MAX_GRAPH_BYTES:
                    raise RuntimeError(
                        f"Graph export failed: file too large (> {self.MAX_GRAPH_BYTES} bytes): {user_filename}"
                    )
            except Exception as size_err:
                # Clean up oversized or unreadable files
                try:
                    user_filename.unlink()
                except Exception:
                    pass
                raise size_err
            return str(user_filename)

        # If file missing, it failed. Check output for details.
        # NOTE(review): on the Windows PNG branch `resp` is never assigned, so
        # reaching this line there would raise NameError instead of the
        # intended RuntimeError — confirm whether that path can occur.
        msg = resp.error.message if resp.error else "graph export failed: file missing"
        raise RuntimeError(msg)
4018
+
4019
    def get_help(self, topic: str, plain_text: bool = False) -> str:
        """Returns help text as Markdown (default) or plain text.

        Args:
            topic: Stata help topic; resolved to ``{topic}.sthlp`` on the adopath.
            plain_text: When True, skip Markdown conversion and return the
                SMCL stripped to plain text.

        Returns:
            Rendered help text, or a not-found message when no help file exists.
        """
        if not self._initialized:
            self.init()

        with self._exec_lock:
            # Try to locate the .sthlp help file
            # We use 'capture' to avoid crashing if not found.
            # Combined into a single bundle to prevent r(fn) from being cleared.
            from sfi import Macro  # type: ignore[import-not-found]
            bundle = (
                f"capture findfile {topic}.sthlp\n"
                "macro define mcp_help_file \"`r(fn)'\""
            )
            self.stata.run(bundle, echo=False)
            fn = Macro.getGlobal("mcp_help_file")

            if fn and os.path.exists(fn):
                try:
                    with open(fn, 'r', encoding='utf-8', errors='replace') as f:
                        smcl = f.read()
                    if plain_text:
                        return self._smcl_to_text(smcl)
                    try:
                        # adopath/current_file let the converter resolve
                        # cross-file SMCL links relative to this help file.
                        return smcl_to_markdown(smcl, adopath=os.path.dirname(fn), current_file=os.path.splitext(os.path.basename(fn))[0])
                    except Exception as parse_err:
                        logger.warning("SMCL to Markdown failed, falling back to plain text: %s", parse_err)
                        return self._smcl_to_text(smcl)
                except Exception as e:
                    logger.warning("Help file read failed for %s: %s", topic, e)

            # If no help file found, return a fallback message
            return f"Help file for '{topic}' not found."
4052
+
4053
+ def get_stored_results(self, force_fresh: bool = False) -> Dict[str, Any]:
4054
+ """Returns e() and r() results using SFI for maximum reliability."""
4055
+ if not force_fresh and self._last_results is not None:
4056
+ return self._last_results
4057
+
4058
+ if not self._initialized:
4059
+ self.init()
4060
+
4061
+ with self._exec_lock:
4062
+ # Capture the current RC first using SFI (non-mutating)
4063
+ try:
4064
+ from sfi import Scalar, Macro
4065
+ preserved_rc = int(float(Scalar.getValue("c(rc)") or 0))
4066
+ except Exception:
4067
+ preserved_rc = 0
4068
+
4069
+ results = {"r": {}, "e": {}, "s": {}}
4070
+
4071
+ try:
4072
+ # Fetch lists of names. macro define `: ...' is non-mutating for results.
4073
+ fetch_names_block = (
4074
+ "macro define mcp_r_sc \"`: r(scalars)'\"\n"
4075
+ "macro define mcp_r_ma \"`: r(macros)'\"\n"
4076
+ "macro define mcp_e_sc \"`: e(scalars)'\"\n"
4077
+ "macro define mcp_e_ma \"`: e(macros)'\"\n"
4078
+ "macro define mcp_s_sc \"`: s(scalars)'\"\n"
4079
+ "macro define mcp_s_ma \"`: s(macros)'\"\n"
4080
+ )
4081
+ self.stata.run(fetch_names_block, echo=False)
4082
+
4083
+ for rclass in ["r", "e", "s"]:
4084
+ sc_names = (Macro.getGlobal(f"mcp_{rclass}_sc") or "").split()
4085
+ ma_names = (Macro.getGlobal(f"mcp_{rclass}_ma") or "").split()
4086
+
4087
+ # Fetch Scalars via SFI (fast, non-mutating)
4088
+ for name in sc_names:
4089
+ try:
4090
+ val = Scalar.getValue(f"{rclass}({name})")
4091
+ results[rclass][name] = val
4092
+ except Exception:
4093
+ pass
4094
+
4095
+ # Fetch Macros via global expansion
4096
+ if ma_names:
4097
+ # Bundle macro copying to minimize roundtrips
4098
+ # We use global macros as a transfer area
4099
+ copy_block = ""
4100
+ for name in ma_names:
4101
+ copy_block += f"macro define mcp_m_{rclass}_{name} \"`{rclass}({name})'\"\n"
4102
+
4103
+ if copy_block:
4104
+ self.stata.run(copy_block, echo=False)
4105
+ for name in ma_names:
4106
+ results[rclass][name] = Macro.getGlobal(f"mcp_m_{rclass}_{name}")
4107
+
4108
+ # Cleanup and Restore state
4109
+ self.stata.run("macro drop mcp_*", echo=False)
4110
+
4111
+ if preserved_rc > 0:
4112
+ self.stata.run(f"capture error {preserved_rc}", echo=False)
4113
+ else:
4114
+ self.stata.run("capture", echo=False)
4115
+
4116
+ self._last_results = results
4117
+ return results
4118
+ except Exception as e:
4119
+ logger.error(f"SFI-based get_stored_results failed: {e}")
4120
+ return {"r": {}, "e": {}}
4121
+
4122
+ def invalidate_graph_cache(self, graph_name: str = None) -> None:
4123
+ """Invalidate cache for specific graph or all graphs.
4124
+
4125
+ Args:
4126
+ graph_name: Specific graph name to invalidate. If None, clears all cache.
4127
+ """
4128
+ self._initialize_cache()
4129
+
4130
+ with self._cache_lock:
4131
+ if graph_name is None:
4132
+ # Clear all cache
4133
+ self._preemptive_cache.clear()
4134
+ else:
4135
+ # Clear specific graph cache
4136
+ if graph_name in self._preemptive_cache:
4137
+ del self._preemptive_cache[graph_name]
4138
+ # Also clear hash if present
4139
+ hash_key = f"{graph_name}_hash"
4140
+ if hash_key in self._preemptive_cache:
4141
+ del self._preemptive_cache[hash_key]
4142
+
4143
    def _initialize_cache(self) -> None:
        """Initialize cache in a thread-safe manner.

        Idempotent: on first call creates the in-memory index, size/LRU
        tracking, the on-disk cache directory, and the per-instance lock;
        later calls only re-create the cache directory if it was removed.
        """
        import tempfile
        import threading
        import os
        import uuid

        with StataClient._cache_init_lock:  # Use class-level lock
            if not hasattr(self, '_cache_initialized'):
                self._preemptive_cache = {}
                self._cache_access_times = {}  # Track access times for LRU
                self._cache_sizes = {}  # Track individual cache item sizes
                self._total_cache_size = 0  # Track total cache size in bytes
                # Use unique identifier to avoid conflicts
                unique_id = f"preemptive_cache_{uuid.uuid4().hex[:8]}_{os.getpid()}"
                self._preemptive_cache_dir = tempfile.mkdtemp(prefix=unique_id, dir=get_writable_temp_dir())
                register_temp_dir(self._preemptive_cache_dir)
                self._cache_lock = threading.Lock()
                self._cache_initialized = True

                # Register cleanup function
                import atexit
                atexit.register(self._cleanup_cache)
            else:
                # Cache already initialized, but directory might have been removed.
                if (not hasattr(self, '_preemptive_cache_dir') or
                        not self._preemptive_cache_dir or
                        not os.path.isdir(self._preemptive_cache_dir)):
                    unique_id = f"preemptive_cache_{uuid.uuid4().hex[:8]}_{os.getpid()}"
                    self._preemptive_cache_dir = tempfile.mkdtemp(prefix=unique_id, dir=get_writable_temp_dir())
                    register_temp_dir(self._preemptive_cache_dir)
4175
+ def _cleanup_cache(self) -> None:
4176
+ """Clean up cache directory and files."""
4177
+ import os
4178
+ import shutil
4179
+
4180
+ if hasattr(self, '_preemptive_cache_dir') and self._preemptive_cache_dir:
4181
+ try:
4182
+ shutil.rmtree(self._preemptive_cache_dir, ignore_errors=True)
4183
+ except Exception:
4184
+ pass # Best effort cleanup
4185
+
4186
+ if hasattr(self, '_preemptive_cache'):
4187
+ self._preemptive_cache.clear()
4188
+
4189
    def _evict_cache_if_needed(self, new_item_size: int = 0) -> None:
        """
        Evict least recently used cache items if cache size limits are exceeded.

        NOTE: The caller is responsible for holding ``self._cache_lock`` while
        invoking this method, so that eviction and subsequent cache insertion
        (if any) occur within a single critical section.

        Args:
            new_item_size: Size in bytes of an item about to be inserted;
                eviction also frees headroom for it.
        """
        import time

        # Check if we need to evict based on count or size
        needs_eviction = (
            len(self._preemptive_cache) > StataClient.MAX_CACHE_SIZE or
            self._total_cache_size + new_item_size > StataClient.MAX_CACHE_BYTES
        )

        if not needs_eviction:
            return

        # Sort by access time (oldest first)
        items_by_access = sorted(
            self._cache_access_times.items(),
            key=lambda x: x[1]
        )

        evicted_count = 0
        for graph_name, access_time in items_by_access:
            # Stop as soon as both the entry-count and byte limits allow the
            # pending insertion.
            if (len(self._preemptive_cache) < StataClient.MAX_CACHE_SIZE and
                    self._total_cache_size + new_item_size <= StataClient.MAX_CACHE_BYTES):
                break

            # Remove from cache
            if graph_name in self._preemptive_cache:
                cache_path = self._preemptive_cache[graph_name]

                # Remove file
                try:
                    if os.path.exists(cache_path):
                        os.remove(cache_path)
                except Exception:
                    pass

                # Update tracking
                item_size = self._cache_sizes.get(graph_name, 0)
                del self._preemptive_cache[graph_name]
                del self._cache_access_times[graph_name]
                if graph_name in self._cache_sizes:
                    del self._cache_sizes[graph_name]
                self._total_cache_size -= item_size
                evicted_count += 1

                # Remove hash entry if exists
                hash_key = f"{graph_name}_hash"
                if hash_key in self._preemptive_cache:
                    del self._preemptive_cache[hash_key]

        if evicted_count > 0:
            logger.debug(f"Evicted {evicted_count} items from graph cache due to size limits")
4247
+
4248
+ def _get_content_hash(self, data: bytes) -> str:
4249
+ """Generate content hash for cache validation."""
4250
+ import hashlib
4251
+ return hashlib.md5(data).hexdigest()
4252
+
4253
+ def _sanitize_filename(self, name: str) -> str:
4254
+ """Sanitize graph name for safe file system usage."""
4255
+ import re
4256
+ # Remove or replace problematic characters
4257
+ safe_name = re.sub(r'[<>:"/\\|?*]', '_', name)
4258
+ safe_name = re.sub(r'[^\w\-_.]', '_', safe_name)
4259
+ # Limit length
4260
+ return safe_name[:100] if len(safe_name) > 100 else safe_name
4261
+
4262
+ def _validate_graph_exists(self, graph_name: str) -> bool:
4263
+ """Validate that graph still exists in Stata."""
4264
+ try:
4265
+ # First try to get graph list to verify existence
4266
+ graph_list = self.list_graphs(force_refresh=True)
4267
+ if graph_name not in graph_list:
4268
+ return False
4269
+
4270
+ # Additional validation by attempting to display the graph
4271
+ resolved = self._resolve_graph_name_for_stata(graph_name)
4272
+ cmd = f'quietly graph display {resolved}'
4273
+ resp = self._exec_no_capture_silent(cmd, echo=False)
4274
+ return resp.success
4275
+ except Exception:
4276
+ return False
4277
+
4278
+ def _is_cache_valid(self, graph_name: str, cache_path: str) -> bool:
4279
+ """Check if cached content is still valid using internal signatures."""
4280
+ try:
4281
+ if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:
4282
+ return False
4283
+
4284
+ current_sig = self._get_graph_signature(graph_name)
4285
+ cached_sig = self._preemptive_cache.get(f"{graph_name}_sig")
4286
+
4287
+ # If we have a signature match, it's valid for the current command session
4288
+ if cached_sig and cached_sig == current_sig:
4289
+ return True
4290
+
4291
+ # Otherwise it's invalid (needs refresh for new command)
4292
+ return False
4293
+ except Exception:
4294
+ return False
4295
+
4296
+ def export_graphs_all(self) -> GraphExportResponse:
4297
+ """Exports all graphs to file paths."""
4298
+ exports: List[GraphExport] = []
4299
+ graph_names = self.list_graphs(force_refresh=True)
4300
+
4301
+ if not graph_names:
4302
+ return GraphExportResponse(graphs=exports)
4303
+
4304
+ import tempfile
4305
+ import os
4306
+ import threading
4307
+ import uuid
4308
+ import time
4309
+ import logging
4310
+
4311
+ # Initialize cache in thread-safe manner
4312
+ self._initialize_cache()
4313
+
4314
+ def _cache_keyed_svg_path(name: str) -> str:
4315
+ import hashlib
4316
+ safe_name = self._sanitize_filename(name)
4317
+ suffix = hashlib.md5((name or "").encode("utf-8")).hexdigest()[:8]
4318
+ return os.path.join(self._preemptive_cache_dir, f"{safe_name}_{suffix}.svg")
4319
+
4320
+ def _export_svg_bytes(name: str) -> bytes:
4321
+ resolved = self._resolve_graph_name_for_stata(name)
4322
+
4323
+ temp_dir = get_writable_temp_dir()
4324
+ safe_temp_name = self._sanitize_filename(name)
4325
+ unique_filename = f"{safe_temp_name}_{uuid.uuid4().hex[:8]}_{os.getpid()}_{int(time.time())}.svg"
4326
+ svg_path = os.path.join(temp_dir, unique_filename)
4327
+ svg_path_for_stata = svg_path.replace("\\", "/")
4328
+
4329
+ try:
4330
+ # We use name identifier WITHOUT quotes for Stata 19 compatibility
4331
+ # but we use quotes for the file path.
4332
+ export_cmd = f'quietly graph export "{svg_path_for_stata}", name({resolved}) replace as(svg)'
4333
+ export_resp = self._exec_no_capture_silent(export_cmd, echo=False)
4334
+
4335
+ if not export_resp.success:
4336
+ # Fallback for complex names if the unquoted version failed
4337
+ # but only if it's not a generic r(1)
4338
+ if export_resp.rc != 1:
4339
+ export_cmd_quoted = f'quietly graph export "{svg_path_for_stata}", name("{resolved}") replace as(svg)'
4340
+ export_resp = self._exec_no_capture_silent(export_cmd_quoted, echo=False)
4341
+
4342
+ if not export_resp.success:
4343
+ # Final resort: display and then export active
4344
+ display_cmd = f'quietly graph display {resolved}'
4345
+ display_resp = self._exec_no_capture_silent(display_cmd, echo=False)
4346
+ if display_resp.success:
4347
+ export_cmd2 = f'quietly graph export "{svg_path_for_stata}", replace as(svg)'
4348
+ export_resp = self._exec_no_capture_silent(export_cmd2, echo=False)
4349
+ else:
4350
+ export_resp = display_resp
4351
+
4352
+ if export_resp.success and os.path.exists(svg_path) and os.path.getsize(svg_path) > 0:
4353
+ with open(svg_path, "rb") as f:
4354
+ return f.read()
4355
+
4356
+ # If we reached here, something failed.
4357
+ error_info = getattr(export_resp, 'error', None)
4358
+ error_msg = error_info.message if error_info else f"Stata error r({export_resp.rc})"
4359
+ raise RuntimeError(f"Failed to export graph {name}: {error_msg}")
4360
+ finally:
4361
+ if os.path.exists(svg_path):
4362
+ try:
4363
+ os.remove(svg_path)
4364
+ except OSError as e:
4365
+ logger.warning(f"Failed to cleanup temp file {svg_path}: {e}")
4366
+
4367
+ cached_graphs = {}
4368
+ uncached_graphs = []
4369
+ cache_errors = []
4370
+
4371
+ with self._cache_lock:
4372
+ for name in graph_names:
4373
+ if name in self._preemptive_cache:
4374
+ cached_path = self._preemptive_cache[name]
4375
+ if os.path.exists(cached_path) and os.path.getsize(cached_path) > 0:
4376
+ # Additional validation: check if graph content has changed
4377
+ if self._is_cache_valid(name, cached_path):
4378
+ cached_graphs[name] = cached_path
4379
+ else:
4380
+ uncached_graphs.append(name)
4381
+ # Remove stale cache entry
4382
+ del self._preemptive_cache[name]
4383
+ else:
4384
+ uncached_graphs.append(name)
4385
+ # Remove invalid cache entry
4386
+ if name in self._preemptive_cache:
4387
+ del self._preemptive_cache[name]
4388
+ else:
4389
+ uncached_graphs.append(name)
4390
+
4391
+ for name, cached_path in cached_graphs.items():
4392
+ try:
4393
+ exports.append(GraphExport(name=name, file_path=cached_path))
4394
+ except Exception as e:
4395
+ cache_errors.append(f"Failed to read cached graph {name}: {e}")
4396
+ # Fall back to uncached processing
4397
+ uncached_graphs.append(name)
4398
+
4399
+ if uncached_graphs:
4400
+ successful_graphs = []
4401
+ failed_graphs = []
4402
+ memory_results = {}
4403
+
4404
+ for name in uncached_graphs:
4405
+ try:
4406
+ svg_data = _export_svg_bytes(name)
4407
+ memory_results[name] = svg_data
4408
+ successful_graphs.append(name)
4409
+ except Exception as e:
4410
+ failed_graphs.append(name)
4411
+ cache_errors.append(f"Failed to cache graph {name}: {e}")
4412
+
4413
+ for name in successful_graphs:
4414
+ result = memory_results[name]
4415
+
4416
+ cache_path = _cache_keyed_svg_path(name)
4417
+
4418
+ try:
4419
+ with open(cache_path, 'wb') as f:
4420
+ f.write(result)
4421
+
4422
+ # Update cache with size tracking and eviction
4423
+ import time
4424
+ item_size = len(result)
4425
+ self._evict_cache_if_needed(item_size)
4426
+
4427
+ with self._cache_lock:
4428
+ self._preemptive_cache[name] = cache_path
4429
+ # Store content hash for validation
4430
+ self._preemptive_cache[f"{name}_hash"] = self._get_content_hash(result)
4431
+ # Update tracking
4432
+ self._cache_access_times[name] = time.time()
4433
+ self._cache_sizes[name] = item_size
4434
+ self._total_cache_size += item_size
4435
+
4436
+ exports.append(GraphExport(name=name, file_path=cache_path))
4437
+ except Exception as e:
4438
+ cache_errors.append(f"Failed to cache graph {name}: {e}")
4439
+ # Still return the result even if caching fails
4440
+ # Create temp file for immediate use
4441
+ safe_name = self._sanitize_filename(name)
4442
+ temp_path = os.path.join(get_writable_temp_dir(), f"{safe_name}_{uuid.uuid4().hex[:8]}.svg")
4443
+ with open(temp_path, 'wb') as f:
4444
+ f.write(result)
4445
+ register_temp_file(temp_path)
4446
+ exports.append(GraphExport(name=name, file_path=temp_path))
4447
+
4448
+ # Log errors if any occurred
4449
+ if cache_errors:
4450
+ logger = logging.getLogger(__name__)
4451
+ for error in cache_errors:
4452
+ logger.warning(error)
4453
+
4454
+ return GraphExportResponse(graphs=exports)
4455
+
4456
    def cache_graph_on_creation(self, graph_name: str) -> bool:
        """Pre-emptively export a freshly created graph to the SVG cache.

        Call this right after creating a graph so later export requests can be
        served from disk without waiting on the Stata engine. The cached file
        name embeds the graph signature so clients see a new path when the
        graph content changes.

        Args:
            graph_name: Name of the graph to cache.

        Returns:
            True if the graph was exported and recorded in the cache,
            False on any failure (failures are logged, never raised).
        """
        import os
        import logging
        logger = logging.getLogger("mcp_stata.stata_client")

        # Initialize cache in thread-safe manner
        self._initialize_cache()

        # Invalidate list_graphs cache since a new graph was created
        self.invalidate_list_graphs_cache()

        # Fast path: if a valid cached file already exists, just refresh its
        # LRU access time and return. Stale/invalid entries are purged here
        # (path, access time, size accounting, and the companion hash key).
        with self._cache_lock:
            if graph_name in self._preemptive_cache:
                cache_path = self._preemptive_cache[graph_name]
                if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
                    if self._is_cache_valid(graph_name, cache_path):
                        # Update access time for LRU
                        import time
                        self._cache_access_times[graph_name] = time.time()
                        return True
                    else:
                        # Remove stale cache entry
                        del self._preemptive_cache[graph_name]
                        if graph_name in self._cache_access_times:
                            del self._cache_access_times[graph_name]
                        if graph_name in self._cache_sizes:
                            self._total_cache_size -= self._cache_sizes[graph_name]
                            del self._cache_sizes[graph_name]
                        # Remove hash entry if exists
                        # NOTE(review): hash entries share the same dict as
                        # path entries under the "<name>_hash" key convention.
                        hash_key = f"{graph_name}_hash"
                        if hash_key in self._preemptive_cache:
                            del self._preemptive_cache[hash_key]

        try:
            # Include signature in filename to force client-side refresh:
            # the sanitized signature plus an md5-derived suffix makes the
            # path change whenever the graph content changes.
            import hashlib
            sig = self._get_graph_signature(graph_name)
            safe_name = self._sanitize_filename(sig)
            suffix = hashlib.md5((sig or "").encode("utf-8")).hexdigest()[:8]
            cache_path = os.path.join(self._preemptive_cache_dir, f"{safe_name}_{suffix}.svg")
            # Stata accepts forward slashes on all platforms; normalize so the
            # export command is valid even on Windows.
            cache_path_for_stata = cache_path.replace("\\", "/")

            resolved_graph_name = self._resolve_graph_name_for_stata(graph_name)
            # safe_name is intentionally reused here for the Stata-side name.
            safe_name = resolved_graph_name.strip()

            # The most reliable and efficient strategy for capturing distinct
            # graphs in PyStata background tasks:
            # 1. Ensure the specific graph is active in the Stata engine via
            #    'graph display'.
            # 2. Export with the explicit name() option to ensure isolation.
            # Graph names in Stata should NOT be quoted.
            maintenance = [
                f"quietly graph display {safe_name}",
                f"quietly graph export \"{cache_path_for_stata}\", name({safe_name}) replace as(svg)"
            ]

            resp = self._exec_no_capture_silent("\n".join(maintenance), echo=False)

            # Only record the cache entry if Stata reported success AND the
            # file actually materialized with content.
            if resp.success and os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
                # Read the data to compute hash
                with open(cache_path, 'rb') as f:
                    data = f.read()

                # Make room before inserting so the size cap is respected.
                import time
                item_size = len(data)
                self._evict_cache_if_needed(item_size)

                with self._cache_lock:
                    # Clear any old versions of this graph from the path cache
                    # (Optional but keeps it clean)
                    old_path = self._preemptive_cache.get(graph_name)
                    if old_path and old_path != cache_path:
                        try:
                            os.remove(old_path)
                        except Exception:
                            # Best-effort cleanup; a leftover file is harmless.
                            pass

                    self._preemptive_cache[graph_name] = cache_path
                    # Store content hash for validation
                    self._preemptive_cache[f"{graph_name}_hash"] = self._get_content_hash(data)
                    # Store signature for fast validation
                    self._preemptive_cache[f"{graph_name}_sig"] = self._get_graph_signature(graph_name)
                    # Update LRU/size tracking
                    self._cache_access_times[graph_name] = time.time()
                    self._cache_sizes[graph_name] = item_size
                    self._total_cache_size += item_size

                return True
            else:
                error_msg = getattr(resp, 'error', 'Unknown error')
                logger = logging.getLogger(__name__)
                logger.warning(f"Failed to cache graph {graph_name}: {error_msg}")

        except Exception as e:
            # Caching is opportunistic: swallow and log so graph creation
            # flows are never broken by a cache failure.
            logger = logging.getLogger(__name__)
            logger.warning(f"Exception caching graph {graph_name}: {e}")

        return False
+
4568
    def run_do_file(self, path: str, echo: bool = True, trace: bool = False, max_output_lines: Optional[int] = None, cwd: Optional[str] = None) -> CommandResponse:
        """Execute a Stata do-file and return a structured CommandResponse.

        Output is streamed to a plain-text log while a parallel SMCL log is
        captured; the SMCL log is the authoritative source for return-code
        detection and error extraction, with text-log parsing as fallback.

        Args:
            path: Do-file path; resolved (possibly against ``cwd``) by
                ``_resolve_do_file_path``.
            echo: Whether Stata echoes commands into the log.
            trace: Enable Stata tracing for the run.
            max_output_lines: Accepted for API symmetry with sibling methods;
                stdout is returned empty here so no truncation is applied.
                # NOTE(review): parameter is unused in this method — confirm intended.
            cwd: Working directory for resolution/execution.

        Returns:
            CommandResponse with rc, success flag, SMCL output, and an
            ErrorEnvelope when the run failed.
        """
        effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
        if error_response is not None:
            # Path resolution already produced a complete failure response.
            return error_response

        if not self._initialized:
            self.init()

        start_time = time.time()
        exc: Optional[Exception] = None
        smcl_content = ""
        smcl_path = None

        # Streaming text log (with tail buffer) plus a separate SMCL log.
        _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
        smcl_path = self._create_smcl_log_path()
        smcl_log_name = self._make_smcl_log_name()

        # rc == -1 is the sentinel for "not determined yet".
        rc = -1
        try:
            rc, exc = self._run_streaming_blocking(
                command=command,
                tee=tee,
                cwd=cwd,
                trace=trace,
                echo=echo,
                smcl_path=smcl_path,
                smcl_log_name=smcl_log_name,
                hold_attr="_hold_name_do_sync",
                require_smcl_log=True,
            )
        except Exception as e:
            exc = e
            rc = 1
        finally:
            tee.close()

        # Read SMCL content as the authoritative source
        smcl_content = self._read_smcl_file(smcl_path)
        smcl_content = self._clean_internal_smcl(smcl_content, strip_output=False)

        combined = self._build_combined_log(tail, log_path, rc, trace, exc)

        # Use SMCL content as primary source for RC detection if not already captured
        if rc == -1 and not exc:
            parsed_rc = self._parse_rc_from_smcl(smcl_content)
            if parsed_rc is not None:
                rc = parsed_rc
            else:
                # Fallback to text parsing
                parsed_rc = self._parse_rc_from_text(combined)
                rc = parsed_rc if parsed_rc is not None else 0
        elif exc and rc == 1:
            # Try to parse more specific RC from exception message
            parsed_rc = self._parse_rc_from_text(str(exc))
            if parsed_rc is not None:
                rc = parsed_rc

        # If RC looks wrong but SMCL shows no error markers, treat as success.
        # An explicit r(NNN) in the SMCL takes precedence over the current rc.
        if rc != 0 and smcl_content:
            has_err_tag = "{err}" in smcl_content
            rc_match = re.search(r"(?<!\w)r\((\d+)\)", smcl_content)
            if rc_match:
                try:
                    rc = int(rc_match.group(1))
                except Exception:
                    pass
            else:
                text_rc = None
                try:
                    text_rc = self._parse_rc_from_text(self._smcl_to_text(smcl_content))
                except Exception:
                    text_rc = None
                # No {err} tag and no textual rc evidence -> assume success.
                if not has_err_tag and text_rc is None:
                    rc = 0

        success = (rc == 0 and exc is None)
        error = None

        if not success:
            # Use SMCL as authoritative source for error extraction
            if smcl_content:
                msg, context = self._extract_error_from_smcl(smcl_content, rc)
            else:
                # Fallback to combined log
                msg, context = self._extract_error_and_context(combined, rc)

            error = ErrorEnvelope(
                message=msg,
                rc=rc,
                snippet=context,
                command=command,
                log_path=log_path,
                smcl_output=smcl_content,
            )

        duration = time.time() - start_time
        logger.info(
            "stata.run(do) rc=%s success=%s trace=%s duration_ms=%.2f path=%s",
            rc,
            success,
            trace,
            duration * 1000,
            effective_path,
        )

        # Overwrite the streamed text log with the cleaned SMCL so log_path
        # points at the authoritative record; best-effort only.
        try:
            with open(log_path, "w", encoding="utf-8", errors="replace") as handle:
                handle.write(smcl_content)
        except Exception:
            pass

        return CommandResponse(
            command=command,
            rc=rc,
            stdout="",
            stderr=None,
            log_path=log_path,
            success=success,
            error=error,
            smcl_output=smcl_content,
        )
+
4690
+ def load_data(self, source: str, clear: bool = True, max_output_lines: Optional[int] = None) -> CommandResponse:
4691
+ src = source.strip()
4692
+ clear_suffix = ", clear" if clear else ""
4693
+
4694
+ if src.startswith("sysuse "):
4695
+ cmd = f"{src}{clear_suffix}"
4696
+ elif src.startswith("webuse "):
4697
+ cmd = f"{src}{clear_suffix}"
4698
+ elif src.startswith("use "):
4699
+ cmd = f"{src}{clear_suffix}"
4700
+ elif "://" in src or src.endswith(".dta") or os.path.sep in src:
4701
+ cmd = f'use "{src}"{clear_suffix}'
4702
+ else:
4703
+ cmd = f"sysuse {src}{clear_suffix}"
4704
+
4705
+ result = self._exec_with_capture(cmd, echo=True, trace=False)
4706
+ return self._truncate_command_output(result, max_output_lines)
4707
+
4708
+ def codebook(self, varname: str, trace: bool = False, max_output_lines: Optional[int] = None) -> CommandResponse:
4709
+ result = self._exec_with_capture(f"codebook {varname}", trace=trace)
4710
+ return self._truncate_command_output(result, max_output_lines)