mcp-stata 1.7.6__py3-none-any.whl → 1.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of mcp-stata might be problematic; see the advisory accompanying this diff for more details.

mcp_stata/stata_client.py CHANGED
@@ -1,22 +1,27 @@
1
- import base64
1
+ import asyncio
2
+ import io
3
+ import inspect
2
4
  import json
3
5
  import logging
4
6
  import os
7
+ import platform
5
8
  import re
6
9
  import subprocess
7
10
  import sys
8
- import threading
9
- from importlib.metadata import PackageNotFoundError, version
10
11
  import tempfile
12
+ import threading
11
13
  import time
12
- from contextlib import contextmanager
14
+ import uuid
15
+ from contextlib import contextmanager, redirect_stdout, redirect_stderr
16
+ from importlib.metadata import PackageNotFoundError, version
13
17
  from io import StringIO
14
- from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
18
+ from typing import Any, Awaitable, Callable, Dict, Generator, List, Optional, Tuple
15
19
 
16
20
  import anyio
17
21
  from anyio import get_cancelled_exc_class
18
22
 
19
- from .discovery import find_stata_path
23
+ from .discovery import find_stata_candidates
24
+ from .config import MAX_LIMIT
20
25
  from .models import (
21
26
  CommandResponse,
22
27
  ErrorEnvelope,
@@ -33,6 +38,29 @@ from .graph_detector import StreamingGraphCache
33
38
 
34
39
  logger = logging.getLogger("mcp_stata")
35
40
 
41
+ _POLARS_AVAILABLE: Optional[bool] = None
42
+
43
+ def _check_polars_available() -> bool:
44
+ """
45
+ Check if Polars can be safely imported.
46
+ Must detect problematic platforms BEFORE attempting import,
47
+ since the crash is a fatal signal, not a catchable exception.
48
+ """
49
+ if sys.platform == "win32" and platform.machine().lower() in ("arm64", "aarch64"):
50
+ return False
51
+
52
+ try:
53
+ import polars # noqa: F401
54
+ return True
55
+ except ImportError:
56
+ return False
57
+
58
+
59
+ def _get_polars_available() -> bool:
60
+ global _POLARS_AVAILABLE
61
+ if _POLARS_AVAILABLE is None:
62
+ _POLARS_AVAILABLE = _check_polars_available()
63
+ return _POLARS_AVAILABLE
36
64
 
37
65
  # ============================================================================
38
66
  # MODULE-LEVEL DISCOVERY CACHE
@@ -40,26 +68,30 @@ logger = logging.getLogger("mcp_stata")
40
68
  # This cache ensures Stata discovery runs exactly once per process lifetime
41
69
  _discovery_lock = threading.Lock()
42
70
  _discovery_result: Optional[Tuple[str, str]] = None # (path, edition)
71
+ _discovery_candidates: Optional[List[Tuple[str, str]]] = None
43
72
  _discovery_attempted = False
44
73
  _discovery_error: Optional[Exception] = None
45
74
 
46
75
 
47
- def _get_discovered_stata() -> Tuple[str, str]:
76
+ def _get_discovery_candidates() -> List[Tuple[str, str]]:
48
77
  """
49
- Get the discovered Stata path and edition, running discovery only once.
78
+ Get ordered discovery candidates, running discovery only once.
50
79
 
51
80
  Returns:
52
- Tuple of (stata_executable_path, edition)
81
+ List of (stata_executable_path, edition) ordered by preference.
53
82
 
54
83
  Raises:
55
84
  RuntimeError: If Stata discovery fails
56
85
  """
57
- global _discovery_result, _discovery_attempted, _discovery_error
86
+ global _discovery_result, _discovery_candidates, _discovery_attempted, _discovery_error
58
87
 
59
88
  with _discovery_lock:
60
89
  # If we've already successfully discovered Stata, return cached result
61
90
  if _discovery_result is not None:
62
- return _discovery_result
91
+ return _discovery_candidates or [_discovery_result]
92
+
93
+ if _discovery_candidates is not None:
94
+ return _discovery_candidates
63
95
 
64
96
  # If we've already attempted and failed, re-raise the cached error
65
97
  if _discovery_attempted and _discovery_error is not None:
@@ -83,13 +115,17 @@ def _get_discovered_stata() -> Tuple[str, str]:
83
115
  logger.info("mcp-stata version: %s", pkg_version)
84
116
 
85
117
  # Run discovery
86
- stata_exec_path, edition = find_stata_path()
118
+ candidates = find_stata_candidates()
87
119
 
88
120
  # Cache the successful result
89
- _discovery_result = (stata_exec_path, edition)
90
- logger.info("Discovery found Stata at: %s (%s)", stata_exec_path, edition)
121
+ _discovery_candidates = candidates
122
+ if candidates:
123
+ _discovery_result = candidates[0]
124
+ logger.info("Discovery found Stata at: %s (%s)", _discovery_result[0], _discovery_result[1])
125
+ else:
126
+ raise FileNotFoundError("No Stata candidates discovered")
91
127
 
92
- return _discovery_result
128
+ return candidates
93
129
 
94
130
  except FileNotFoundError as e:
95
131
  _discovery_error = e
@@ -102,23 +138,49 @@ def _get_discovered_stata() -> Tuple[str, str]:
102
138
  ) from e
103
139
 
104
140
 
141
+ def _get_discovered_stata() -> Tuple[str, str]:
142
+ """
143
+ Preserve existing API: return the highest-priority discovered Stata candidate.
144
+ """
145
+ candidates = _get_discovery_candidates()
146
+ if not candidates:
147
+ raise RuntimeError("Stata binary not found: no candidates discovered")
148
+ return candidates[0]
149
+
150
+
105
151
  class StataClient:
106
152
  _initialized = False
107
153
  _exec_lock: threading.Lock
108
154
  _cache_init_lock = threading.Lock() # Class-level lock for cache initialization
109
155
  _is_executing = False # Flag to prevent recursive Stata calls
110
- MAX_DATA_ROWS = 500
156
+ MAX_DATA_ROWS = MAX_LIMIT
111
157
  MAX_GRAPH_BYTES = 50 * 1024 * 1024 # Maximum graph exports (~50MB)
112
158
  MAX_CACHE_SIZE = 100 # Maximum number of graphs to cache
113
159
  MAX_CACHE_BYTES = 500 * 1024 * 1024 # Maximum cache size in bytes (~500MB)
114
160
  LIST_GRAPHS_TTL = 0.075 # TTL for list_graphs cache (75ms)
115
161
 
162
+ def __init__(self):
163
+ self._exec_lock = threading.RLock()
164
+ self._is_executing = False
165
+ self._command_idx = 0 # Counter for user-initiated commands
166
+ self._initialized = False
167
+ from .graph_detector import GraphCreationDetector
168
+ self._graph_detector = GraphCreationDetector(self)
169
+
116
170
  def __new__(cls):
117
171
  inst = super(StataClient, cls).__new__(cls)
118
- inst._exec_lock = threading.Lock()
172
+ inst._exec_lock = threading.RLock()
119
173
  inst._is_executing = False
174
+ inst._command_idx = 0
175
+ from .graph_detector import GraphCreationDetector
176
+ inst._graph_detector = GraphCreationDetector(inst)
120
177
  return inst
121
178
 
179
+ def _increment_command_idx(self) -> int:
180
+ """Increment and return the command counter."""
181
+ self._command_idx += 1
182
+ return self._command_idx
183
+
122
184
  @contextmanager
123
185
  def _redirect_io(self, out_buf, err_buf):
124
186
  """Safely redirect stdout/stderr for the duration of a Stata call."""
@@ -129,38 +191,6 @@ class StataClient:
129
191
  finally:
130
192
  sys.stdout, sys.stderr = backup_stdout, backup_stderr
131
193
 
132
- def _select_stata_error_message(self, text: str, fallback: str) -> str:
133
- """
134
- Helper for tests and legacy callers to extract the clean error message.
135
- """
136
- if not text:
137
- return fallback
138
-
139
- lines = text.splitlines()
140
- trace_pattern = re.compile(r'^\s*[-=.]')
141
- noise_pattern = re.compile(r'^(?:\}|\{txt\}|\{com\}|end of do-file)')
142
-
143
- for line in reversed(lines):
144
- stripped = line.strip()
145
- if not stripped:
146
- continue
147
- if trace_pattern.match(line):
148
- continue
149
- if noise_pattern.match(stripped):
150
- continue
151
- if stripped.startswith("r(") and stripped.endswith(");"):
152
- # If we hit r(123); we might want the line ABOVE it if it's not noise
153
- continue
154
-
155
- # Preserve SMCL tags
156
- return stripped
157
-
158
- # If we couldn't find a better message, try to find r(N);
159
- match = re.search(r"r\(\d+\);", text)
160
- if match:
161
- return match.group(0)
162
-
163
- return fallback
164
194
 
165
195
  @staticmethod
166
196
  def _stata_quote(value: str) -> str:
@@ -181,6 +211,613 @@ class StataClient:
181
211
  finally:
182
212
  sys.stdout, sys.stderr = backup_stdout, backup_stderr
183
213
 
214
+ @staticmethod
215
+ def _safe_unlink(path: str) -> None:
216
+ if not path:
217
+ return
218
+ try:
219
+ if os.path.exists(path):
220
+ os.unlink(path)
221
+ except Exception:
222
+ pass
223
+
224
+ def _create_smcl_log_path(
225
+ self,
226
+ *,
227
+ prefix: str = "mcp_smcl_",
228
+ max_hex: Optional[int] = None,
229
+ base_dir: Optional[str] = None,
230
+ ) -> str:
231
+ hex_id = uuid.uuid4().hex if max_hex is None else uuid.uuid4().hex[:max_hex]
232
+ base = os.path.realpath(tempfile.gettempdir())
233
+ smcl_path = os.path.join(base, f"{prefix}{hex_id}.smcl")
234
+ self._safe_unlink(smcl_path)
235
+ return smcl_path
236
+
237
+ @staticmethod
238
+ def _make_smcl_log_name() -> str:
239
+ return f"_mcp_smcl_{uuid.uuid4().hex[:8]}"
240
+
241
+ def _open_smcl_log(self, smcl_path: str, log_name: str, *, quiet: bool = False) -> bool:
242
+ path_for_stata = smcl_path.replace("\\", "/")
243
+ base_cmd = f"log using \"{path_for_stata}\", replace smcl name({log_name})"
244
+ unnamed_cmd = f"log using \"{path_for_stata}\", replace smcl"
245
+ for attempt in range(4):
246
+ try:
247
+ logger.debug(
248
+ "_open_smcl_log attempt=%s log_name=%s path=%s",
249
+ attempt + 1,
250
+ log_name,
251
+ smcl_path,
252
+ )
253
+ logger.warning(
254
+ "SMCL open attempt %s cwd=%s path=%s",
255
+ attempt + 1,
256
+ os.getcwd(),
257
+ smcl_path,
258
+ )
259
+ logger.debug(
260
+ "SMCL open attempt=%s cwd=%s path=%s cmd=%s",
261
+ attempt + 1,
262
+ os.getcwd(),
263
+ smcl_path,
264
+ base_cmd,
265
+ )
266
+ try:
267
+ close_ret = self.stata.run("capture log close _all", echo=False)
268
+ if close_ret:
269
+ logger.warning("SMCL close_all output: %s", close_ret)
270
+ except Exception:
271
+ pass
272
+ cmd = f"{'quietly ' if quiet else ''}{base_cmd}"
273
+ try:
274
+ output_buf = StringIO()
275
+ with redirect_stdout(output_buf), redirect_stderr(output_buf):
276
+ self.stata.run(cmd, echo=False)
277
+ ret = output_buf.getvalue().strip()
278
+ if ret:
279
+ logger.warning("SMCL log open output: %s", ret)
280
+ except Exception as e:
281
+ logger.warning("SMCL log open failed (attempt %s): %s", attempt + 1, e)
282
+ logger.warning("SMCL log open failed: %r", e)
283
+ try:
284
+ retry_buf = StringIO()
285
+ with redirect_stdout(retry_buf), redirect_stderr(retry_buf):
286
+ self.stata.run(base_cmd, echo=False)
287
+ ret = retry_buf.getvalue().strip()
288
+ if ret:
289
+ logger.warning("SMCL log open output (no quiet): %s", ret)
290
+ except Exception as inner:
291
+ logger.warning("SMCL log open retry failed: %s", inner)
292
+ query_buf = StringIO()
293
+ try:
294
+ with redirect_stdout(query_buf), redirect_stderr(query_buf):
295
+ self.stata.run("log query", echo=False)
296
+ except Exception as query_err:
297
+ query_buf.write(f"log query failed: {query_err!r}")
298
+ query_ret = query_buf.getvalue().strip()
299
+ logger.warning("SMCL log query output: %s", query_ret)
300
+
301
+ if query_ret:
302
+ query_lower = query_ret.lower()
303
+ log_confirmed = "log:" in query_lower and "smcl" in query_lower and " on" in query_lower
304
+ if log_confirmed:
305
+ self._last_smcl_log_named = True
306
+ logger.info("SMCL log confirmed: %s", path_for_stata)
307
+ return True
308
+ logger.warning("SMCL log not confirmed after open; query_ret=%s", query_ret)
309
+ try:
310
+ unnamed_output = StringIO()
311
+ with redirect_stdout(unnamed_output), redirect_stderr(unnamed_output):
312
+ self.stata.run(unnamed_cmd, echo=False)
313
+ unnamed_ret = unnamed_output.getvalue().strip()
314
+ if unnamed_ret:
315
+ logger.warning("SMCL log open output (unnamed): %s", unnamed_ret)
316
+ except Exception as e:
317
+ logger.warning("SMCL log open failed (unnamed, attempt %s): %s", attempt + 1, e)
318
+ unnamed_query_buf = StringIO()
319
+ try:
320
+ with redirect_stdout(unnamed_query_buf), redirect_stderr(unnamed_query_buf):
321
+ self.stata.run("log query", echo=False)
322
+ except Exception as query_err:
323
+ unnamed_query_buf.write(f"log query failed: {query_err!r}")
324
+ unnamed_query = unnamed_query_buf.getvalue().strip()
325
+ if unnamed_query:
326
+ unnamed_lower = unnamed_query.lower()
327
+ unnamed_confirmed = "log:" in unnamed_lower and "smcl" in unnamed_lower and " on" in unnamed_lower
328
+ if unnamed_confirmed:
329
+ self._last_smcl_log_named = False
330
+ logger.info("SMCL log confirmed (unnamed): %s", path_for_stata)
331
+ return True
332
+ except Exception as e:
333
+ logger.warning("Failed to open SMCL log (attempt %s): %s", attempt + 1, e)
334
+ if attempt < 3:
335
+ time.sleep(0.1)
336
+ logger.warning("Failed to open SMCL log with cmd: %s", cmd)
337
+ return False
338
+
339
+ def _close_smcl_log(self, log_name: str) -> None:
340
+ try:
341
+ use_named = getattr(self, "_last_smcl_log_named", None)
342
+ if use_named is False:
343
+ self.stata.run("capture log close", echo=False)
344
+ else:
345
+ self.stata.run(f"capture log close {log_name}", echo=False)
346
+ except Exception:
347
+ pass
348
+
349
+ def _restore_results_from_hold(self, hold_attr: str) -> None:
350
+ if not hasattr(self, hold_attr):
351
+ return
352
+ hold_name = getattr(self, hold_attr)
353
+ try:
354
+ self.stata.run(f"capture _return restore {hold_name}", echo=False)
355
+ self._last_results = self.get_stored_results(force_fresh=True)
356
+ except Exception:
357
+ pass
358
+ finally:
359
+ try:
360
+ delattr(self, hold_attr)
361
+ except Exception:
362
+ pass
363
+
364
+ def _create_streaming_log(self, *, trace: bool) -> tuple[tempfile.NamedTemporaryFile, str, TailBuffer, FileTeeIO]:
365
+ log_file = tempfile.NamedTemporaryFile(
366
+ prefix="mcp_stata_",
367
+ suffix=".log",
368
+ delete=False,
369
+ mode="w",
370
+ encoding="utf-8",
371
+ errors="replace",
372
+ buffering=1,
373
+ )
374
+ log_path = log_file.name
375
+ tail = TailBuffer(max_chars=200000 if trace else 20000)
376
+ tee = FileTeeIO(log_file, tail)
377
+ return log_file, log_path, tail, tee
378
+
379
+ def _init_streaming_graph_cache(
380
+ self,
381
+ auto_cache_graphs: bool,
382
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]],
383
+ notify_log: Callable[[str], Awaitable[None]],
384
+ ) -> Optional[StreamingGraphCache]:
385
+ if not auto_cache_graphs:
386
+ return None
387
+ graph_cache = StreamingGraphCache(self, auto_cache=True)
388
+ graph_cache_callback = self._create_graph_cache_callback(on_graph_cached, notify_log)
389
+ graph_cache.add_cache_callback(graph_cache_callback)
390
+ return graph_cache
391
+
392
+ def _capture_graph_state(
393
+ self,
394
+ graph_cache: Optional[StreamingGraphCache],
395
+ emit_graph_ready: bool,
396
+ ) -> Optional[dict[str, str]]:
397
+ # Capture initial graph state BEFORE execution starts
398
+ if graph_cache:
399
+ # Clear detection state for the new command (detected/removed sets)
400
+ # but preserve _last_graph_state signatures for modification detection.
401
+ graph_cache.detector.clear_detection_state()
402
+ try:
403
+ graph_cache._initial_graphs = set(self.list_graphs(force_refresh=True))
404
+ logger.debug(f"Initial graph state captured: {graph_cache._initial_graphs}")
405
+ except Exception as e:
406
+ logger.debug(f"Failed to capture initial graph state: {e}")
407
+ graph_cache._initial_graphs = set()
408
+
409
+ graph_ready_initial = None
410
+ if emit_graph_ready:
411
+ try:
412
+ graph_ready_initial = {}
413
+ for graph_name in self.list_graphs(force_refresh=True):
414
+ graph_ready_initial[graph_name] = self._get_graph_signature(graph_name)
415
+ logger.debug("Graph-ready initial state captured: %s", set(graph_ready_initial))
416
+ except Exception as e:
417
+ logger.debug("Failed to capture graph-ready state: %s", e)
418
+ graph_ready_initial = {}
419
+ return graph_ready_initial
420
+
421
+ async def _cache_new_graphs(
422
+ self,
423
+ graph_cache: Optional[StreamingGraphCache],
424
+ *,
425
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]],
426
+ total_lines: int,
427
+ completed_label: str,
428
+ ) -> None:
429
+ if not graph_cache or not graph_cache.auto_cache:
430
+ return
431
+ try:
432
+ cached_graphs = []
433
+ # Use detector to find new OR modified graphs
434
+ pystata_detected = await anyio.to_thread.run_sync(graph_cache.detector._detect_graphs_via_pystata)
435
+
436
+ # Combine with any pending graphs in queue
437
+ with graph_cache._lock:
438
+ to_process = set(pystata_detected) | set(graph_cache._graphs_to_cache)
439
+ graph_cache._graphs_to_cache.clear()
440
+
441
+ if to_process:
442
+ logger.info(f"Detected {len(to_process)} new or modified graph(s): {sorted(to_process)}")
443
+
444
+ for graph_name in to_process:
445
+ if graph_name in graph_cache._cached_graphs:
446
+ continue
447
+
448
+ try:
449
+ cache_result = await anyio.to_thread.run_sync(
450
+ self.cache_graph_on_creation,
451
+ graph_name,
452
+ )
453
+ if cache_result:
454
+ cached_graphs.append(graph_name)
455
+ graph_cache._cached_graphs.add(graph_name)
456
+
457
+ for callback in graph_cache._cache_callbacks:
458
+ try:
459
+ result = callback(graph_name, cache_result)
460
+ if inspect.isawaitable(result):
461
+ await result
462
+ except Exception:
463
+ pass
464
+ except Exception as e:
465
+ logger.error(f"Error caching graph {graph_name}: {e}")
466
+
467
+ if cached_graphs and notify_progress:
468
+ await notify_progress(
469
+ float(total_lines) if total_lines > 0 else 1,
470
+ float(total_lines) if total_lines > 0 else 1,
471
+ f"{completed_label} completed. Cached {len(cached_graphs)} graph(s): {', '.join(cached_graphs)}",
472
+ )
473
+ except Exception as e:
474
+ logger.error(f"Post-execution graph detection failed: {e}")
475
+
476
+ def _emit_graph_ready_task(
477
+ self,
478
+ *,
479
+ emit_graph_ready: bool,
480
+ graph_ready_initial: Optional[dict[str, str]],
481
+ notify_log: Callable[[str], Awaitable[None]],
482
+ graph_ready_task_id: Optional[str],
483
+ graph_ready_format: str,
484
+ ) -> None:
485
+ if emit_graph_ready and graph_ready_initial is not None:
486
+ try:
487
+ asyncio.create_task(
488
+ self._emit_graph_ready_events(
489
+ graph_ready_initial,
490
+ notify_log,
491
+ graph_ready_task_id,
492
+ graph_ready_format,
493
+ )
494
+ )
495
+ except Exception as e:
496
+ logger.warning("graph_ready emission failed to start: %s", e)
497
+
498
+ async def _stream_smcl_log(
499
+ self,
500
+ *,
501
+ smcl_path: str,
502
+ notify_log: Callable[[str], Awaitable[None]],
503
+ done: anyio.Event,
504
+ on_chunk: Optional[Callable[[str], Awaitable[None]]] = None,
505
+ ) -> None:
506
+ last_pos = 0
507
+ emitted_debug_chunks = 0
508
+ # Wait for Stata to create the SMCL file
509
+ while not done.is_set() and not os.path.exists(smcl_path):
510
+ await anyio.sleep(0.05)
511
+
512
+ try:
513
+ def _read_content() -> str:
514
+ try:
515
+ with open(smcl_path, "r", encoding="utf-8", errors="replace") as f:
516
+ f.seek(last_pos)
517
+ return f.read()
518
+ except PermissionError:
519
+ if os.name == "nt":
520
+ try:
521
+ res = subprocess.run(f'type "{smcl_path}"', shell=True, capture_output=True)
522
+ full_content = res.stdout.decode("utf-8", errors="replace")
523
+ if len(full_content) > last_pos:
524
+ return full_content[last_pos:]
525
+ return ""
526
+ except Exception:
527
+ return ""
528
+ return ""
529
+ except FileNotFoundError:
530
+ return ""
531
+
532
+ while not done.is_set():
533
+ chunk = await anyio.to_thread.run_sync(_read_content)
534
+ if chunk:
535
+ last_pos += len(chunk)
536
+ try:
537
+ await notify_log(chunk)
538
+ except Exception as exc:
539
+ logger.debug("notify_log failed: %s", exc)
540
+ if on_chunk is not None:
541
+ try:
542
+ await on_chunk(chunk)
543
+ except Exception as exc:
544
+ logger.debug("on_chunk callback failed: %s", exc)
545
+ await anyio.sleep(0.05)
546
+
547
+ chunk = await anyio.to_thread.run_sync(_read_content)
548
+ if on_chunk is not None:
549
+ # Final check even if last chunk is empty, to ensure
550
+ # graphs created at the very end are detected.
551
+ try:
552
+ await on_chunk(chunk or "")
553
+ except Exception as exc:
554
+ logger.debug("final on_chunk check failed: %s", exc)
555
+
556
+ if chunk:
557
+ last_pos += len(chunk)
558
+ try:
559
+ await notify_log(chunk)
560
+ except Exception as exc:
561
+ logger.debug("notify_log failed: %s", exc)
562
+
563
+ except Exception as e:
564
+ logger.warning(f"Log streaming failed: {e}")
565
+
566
+ def _run_streaming_blocking(
567
+ self,
568
+ *,
569
+ command: str,
570
+ tee: FileTeeIO,
571
+ cwd: Optional[str],
572
+ trace: bool,
573
+ echo: bool,
574
+ smcl_path: str,
575
+ smcl_log_name: str,
576
+ hold_attr: str,
577
+ require_smcl_log: bool = False,
578
+ ) -> tuple[int, Optional[Exception]]:
579
+ rc = -1
580
+ exc: Optional[Exception] = None
581
+ with self._exec_lock:
582
+ self._is_executing = True
583
+ try:
584
+ from sfi import Scalar, SFIToolkit # Import SFI tools
585
+ with self._temp_cwd(cwd):
586
+ logger.debug(
587
+ "opening SMCL log name=%s path=%s cwd=%s",
588
+ smcl_log_name,
589
+ smcl_path,
590
+ os.getcwd(),
591
+ )
592
+ try:
593
+ log_opened = self._open_smcl_log(smcl_path, smcl_log_name, quiet=True)
594
+ except Exception as e:
595
+ log_opened = False
596
+ logger.warning("_open_smcl_log raised: %r", e)
597
+ logger.info("SMCL log_opened=%s path=%s", log_opened, smcl_path)
598
+ if require_smcl_log and not log_opened:
599
+ exc = RuntimeError("Failed to open SMCL log")
600
+ logger.error("SMCL log open failed for %s", smcl_path)
601
+ rc = 1
602
+ if exc is None:
603
+ try:
604
+ with self._redirect_io_streaming(tee, tee):
605
+ try:
606
+ if trace:
607
+ self.stata.run("set trace on")
608
+ logger.debug("running Stata command echo=%s: %s", echo, command)
609
+ ret = self.stata.run(command, echo=echo)
610
+ if ret:
611
+ logger.debug("stata.run output: %s", ret)
612
+
613
+ setattr(self, hold_attr, f"mcp_hold_{uuid.uuid4().hex[:8]}")
614
+ self.stata.run(
615
+ f"capture _return hold {getattr(self, hold_attr)}",
616
+ echo=False,
617
+ )
618
+
619
+ if isinstance(ret, str) and ret:
620
+ try:
621
+ tee.write(ret)
622
+ except Exception:
623
+ pass
624
+ try:
625
+ rc = self._get_rc_from_scalar(Scalar)
626
+ except Exception:
627
+ pass
628
+ except Exception as e:
629
+ exc = e
630
+ logger.error("stata.run failed: %r", e)
631
+ if rc in (-1, 0):
632
+ rc = 1
633
+ finally:
634
+ if trace:
635
+ try:
636
+ self.stata.run("set trace off")
637
+ except Exception:
638
+ pass
639
+ finally:
640
+ self._close_smcl_log(smcl_log_name)
641
+ self._restore_results_from_hold(hold_attr)
642
+ return rc, exc
643
+ # If we get here, SMCL log failed and we're required to stop.
644
+ return rc, exc
645
+ finally:
646
+ self._is_executing = False
647
+ return rc, exc
648
+
649
+ def _resolve_do_file_path(
650
+ self,
651
+ path: str,
652
+ cwd: Optional[str],
653
+ ) -> tuple[Optional[str], Optional[str], Optional[CommandResponse]]:
654
+ if cwd is not None and not os.path.isdir(cwd):
655
+ return None, None, CommandResponse(
656
+ command=f'do "{path}"',
657
+ rc=601,
658
+ stdout="",
659
+ stderr=None,
660
+ success=False,
661
+ error=ErrorEnvelope(
662
+ message=f"cwd not found: {cwd}",
663
+ rc=601,
664
+ command=path,
665
+ ),
666
+ )
667
+
668
+ effective_path = path
669
+ if cwd is not None and not os.path.isabs(path):
670
+ effective_path = os.path.abspath(os.path.join(cwd, path))
671
+
672
+ if not os.path.exists(effective_path):
673
+ return None, None, CommandResponse(
674
+ command=f'do "{effective_path}"',
675
+ rc=601,
676
+ stdout="",
677
+ stderr=None,
678
+ success=False,
679
+ error=ErrorEnvelope(
680
+ message=f"Do-file not found: {effective_path}",
681
+ rc=601,
682
+ command=effective_path,
683
+ ),
684
+ )
685
+
686
+ path_for_stata = effective_path.replace("\\", "/")
687
+ command = f'do "{path_for_stata}"'
688
+ return effective_path, command, None
689
+
690
+ @contextmanager
691
+ def _smcl_log_capture(self) -> "Generator[Tuple[str, str], None, None]":
692
+ """
693
+ Context manager that wraps command execution in a named SMCL log.
694
+
695
+ This runs alongside any user logs (named logs can coexist).
696
+ Yields (log_name, log_path) tuple for use within the context.
697
+ The SMCL file is NOT deleted automatically - caller should clean up.
698
+
699
+ Usage:
700
+ with self._smcl_log_capture() as (log_name, smcl_path):
701
+ self.stata.run(cmd)
702
+ # After context, read smcl_path for raw SMCL output
703
+ """
704
+ # Use a unique name but DO NOT join start with mkstemp to avoid existing file locks.
705
+ # Stata will create the file.
706
+ smcl_path = self._create_smcl_log_path()
707
+ # Unique log name to avoid collisions with user logs
708
+ log_name = self._make_smcl_log_name()
709
+
710
+ try:
711
+ # Open named SMCL log (quietly to avoid polluting output)
712
+ log_opened = self._open_smcl_log(smcl_path, log_name, quiet=True)
713
+ if not log_opened:
714
+ # Still yield, consumer might see empty file or handle error,
715
+ # but we can't do much if Stata refuses to log.
716
+ pass
717
+
718
+ yield log_name, smcl_path
719
+ finally:
720
+ # Always close our named log
721
+ self._close_smcl_log(log_name)
722
+
723
+ def _read_smcl_file(self, path: str) -> str:
724
+ """Read SMCL file contents, handling encoding issues and Windows file locks."""
725
+ try:
726
+ with open(path, 'r', encoding='utf-8', errors='replace') as f:
727
+ return f.read()
728
+ except PermissionError:
729
+ if os.name == "nt":
730
+ # Windows Fallback: Try to use 'type' command to bypass exclusive lock
731
+ try:
732
+ res = subprocess.run(f'type "{path}"', shell=True, capture_output=True)
733
+ if res.returncode == 0:
734
+ return res.stdout.decode('utf-8', errors='replace')
735
+ except Exception as e:
736
+ logger.debug(f"Combined fallback read failed: {e}")
737
+ logger.warning(f"Failed to read SMCL file {path} due to lock")
738
+ return ""
739
+ except Exception as e:
740
+ logger.warning(f"Failed to read SMCL file {path}: {e}")
741
+ return ""
742
+
743
+ def _extract_error_from_smcl(self, smcl_content: str, rc: int) -> Tuple[str, str]:
744
+ """
745
+ Extract error message and context from raw SMCL output.
746
+
747
+ Uses {err} tags as the authoritative source for error detection.
748
+
749
+ Returns:
750
+ Tuple of (error_message, context_string)
751
+ """
752
+ if not smcl_content:
753
+ return f"Stata error r({rc})", ""
754
+
755
+ lines = smcl_content.splitlines()
756
+
757
+ # Search backwards for {err} tags - they indicate error lines
758
+ error_lines = []
759
+ error_start_idx = -1
760
+
761
+ for i in range(len(lines) - 1, -1, -1):
762
+ line = lines[i]
763
+ if '{err}' in line:
764
+ if error_start_idx == -1:
765
+ error_start_idx = i
766
+ # Walk backwards to find consecutive {err} lines
767
+ j = i
768
+ while j >= 0 and '{err}' in lines[j]:
769
+ error_lines.insert(0, lines[j])
770
+ j -= 1
771
+ break
772
+
773
+ if error_lines:
774
+ # Clean SMCL tags from error message
775
+ clean_lines = []
776
+ for line in error_lines:
777
+ # Remove SMCL tags but keep the text content
778
+ cleaned = re.sub(r'\{[^}]*\}', '', line).strip()
779
+ if cleaned:
780
+ clean_lines.append(cleaned)
781
+
782
+ error_msg = " ".join(clean_lines) or f"Stata error r({rc})"
783
+
784
+ # Context is everything from error start to end
785
+ context_start = max(0, error_start_idx - 5) # Include 5 lines before error
786
+ context = "\n".join(lines[context_start:])
787
+
788
+ return error_msg, context
789
+
790
+ # Fallback: no {err} found, return last 30 lines as context
791
+ context_start = max(0, len(lines) - 30)
792
+ context = "\n".join(lines[context_start:])
793
+
794
+ return f"Stata error r({rc})", context
795
+
796
+ def _parse_rc_from_smcl(self, smcl_content: str) -> Optional[int]:
797
+ """Parse return code from SMCL content using specific structural patterns."""
798
+ if not smcl_content:
799
+ return None
800
+
801
+ # 1. Primary check: SMCL search tag {search r(N), ...}
802
+ # This is the most authoritative interactive indicator
803
+ matches = list(re.finditer(r'\{search r\((\d+)\)', smcl_content))
804
+ if matches:
805
+ try:
806
+ return int(matches[-1].group(1))
807
+ except Exception:
808
+ pass
809
+
810
+ # 2. Secondary check: Standalone r(N); pattern
811
+ # This appears at the end of command blocks
812
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', smcl_content))
813
+ if matches:
814
+ try:
815
+ return int(matches[-1].group(1))
816
+ except Exception:
817
+ pass
818
+
819
+ return None
820
+
184
821
  @staticmethod
185
822
  def _create_graph_cache_callback(on_graph_cached, notify_log):
186
823
  """Create a standardized graph cache callback with proper error handling."""
@@ -203,6 +840,159 @@ class StataClient:
203
840
 
204
841
  return graph_cache_callback
205
842
 
843
+ def _get_cached_graph_path(self, graph_name: str) -> Optional[str]:
844
+ if not hasattr(self, "_cache_lock") or not hasattr(self, "_preemptive_cache"):
845
+ return None
846
+ try:
847
+ with self._cache_lock:
848
+ cache_path = self._preemptive_cache.get(graph_name)
849
+ if not cache_path:
850
+ return None
851
+
852
+ # Double-check validity (e.g. signature match for current command)
853
+ if not self._is_cache_valid(graph_name, cache_path):
854
+ return None
855
+
856
+ return cache_path
857
+ except Exception:
858
+ return None
859
+
860
+ async def _emit_graph_ready_for_graphs(
861
+ self,
862
+ graph_names: List[str],
863
+ *,
864
+ notify_log: Callable[[str], Awaitable[None]],
865
+ task_id: Optional[str],
866
+ export_format: str,
867
+ graph_ready_initial: Optional[dict[str, str]],
868
+ ) -> None:
869
+ if not graph_names:
870
+ return
871
+ fmt = (export_format or "svg").strip().lower()
872
+ for graph_name in graph_names:
873
+ signature = self._get_graph_signature(graph_name)
874
+ if graph_ready_initial is not None:
875
+ previous = graph_ready_initial.get(graph_name)
876
+ if previous is not None and previous == signature:
877
+ continue
878
+ try:
879
+ export_path = None
880
+ if fmt == "svg":
881
+ export_path = self._get_cached_graph_path(graph_name)
882
+ if not export_path:
883
+ export_path = await anyio.to_thread.run_sync(
884
+ lambda: self.export_graph(graph_name, format=fmt)
885
+ )
886
+ payload = {
887
+ "event": "graph_ready",
888
+ "task_id": task_id,
889
+ "graph": {
890
+ "name": graph_name,
891
+ "path": export_path,
892
+ "label": graph_name,
893
+ },
894
+ }
895
+ await notify_log(json.dumps(payload))
896
+ if graph_ready_initial is not None:
897
+ graph_ready_initial[graph_name] = signature
898
+ except Exception as e:
899
+ logger.warning("graph_ready export failed for %s: %s", graph_name, e)
900
+
901
+ async def _maybe_cache_graphs_on_chunk(
902
+ self,
903
+ *,
904
+ graph_cache: Optional[StreamingGraphCache],
905
+ emit_graph_ready: bool,
906
+ notify_log: Callable[[str], Awaitable[None]],
907
+ graph_ready_task_id: Optional[str],
908
+ graph_ready_format: str,
909
+ graph_ready_initial: Optional[dict[str, str]],
910
+ last_check: List[float],
911
+ force: bool = False,
912
+ ) -> None:
913
+ if not graph_cache or not graph_cache.auto_cache:
914
+ return
915
+ if self._is_executing and not force:
916
+ # Skip polling if Stata is busy; it will block on _exec_lock anyway.
917
+ # During final check (force=True), we know it's safe because _run_streaming_blocking has finished.
918
+ return
919
+ now = time.monotonic()
920
+ if not force and last_check and now - last_check[0] < 0.25:
921
+ return
922
+ if last_check:
923
+ last_check[0] = now
924
+ try:
925
+ cached_names = await graph_cache.cache_detected_graphs_with_pystata()
926
+ except Exception as e:
927
+ logger.debug("graph_ready polling failed: %s", e)
928
+ return
929
+ if emit_graph_ready and cached_names:
930
+ await self._emit_graph_ready_for_graphs(
931
+ cached_names,
932
+ notify_log=notify_log,
933
+ task_id=graph_ready_task_id,
934
+ export_format=graph_ready_format,
935
+ graph_ready_initial=graph_ready_initial,
936
+ )
937
+
938
+ async def _emit_graph_ready_events(
939
+ self,
940
+ initial_graphs: dict[str, str],
941
+ notify_log: Callable[[str], Awaitable[None]],
942
+ task_id: Optional[str],
943
+ export_format: str,
944
+ ) -> None:
945
+ try:
946
+ current_graphs = list(self.list_graphs(force_refresh=True))
947
+ except Exception as e:
948
+ logger.warning("graph_ready: list_graphs failed: %s", e)
949
+ return
950
+
951
+ if not current_graphs:
952
+ return
953
+
954
+ for graph_name in current_graphs:
955
+ signature = self._get_graph_signature(graph_name)
956
+ previous = initial_graphs.get(graph_name)
957
+ if previous is not None and previous == signature:
958
+ continue
959
+ try:
960
+ export_path = None
961
+ if export_format == "svg":
962
+ export_path = self._get_cached_graph_path(graph_name)
963
+
964
+ if not export_path:
965
+ export_path = await anyio.to_thread.run_sync(
966
+ lambda: self.export_graph(graph_name, format=export_format)
967
+ )
968
+ payload = {
969
+ "event": "graph_ready",
970
+ "task_id": task_id,
971
+ "graph": {
972
+ "name": graph_name,
973
+ "path": export_path,
974
+ "label": graph_name,
975
+ },
976
+ }
977
+ await notify_log(json.dumps(payload))
978
+ initial_graphs[graph_name] = signature
979
+ except Exception as e:
980
+ logger.warning("graph_ready export failed for %s: %s", graph_name, e)
981
+
982
+ def _get_graph_signature(self, graph_name: str) -> str:
983
+ """
984
+ Get a stable signature for a graph without calling Stata.
985
+ Consistent with GraphCreationDetector implementation.
986
+ """
987
+ if not graph_name:
988
+ return ""
989
+ cmd_idx = getattr(self, "_command_idx", 0)
990
+ # Only include command index for default 'Graph' to detect modifications.
991
+ # For named graphs, we only want to detect them when they are new or renamed.
992
+ if graph_name.lower() == "graph":
993
+ return f"{graph_name}_{cmd_idx}"
994
+ return graph_name
995
+
206
996
  def _request_break_in(self) -> None:
207
997
  """
208
998
  Attempt to interrupt a running Stata command when cancellation is requested.
@@ -272,72 +1062,199 @@ class StataClient:
272
1062
  finally:
273
1063
  os.chdir(prev)
274
1064
 
1065
+ @contextmanager
1066
+ def _safe_redirect_fds(self):
1067
+ """Redirects fd 1 (stdout) to fd 2 (stderr) at the OS level."""
1068
+ # Save original stdout fd
1069
+ try:
1070
+ stdout_fd = os.dup(1)
1071
+ except Exception:
1072
+ # Fallback if we can't dup (e.g. strange environment)
1073
+ yield
1074
+ return
1075
+
1076
+ try:
1077
+ # Redirect OS-level stdout to stderr
1078
+ os.dup2(2, 1)
1079
+ yield
1080
+ finally:
1081
+ # Restore stdout
1082
+ try:
1083
+ os.dup2(stdout_fd, 1)
1084
+ os.close(stdout_fd)
1085
+ except Exception:
1086
+ pass
1087
+
275
1088
  def init(self):
276
1089
  """Initializes usage of pystata using cached discovery results."""
277
1090
  if self._initialized:
278
1091
  return
279
1092
 
1093
+ # Suppress any non-UTF8 banner output from PyStata on stdout, which breaks MCP stdio transport
1094
+ from contextlib import redirect_stdout, redirect_stderr
1095
+
280
1096
  try:
281
1097
  import stata_setup
282
1098
 
283
- # Get discovered Stata path (cached from first call)
284
- stata_exec_path, edition = _get_discovered_stata()
285
-
286
- candidates = []
287
-
288
- # Prefer the binary directory first (documented input for stata_setup)
289
- bin_dir = os.path.dirname(stata_exec_path)
290
- if bin_dir:
291
- candidates.append(bin_dir)
292
-
293
- # 2. App Bundle: .../StataMP.app (macOS only)
294
- curr = bin_dir
295
- app_bundle = None
296
- while len(curr) > 1:
297
- if curr.endswith(".app"):
298
- app_bundle = curr
299
- break
300
- parent = os.path.dirname(curr)
301
- if parent == curr: # Reached root directory, prevent infinite loop on Windows
302
- break
303
- curr = parent
304
-
305
- if app_bundle:
306
- candidates.insert(0, os.path.dirname(app_bundle))
307
- candidates.insert(1, app_bundle)
1099
+ # Get discovered Stata paths (cached from first call)
1100
+ discovery_candidates = _get_discovery_candidates()
1101
+ if not discovery_candidates:
1102
+ raise RuntimeError("No Stata candidates found during discovery")
1103
+
1104
+ logger.info("Initializing Stata engine (attempting up to %d candidate binaries)...", len(discovery_candidates))
308
1105
 
309
- # Deduplicate preserving order
310
- seen = set()
311
- deduped = []
312
- for c in candidates:
313
- if c in seen:
314
- continue
315
- seen.add(c)
316
- deduped.append(c)
317
- candidates = deduped
1106
+ # Diagnostic: force faulthandler to output to stderr for C crashes
1107
+ import faulthandler
1108
+ faulthandler.enable(file=sys.stderr)
1109
+ import subprocess
318
1110
 
319
1111
  success = False
320
- for path in candidates:
321
- try:
322
- stata_setup.config(path, edition)
323
- success = True
324
- logger.debug("stata_setup.config succeeded with path: %s", path)
1112
+ last_error = None
1113
+ chosen_exec: Optional[Tuple[str, str]] = None
1114
+
1115
+ for stata_exec_path, edition in discovery_candidates:
1116
+ candidates = []
1117
+ # Prefer the binary directory first (documented input for stata_setup)
1118
+ bin_dir = os.path.dirname(stata_exec_path)
1119
+
1120
+ # 2. App Bundle: .../StataMP.app (macOS only)
1121
+ curr = bin_dir
1122
+ app_bundle = None
1123
+ while len(curr) > 1:
1124
+ if curr.endswith(".app"):
1125
+ app_bundle = curr
1126
+ break
1127
+ parent = os.path.dirname(curr)
1128
+ if parent == curr:
1129
+ break
1130
+ curr = parent
1131
+
1132
+ ordered_candidates = []
1133
+ if app_bundle:
1134
+ # On macOS, the parent of the .app is often the correct install path
1135
+ # (e.g., /Applications/StataNow containing StataMP.app)
1136
+ parent_dir = os.path.dirname(app_bundle)
1137
+ if parent_dir and parent_dir != "/":
1138
+ ordered_candidates.append(parent_dir)
1139
+ ordered_candidates.append(app_bundle)
1140
+
1141
+ if bin_dir:
1142
+ ordered_candidates.append(bin_dir)
1143
+
1144
+ # Deduplicate preserving order
1145
+ seen = set()
1146
+ candidates = []
1147
+ for c in ordered_candidates:
1148
+ if c not in seen:
1149
+ seen.add(c)
1150
+ candidates.append(c)
1151
+
1152
+ for path in candidates:
1153
+ try:
1154
+ # 1. Pre-flight check in a subprocess to capture hard exits/crashes
1155
+ sys.stderr.write(f"[mcp_stata] DEBUG: Pre-flight check for path '{path}'\n")
1156
+ sys.stderr.flush()
1157
+
1158
+ preflight_code = f"""
1159
+ import sys
1160
+ import stata_setup
1161
+ from contextlib import redirect_stdout, redirect_stderr
1162
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
1163
+ try:
1164
+ stata_setup.config({repr(path)}, {repr(edition)})
1165
+ from pystata import stata
1166
+ # Minimal verification of engine health
1167
+ stata.run('display 1', echo=False)
1168
+ print('PREFLIGHT_OK')
1169
+ except Exception as e:
1170
+ print(f'PREFLIGHT_FAIL: {{e}}', file=sys.stderr)
1171
+ sys.exit(1)
1172
+ """
1173
+
1174
+ try:
1175
+ # Use shorter timeout for pre-flight if feasible,
1176
+ # but keep it safe for slow environments. 15s is usually enough for a ping.
1177
+ res = subprocess.run(
1178
+ [sys.executable, "-c", preflight_code],
1179
+ capture_output=True, text=True, timeout=20
1180
+ )
1181
+ if res.returncode != 0:
1182
+ sys.stderr.write(f"[mcp_stata] Pre-flight failed (rc={res.returncode}) for '{path}'\n")
1183
+ if res.stdout.strip():
1184
+ sys.stderr.write(f"--- Pre-flight stdout ---\n{res.stdout.strip()}\n")
1185
+ if res.stderr.strip():
1186
+ sys.stderr.write(f"--- Pre-flight stderr ---\n{res.stderr.strip()}\n")
1187
+ sys.stderr.flush()
1188
+ last_error = f"Pre-flight failed: {res.stdout.strip()} {res.stderr.strip()}"
1189
+ continue
1190
+ else:
1191
+ sys.stderr.write(f"[mcp_stata] Pre-flight succeeded for '{path}'. Proceeding to in-process init.\n")
1192
+ sys.stderr.flush()
1193
+ except Exception as pre_e:
1194
+ sys.stderr.write(f"[mcp_stata] Pre-flight execution error for '{path}': {repr(pre_e)}\n")
1195
+ sys.stderr.flush()
1196
+ last_error = pre_e
1197
+ continue
1198
+
1199
+ msg = f"[mcp_stata] DEBUG: In-process stata_setup.config('{path}', '{edition}')\n"
1200
+ sys.stderr.write(msg)
1201
+ sys.stderr.flush()
1202
+ # Redirect both sys.stdout/err AND the raw fds to our stderr pipe.
1203
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
1204
+ stata_setup.config(path, edition)
1205
+
1206
+ sys.stderr.write(f"[mcp_stata] DEBUG: stata_setup.config succeeded for path: {path}\n")
1207
+ sys.stderr.flush()
1208
+ success = True
1209
+ chosen_exec = (stata_exec_path, edition)
1210
+ logger.info("stata_setup.config succeeded with path: %s", path)
1211
+ break
1212
+ except BaseException as e:
1213
+ last_error = e
1214
+ sys.stderr.write(f"[mcp_stata] WARNING: In-process stata_setup.config caught: {repr(e)}\n")
1215
+ sys.stderr.flush()
1216
+ logger.warning("stata_setup.config failed for path '%s': %s", path, e)
1217
+ if isinstance(e, SystemExit):
1218
+ break
1219
+ continue
1220
+
1221
+ if success:
1222
+ # Cache winning candidate for subsequent lookups
1223
+ global _discovery_result
1224
+ if chosen_exec:
1225
+ _discovery_result = chosen_exec
325
1226
  break
326
- except Exception:
327
- continue
328
1227
 
329
1228
  if not success:
330
- raise RuntimeError(
331
- f"stata_setup.config failed. Tried: {candidates}. "
332
- f"Derived from binary: {stata_exec_path}"
1229
+ error_msg = (
1230
+ f"stata_setup.config failed to initialize Stata. "
1231
+ f"Tried candidates: {discovery_candidates}. "
1232
+ f"Last error: {repr(last_error)}"
333
1233
  )
1234
+ sys.stderr.write(f"[mcp_stata] ERROR: {error_msg}\n")
1235
+ sys.stderr.flush()
1236
+ logger.error(error_msg)
1237
+ raise RuntimeError(error_msg)
334
1238
 
335
1239
  # Cache the binary path for later use (e.g., PNG export on Windows)
336
1240
  self._stata_exec_path = os.path.abspath(stata_exec_path)
337
1241
 
338
- from pystata import stata # type: ignore[import-not-found]
339
- self.stata = stata
340
- self._initialized = True
1242
+ try:
1243
+ sys.stderr.write("[mcp_stata] DEBUG: Importing pystata and warming up...\n")
1244
+ sys.stderr.flush()
1245
+ with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr), self._safe_redirect_fds():
1246
+ from pystata import stata # type: ignore[import-not-found]
1247
+ # Warm up the engine and swallow any late splash screen output
1248
+ stata.run("display 1", echo=False)
1249
+ self.stata = stata
1250
+ self._initialized = True
1251
+ sys.stderr.write("[mcp_stata] DEBUG: pystata warmed up successfully\n")
1252
+ sys.stderr.flush()
1253
+ except BaseException as e:
1254
+ sys.stderr.write(f"[mcp_stata] ERROR: Failed to load pystata or run initial command: {repr(e)}\n")
1255
+ sys.stderr.flush()
1256
+ logger.error("Failed to load pystata or run initial command: %s", e)
1257
+ raise
341
1258
 
342
1259
  # Initialize list_graphs TTL cache
343
1260
  self._list_graphs_cache = None
@@ -408,32 +1325,11 @@ class StataClient:
408
1325
 
409
1326
  return pat.sub(repl, code)
410
1327
 
411
- def _read_return_code(self) -> int:
412
- """Read the last Stata return code without mutating rc."""
413
- try:
414
- from sfi import Macro # type: ignore[import-not-found]
415
- rc_val = Macro.getCValue("rc") # type: ignore[attr-defined]
416
- if rc_val is not None:
417
- return int(float(rc_val))
418
- # If getCValue returns None, fall through to the alternative approach
419
- except Exception:
420
- pass
421
-
422
- # Alternative approach: use a global macro
423
- # CRITICAL: This must be done carefully to avoid mutating c(rc)
424
- try:
425
- self.stata.run("global MCP_RC = c(rc)")
426
- from sfi import Macro as Macro2 # type: ignore[import-not-found]
427
- rc_val = Macro2.getGlobal("MCP_RC")
428
- return int(float(rc_val))
429
- except Exception:
430
- return -1
431
-
432
1328
  def _get_rc_from_scalar(self, Scalar) -> int:
433
1329
  """Safely get return code, handling None values."""
434
1330
  try:
435
1331
  from sfi import Macro
436
- rc_val = Macro.getCValue("rc")
1332
+ rc_val = Macro.getGlobal("_rc")
437
1333
  if rc_val is None:
438
1334
  return -1
439
1335
  return int(float(rc_val))
@@ -441,12 +1337,27 @@ class StataClient:
441
1337
  return -1
442
1338
 
443
1339
  def _parse_rc_from_text(self, text: str) -> Optional[int]:
444
- match = re.search(r"r\((\d+)\)", text)
445
- if match:
1340
+ """Parse return code from plain text using structural patterns."""
1341
+ if not text:
1342
+ return None
1343
+
1344
+ # 1. Primary check: 'search r(N)' pattern (SMCL tag potentially stripped)
1345
+ matches = list(re.finditer(r'search r\((\d+)\)', text))
1346
+ if matches:
446
1347
  try:
447
- return int(match.group(1))
1348
+ return int(matches[-1].group(1))
448
1349
  except Exception:
449
- return None
1350
+ pass
1351
+
1352
+ # 2. Secondary check: Standalone r(N); pattern
1353
+ # This appears at the end of command blocks
1354
+ matches = list(re.finditer(r'(?<!\w)r\((\d+)\);?', text))
1355
+ if matches:
1356
+ try:
1357
+ return int(matches[-1].group(1))
1358
+ except Exception:
1359
+ pass
1360
+
450
1361
  return None
451
1362
 
452
1363
  def _parse_line_from_text(self, text: str) -> Optional[int]:
@@ -458,11 +1369,104 @@ class StataClient:
458
1369
  return None
459
1370
  return None
460
1371
 
1372
+ def _read_log_backwards_until_error(self, path: str, max_bytes: int = 5_000_000) -> str:
1373
+ """
1374
+ Read log file backwards in chunks, stopping when we find {err} tags or reach the start.
1375
+
1376
+ This is more efficient and robust than reading huge fixed tails, as we only read
1377
+ what we need to find the error.
1378
+
1379
+ Args:
1380
+ path: Path to the log file
1381
+ max_bytes: Maximum total bytes to read (safety limit, default 5MB)
1382
+
1383
+ Returns:
1384
+ The relevant portion of the log containing the error and context
1385
+ """
1386
+ try:
1387
+ chunk_size = 50_000 # Read 50KB chunks at a time
1388
+ total_read = 0
1389
+ chunks = []
1390
+
1391
+ with open(path, 'rb') as f:
1392
+ # Get file size
1393
+ f.seek(0, os.SEEK_END)
1394
+ file_size = f.tell()
1395
+
1396
+ if file_size == 0:
1397
+ return ""
1398
+
1399
+ # Start from the end
1400
+ position = file_size
1401
+
1402
+ while position > 0 and total_read < max_bytes:
1403
+ # Calculate how much to read in this chunk
1404
+ read_size = min(chunk_size, position, max_bytes - total_read)
1405
+ position -= read_size
1406
+
1407
+ # Seek and read
1408
+ f.seek(position)
1409
+ chunk = f.read(read_size)
1410
+ chunks.insert(0, chunk)
1411
+ total_read += read_size
1412
+
1413
+ # Decode and check for error tags
1414
+ try:
1415
+ accumulated = b''.join(chunks).decode('utf-8', errors='replace')
1416
+
1417
+ # Check if we've found an error tag
1418
+ if '{err}' in accumulated:
1419
+ # Found it! Read one more chunk for context before the error
1420
+ if position > 0 and total_read < max_bytes:
1421
+ extra_read = min(chunk_size, position, max_bytes - total_read)
1422
+ position -= extra_read
1423
+ f.seek(position)
1424
+ extra_chunk = f.read(extra_read)
1425
+ chunks.insert(0, extra_chunk)
1426
+
1427
+ return b''.join(chunks).decode('utf-8', errors='replace')
1428
+
1429
+ except UnicodeDecodeError:
1430
+ # Continue reading if we hit a decode error (might be mid-character)
1431
+ continue
1432
+
1433
+ # Read everything we've accumulated
1434
+ return b''.join(chunks).decode('utf-8', errors='replace')
1435
+
1436
+ except Exception as e:
1437
+ logger.warning(f"Error reading log backwards: {e}")
1438
+ # Fallback to regular tail read
1439
+ return self._read_log_tail(path, 200_000)
1440
+
1441
+ def _read_log_tail_smart(self, path: str, rc: int, trace: bool = False) -> str:
1442
+ """
1443
+ Smart log tail reader that adapts based on whether an error occurred.
1444
+
1445
+ - If rc == 0: Read normal tail (20KB without trace, 200KB with trace)
1446
+ - If rc != 0: Search backwards dynamically to find the error
1447
+
1448
+ Args:
1449
+ path: Path to the log file
1450
+ rc: Return code from Stata
1451
+ trace: Whether trace mode was enabled
1452
+
1453
+ Returns:
1454
+ Relevant log content
1455
+ """
1456
+ if rc != 0:
1457
+ # Error occurred - search backwards for {err} tags
1458
+ return self._read_log_backwards_until_error(path)
1459
+ else:
1460
+ # Success - just read normal tail
1461
+ tail_size = 200_000 if trace else 20_000
1462
+ return self._read_log_tail(path, tail_size)
1463
+
461
1464
  def _read_log_tail(self, path: str, max_chars: int) -> str:
462
1465
  try:
463
1466
  with open(path, "rb") as f:
464
1467
  f.seek(0, os.SEEK_END)
465
1468
  size = f.tell()
1469
+
466
1470
  if size <= 0:
467
1471
  return ""
468
1472
  read_size = min(size, max_chars)
@@ -472,6 +1476,98 @@ class StataClient:
472
1476
  except Exception:
473
1477
  return ""
474
1478
 
1479
+ def _build_combined_log(
1480
+ self,
1481
+ tail: TailBuffer,
1482
+ path: str,
1483
+ rc: int,
1484
+ trace: bool,
1485
+ exc: Optional[Exception],
1486
+ ) -> str:
1487
+ tail_text = tail.get_value()
1488
+ log_tail = self._read_log_tail_smart(path, rc, trace)
1489
+ if log_tail and len(log_tail) > len(tail_text):
1490
+ tail_text = log_tail
1491
+ return (tail_text or "") + (f"\n{exc}" if exc else "")
1492
+
1493
+ def _truncate_command_output(
1494
+ self,
1495
+ result: CommandResponse,
1496
+ max_output_lines: Optional[int],
1497
+ ) -> CommandResponse:
1498
+ if max_output_lines is None or not result.stdout:
1499
+ return result
1500
+ lines = result.stdout.splitlines()
1501
+ if len(lines) <= max_output_lines:
1502
+ return result
1503
+ truncated_lines = lines[:max_output_lines]
1504
+ truncated_lines.append(
1505
+ f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)"
1506
+ )
1507
+ truncated_stdout = "\n".join(truncated_lines)
1508
+ if hasattr(result, "model_copy"):
1509
+ return result.model_copy(update={"stdout": truncated_stdout})
1510
+ return result.copy(update={"stdout": truncated_stdout})
1511
+
1512
    def _run_plain_capture(self, code: str) -> str:
        """
        Run a Stata command while capturing output using a named SMCL log.

        This is the most reliable way to capture output (like ``return list``)
        without interfering with user logs or being affected by stdout
        redirection issues.

        The r()/e() results are preserved across the capture: they are held
        with ``_return hold`` BEFORE the capture log opens and restored INSIDE
        it, so commands such as ``return list`` still see the caller's stored
        results in the captured output.

        Returns the SMCL-stripped plain-text output of the command; empty
        string if the capture log could not be opened.
        """
        if not self._initialized:
            self.init()

        with self._exec_lock:
            # Unique hold name per call so concurrent/nested uses cannot collide.
            hold_name = f"mcp_hold_{uuid.uuid4().hex[:8]}"
            # Hold results BEFORE opening the capture log
            self.stata.run(f"capture _return hold {hold_name}", echo=False)

            try:
                # NOTE(review): log_name is yielded but unused here; the log is
                # managed entirely by the context manager.
                with self._smcl_log_capture() as (log_name, smcl_path):
                    # Restore results INSIDE the capture log so return list can see them
                    self.stata.run(f"capture _return restore {hold_name}", echo=False)
                    try:
                        # Echo on so the command itself appears in the capture.
                        self.stata.run(code, echo=True)
                    except Exception:
                        # Best effort: errors from the user command are reflected
                        # in the captured SMCL output instead of raising here.
                        pass
            except Exception:
                # Cleanup hold if log capture failed to open
                self.stata.run(f"capture _return drop {hold_name}", echo=False)
                content = ""
                smcl_path = None
            else:
                # Read SMCL content and convert to text
                content = self._read_smcl_file(smcl_path)
                # Remove the temp file
                self._safe_unlink(smcl_path)

            return self._smcl_to_text(content)
1546
+
1547
+ def _count_do_file_lines(self, path: str) -> int:
1548
+ """
1549
+ Count the number of executable lines in a .do file for progress inference.
1550
+
1551
+ Blank lines and comment-only lines (starting with * or //) are ignored.
1552
+ """
1553
+ try:
1554
+ with open(path, "r", encoding="utf-8", errors="replace") as f:
1555
+ lines = f.read().splitlines()
1556
+ except Exception:
1557
+ return 0
1558
+
1559
+ total = 0
1560
+ for line in lines:
1561
+ s = line.strip()
1562
+ if not s:
1563
+ continue
1564
+ if s.startswith("*"):
1565
+ continue
1566
+ if s.startswith("//"):
1567
+ continue
1568
+ total += 1
1569
+ return total
1570
+
475
1571
  def _smcl_to_text(self, smcl: str) -> str:
476
1572
  """Convert simple SMCL markup into plain text for LLM-friendly help."""
477
1573
  # First, keep inline directive content if present (e.g., {bf:word} -> word)
@@ -523,6 +1619,7 @@ class StataClient:
523
1619
  if not self._initialized:
524
1620
  self.init()
525
1621
 
1622
+ self._increment_command_idx()
526
1623
  # Rewrite graph names with special characters to internal aliases
527
1624
  code = self._maybe_rewrite_graph_name_in_command(code)
528
1625
 
@@ -530,17 +1627,44 @@ class StataClient:
530
1627
  error_buffer = StringIO()
531
1628
  rc = 0
532
1629
  sys_error = None
1630
+ error_envelope = None
1631
+ smcl_content = ""
1632
+ smcl_path = None
533
1633
 
534
1634
  with self._exec_lock:
535
1635
  try:
536
- from sfi import Scalar, SFIToolkit # Import SFI tools inside execution block
1636
+ from sfi import Scalar, SFIToolkit
537
1637
  with self._temp_cwd(cwd):
538
- with self._redirect_io(output_buffer, error_buffer):
539
- if trace:
540
- self.stata.run("set trace on")
541
-
542
- # 1. Run the user code
543
- self.stata.run(code, echo=echo)
1638
+ # Create SMCL log for authoritative output capture
1639
+ # Use shorter unique path to avoid Windows path issues
1640
+ smcl_path = self._create_smcl_log_path(prefix="mcp_", max_hex=16, base_dir=cwd)
1641
+ log_name = self._make_smcl_log_name()
1642
+ self._open_smcl_log(smcl_path, log_name)
1643
+
1644
+ try:
1645
+ with self._redirect_io(output_buffer, error_buffer):
1646
+ try:
1647
+ if trace:
1648
+ self.stata.run("set trace on")
1649
+
1650
+ # Run the user code
1651
+ self.stata.run(code, echo=echo)
1652
+
1653
+ # Hold results IMMEDIATELY to prevent clobbering by cleanup
1654
+ self._hold_name = f"mcp_hold_{uuid.uuid4().hex[:8]}"
1655
+ self.stata.run(f"capture _return hold {self._hold_name}", echo=False)
1656
+
1657
+ finally:
1658
+ if trace:
1659
+ try:
1660
+ self.stata.run("set trace off")
1661
+ except Exception:
1662
+ pass
1663
+ finally:
1664
+ # Close SMCL log AFTER output redirection
1665
+ self._close_smcl_log(log_name)
1666
+ # Restore and capture results while still inside the lock
1667
+ self._restore_results_from_hold("_hold_name")
544
1668
 
545
1669
  except Exception as e:
546
1670
  sys_error = str(e)
@@ -548,36 +1672,66 @@ class StataClient:
548
1672
  parsed_rc = self._parse_rc_from_text(sys_error)
549
1673
  rc = parsed_rc if parsed_rc is not None else 1
550
1674
 
1675
+ # Read SMCL content as the authoritative source
1676
+ if smcl_path:
1677
+ smcl_content = self._read_smcl_file(smcl_path)
1678
+ # Clean up SMCL file
1679
+ self._safe_unlink(smcl_path)
1680
+
551
1681
  stdout_content = output_buffer.getvalue()
552
1682
  stderr_content = error_buffer.getvalue()
553
- full_log = stdout_content + "\n" + stderr_content
554
1683
 
555
- # 2. Extract RC from log tail (primary error detection method)
556
- if rc == 1 and not sys_error: # No exception but might have error in log
557
- parsed_rc = self._parse_rc_from_text(full_log)
558
- if parsed_rc is not None:
1684
+ # If RC wasn't captured or is generic, try to parse from SMCL
1685
+ if rc in (0, 1, -1) and smcl_content:
1686
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
1687
+ if parsed_rc is not None and parsed_rc != 0:
559
1688
  rc = parsed_rc
1689
+ elif rc == -1:
1690
+ rc = 0
1691
+
1692
+ # If stdout is empty but SMCL has content AND command succeeded, use SMCL as stdout
1693
+ # This handles cases where Stata writes to log but not to redirected stdout
1694
+ # For errors, we keep stdout empty and error info goes to ErrorEnvelope
1695
+ if rc == 0 and not stdout_content and smcl_content:
1696
+ # Convert SMCL to plain text for stdout
1697
+ stdout_content = self._smcl_to_text(smcl_content)
560
1698
 
561
- error_envelope = None
562
1699
  if rc != 0:
563
1700
  if sys_error:
564
1701
  msg = sys_error
565
- snippet = sys_error # Include the exception message as snippet
1702
+ context = sys_error
566
1703
  else:
567
- # Extract error message from log tail
568
- msg, context = self._extract_error_and_context(full_log, rc)
569
-
570
- error_envelope = ErrorEnvelope(message=msg, rc=rc, context=context, snippet=full_log[-800:])
1704
+ # Extract error from SMCL (authoritative source)
1705
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
1706
+
1707
+ error_envelope = ErrorEnvelope(
1708
+ message=msg,
1709
+ rc=rc,
1710
+ context=context,
1711
+ snippet=smcl_content[-800:] if smcl_content else (stdout_content + stderr_content)[-800:],
1712
+ smcl_output=smcl_content # Include raw SMCL for debugging
1713
+ )
1714
+ stderr_content = context
571
1715
 
572
- return CommandResponse(
1716
+ resp = CommandResponse(
573
1717
  command=code,
574
1718
  rc=rc,
575
1719
  stdout=stdout_content,
576
1720
  stderr=stderr_content,
577
1721
  success=(rc == 0),
578
1722
  error=error_envelope,
1723
+ log_path=smcl_path if smcl_path else None,
1724
+ smcl_output=smcl_content,
579
1725
  )
580
1726
 
1727
+ # Capture results immediately after execution, INSIDE the lock
1728
+ try:
1729
+ self._last_results = self.get_stored_results(force_fresh=True)
1730
+ except Exception:
1731
+ self._last_results = None
1732
+
1733
+ return resp
1734
+
581
1735
  def _exec_no_capture(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
582
1736
  """Execute Stata code while leaving stdout/stderr alone."""
583
1737
  if not self._initialized:
@@ -595,10 +1749,8 @@ class StataClient:
595
1749
  ret = self.stata.run(code, echo=echo)
596
1750
  if isinstance(ret, str) and ret:
597
1751
  ret_text = ret
1752
+
598
1753
 
599
- # Robust RC check even for no-capture
600
- rc = self._read_return_code()
601
-
602
1754
  except Exception as e:
603
1755
  exc = e
604
1756
  rc = 1
@@ -631,23 +1783,115 @@ class StataClient:
631
1783
  error=error,
632
1784
  )
633
1785
 
1786
+ def _exec_no_capture_silent(self, code: str, echo: bool = False, trace: bool = False) -> CommandResponse:
1787
+ """Execute Stata code while suppressing stdout/stderr output."""
1788
+ if not self._initialized:
1789
+ self.init()
1790
+
1791
+ exc: Optional[Exception] = None
1792
+ ret_text: Optional[str] = None
1793
+ rc = 0
1794
+
1795
+ with self._exec_lock:
1796
+ try:
1797
+ from sfi import Scalar # Import SFI tools
1798
+ if trace:
1799
+ self.stata.run("set trace on")
1800
+ output_buf = StringIO()
1801
+ with redirect_stdout(output_buf), redirect_stderr(output_buf):
1802
+ ret = self.stata.run(code, echo=echo)
1803
+ if isinstance(ret, str) and ret:
1804
+ ret_text = ret
1805
+ except Exception as e:
1806
+ exc = e
1807
+ rc = 1
1808
+ finally:
1809
+ if trace:
1810
+ try:
1811
+ self.stata.run("set trace off")
1812
+ except Exception as e:
1813
+ logger.warning("Failed to turn off Stata trace mode: %s", e)
1814
+
1815
+ stdout = ""
1816
+ stderr = ""
1817
+ success = rc == 0 and exc is None
1818
+ error = None
1819
+ if not success:
1820
+ msg = str(exc) if exc else f"Stata error r({rc})"
1821
+ error = ErrorEnvelope(
1822
+ message=msg,
1823
+ rc=rc,
1824
+ command=code,
1825
+ stdout=ret_text,
1826
+ )
1827
+
1828
+ return CommandResponse(
1829
+ command=code,
1830
+ rc=rc,
1831
+ stdout=stdout,
1832
+ stderr=None,
1833
+ success=success,
1834
+ error=error,
1835
+ )
1836
+
1837
+ def exec_lightweight(self, code: str) -> CommandResponse:
1838
+ """
1839
+ Executes a command using simple stdout redirection (no SMCL logs).
1840
+ Much faster on Windows as it avoids FS operations.
1841
+ LIMITED: Does not support error envelopes or complex return code parsing.
1842
+ """
1843
+ if not self._initialized:
1844
+ self.init()
1845
+
1846
+ code = self._maybe_rewrite_graph_name_in_command(code)
1847
+
1848
+ output_buffer = StringIO()
1849
+ error_buffer = StringIO()
1850
+ rc = 0
1851
+ exc = None
1852
+
1853
+ with self._exec_lock:
1854
+ with self._redirect_io(output_buffer, error_buffer):
1855
+ try:
1856
+ self.stata.run(code, echo=False)
1857
+ except Exception as e:
1858
+ exc = e
1859
+ rc = 1
1860
+
1861
+ stdout = output_buffer.getvalue()
1862
+ stderr = error_buffer.getvalue()
1863
+
1864
+ return CommandResponse(
1865
+ command=code,
1866
+ rc=rc,
1867
+ stdout=stdout,
1868
+ stderr=stderr if not exc else str(exc),
1869
+ success=(rc == 0),
1870
+ error=None
1871
+ )
1872
+
634
1873
  async def run_command_streaming(
635
- self,
636
- code: str,
637
- *,
638
- notify_log: Callable[[str], Awaitable[None]],
639
- notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
640
- echo: bool = True,
641
- trace: bool = False,
642
- max_output_lines: Optional[int] = None,
643
- cwd: Optional[str] = None,
644
- auto_cache_graphs: bool = False,
645
- on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
646
- ) -> CommandResponse:
1874
+ self,
1875
+ code: str,
1876
+ *,
1877
+ notify_log: Callable[[str], Awaitable[None]],
1878
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
1879
+ echo: bool = True,
1880
+ trace: bool = False,
1881
+ max_output_lines: Optional[int] = None,
1882
+ cwd: Optional[str] = None,
1883
+ auto_cache_graphs: bool = False,
1884
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
1885
+ emit_graph_ready: bool = False,
1886
+ graph_ready_task_id: Optional[str] = None,
1887
+ graph_ready_format: str = "svg",
1888
+ ) -> CommandResponse:
647
1889
  if not self._initialized:
648
1890
  self.init()
649
1891
 
650
1892
  code = self._maybe_rewrite_graph_name_in_command(code)
1893
+ auto_cache_graphs = auto_cache_graphs or emit_graph_ready
1894
+ total_lines = 0 # Commands (not do-files) do not have line-based progress
651
1895
 
652
1896
  if cwd is not None and not os.path.isdir(cwd):
653
1897
  return CommandResponse(
@@ -665,136 +1909,171 @@ class StataClient:
665
1909
 
666
1910
  start_time = time.time()
667
1911
  exc: Optional[Exception] = None
1912
+ smcl_content = ""
1913
+ smcl_path = None
668
1914
 
669
1915
  # Setup streaming graph cache if enabled
670
- graph_cache = None
671
- if auto_cache_graphs:
672
- graph_cache = StreamingGraphCache(self, auto_cache=True)
673
-
674
- graph_cache_callback = self._create_graph_cache_callback(on_graph_cached, notify_log)
675
-
676
- graph_cache.add_cache_callback(graph_cache_callback)
1916
+ graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
677
1917
 
678
- log_file = tempfile.NamedTemporaryFile(
679
- prefix="mcp_stata_",
680
- suffix=".log",
681
- delete=False,
682
- mode="w",
683
- encoding="utf-8",
684
- errors="replace",
685
- buffering=1,
686
- )
687
- log_path = log_file.name
688
- tail = TailBuffer(max_chars=200000 if trace else 20000)
689
- tee = FileTeeIO(log_file, tail)
1918
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
1919
+
1920
+ # Create SMCL log path for authoritative output capture
1921
+ smcl_path = self._create_smcl_log_path(base_dir=cwd)
1922
+ smcl_log_name = self._make_smcl_log_name()
690
1923
 
691
1924
  # Inform the MCP client immediately where to read/tail the output.
692
- await notify_log(json.dumps({"event": "log_path", "path": log_path}))
1925
+ await notify_log(json.dumps({"event": "log_path", "path": smcl_path}))
693
1926
 
694
1927
  rc = -1
1928
+ path_for_stata = code.replace("\\", "/")
1929
+ command = f'{path_for_stata}'
695
1930
 
696
- def _run_blocking() -> None:
697
- nonlocal rc, exc
698
- with self._exec_lock:
699
- self._is_executing = True
700
- try:
701
- from sfi import Scalar, SFIToolkit # Import SFI tools
702
- with self._temp_cwd(cwd):
703
- with self._redirect_io_streaming(tee, tee):
704
- try:
705
- if trace:
706
- self.stata.run("set trace on")
707
- ret = self.stata.run(code, echo=echo)
708
- # Some PyStata builds return output as a string rather than printing.
709
- if isinstance(ret, str) and ret:
710
- try:
711
- tee.write(ret)
712
- except Exception:
713
- pass
1931
+ graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
1932
+
1933
+ # Increment AFTER capture so detected modifications are based on state BEFORE this command
1934
+ self._increment_command_idx()
1935
+
1936
+ graph_poll_state = [0.0]
1937
+
1938
+ async def on_chunk_for_graphs(_chunk: str) -> None:
1939
+ # Background the graph check so we don't block SMCL streaming or task completion
1940
+ asyncio.create_task(
1941
+ self._maybe_cache_graphs_on_chunk(
1942
+ graph_cache=graph_cache,
1943
+ emit_graph_ready=emit_graph_ready,
1944
+ notify_log=notify_log,
1945
+ graph_ready_task_id=graph_ready_task_id,
1946
+ graph_ready_format=graph_ready_format,
1947
+ graph_ready_initial=graph_ready_initial,
1948
+ last_check=graph_poll_state,
1949
+ )
1950
+ )
714
1951
 
715
- # ROBUST DETECTION & OUTPUT
716
- rc = self._read_return_code()
1952
+ done = anyio.Event()
717
1953
 
718
- except Exception as e:
719
- exc = e
720
- if rc == 0: rc = 1
721
- finally:
722
- if trace:
723
- try:
724
- self.stata.run("set trace off")
725
- except Exception:
726
- pass
1954
+ try:
1955
+ async with anyio.create_task_group() as tg:
1956
+ async def stream_smcl() -> None:
1957
+ try:
1958
+ await self._stream_smcl_log(
1959
+ smcl_path=smcl_path,
1960
+ notify_log=notify_log,
1961
+ done=done,
1962
+ on_chunk=on_chunk_for_graphs if graph_cache else None,
1963
+ )
1964
+ except Exception as exc:
1965
+ logger.debug("SMCL streaming failed: %s", exc)
1966
+
1967
+ tg.start_soon(stream_smcl)
1968
+
1969
+ if notify_progress is not None:
1970
+ if total_lines > 0:
1971
+ await notify_progress(0, float(total_lines), f"Executing command: 0/{total_lines}")
1972
+ else:
1973
+ await notify_progress(0, None, "Running command")
1974
+
1975
+ try:
1976
+ run_blocking = lambda: self._run_streaming_blocking(
1977
+ command=command,
1978
+ tee=tee,
1979
+ cwd=cwd,
1980
+ trace=trace,
1981
+ echo=echo,
1982
+ smcl_path=smcl_path,
1983
+ smcl_log_name=smcl_log_name,
1984
+ hold_attr="_hold_name_stream",
1985
+ require_smcl_log=True,
1986
+ )
1987
+ try:
1988
+ rc, exc = await anyio.to_thread.run_sync(
1989
+ run_blocking,
1990
+ abandon_on_cancel=True,
1991
+ )
1992
+ except TypeError:
1993
+ rc, exc = await anyio.to_thread.run_sync(run_blocking)
1994
+ except Exception as e:
1995
+ exc = e
1996
+ if rc in (-1, 0):
1997
+ rc = 1
1998
+ except get_cancelled_exc_class():
1999
+ self._request_break_in()
2000
+ await self._wait_for_stata_stop()
2001
+ raise
727
2002
  finally:
728
- self._is_executing = False
2003
+ done.set()
2004
+ tee.close()
2005
+ except* Exception as exc_group:
2006
+ logger.debug("SMCL streaming task group failed: %s", exc_group)
729
2007
 
730
- try:
731
- if notify_progress is not None:
732
- await notify_progress(0, None, "Running Stata command")
733
-
734
- await anyio.to_thread.run_sync(_run_blocking, abandon_on_cancel=True)
735
- except get_cancelled_exc_class():
736
- # Best-effort cancellation: signal Stata to break, wait briefly, then propagate.
737
- self._request_break_in()
738
- await self._wait_for_stata_stop()
739
- raise
740
- finally:
741
- tee.close()
2008
+ # Read SMCL content as the authoritative source
2009
+ smcl_content = self._read_smcl_file(smcl_path)
742
2010
 
743
- # Cache detected graphs after command completes
744
2011
  if graph_cache:
745
- try:
746
- # Use the enhanced pystata-integrated caching method
747
- if hasattr(graph_cache, 'cache_detected_graphs_with_pystata'):
748
- cached_graphs = await graph_cache.cache_detected_graphs_with_pystata()
749
- else:
750
- cached_graphs = await graph_cache.cache_detected_graphs()
751
-
752
- if cached_graphs and notify_progress:
753
- await notify_progress(1, 1, f"Command completed. Cached {len(cached_graphs)} graphs: {', '.join(cached_graphs)}")
754
- except Exception as e:
755
- logger.warning(f"Failed to cache detected graphs: {e}")
2012
+ asyncio.create_task(
2013
+ self._cache_new_graphs(
2014
+ graph_cache,
2015
+ notify_progress=notify_progress,
2016
+ total_lines=total_lines,
2017
+ completed_label="Command",
2018
+ )
2019
+ )
756
2020
 
757
- tail_text = tail.get_value()
758
- log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
759
- if log_tail and len(log_tail) > len(tail_text):
760
- tail_text = log_tail
761
- combined = (tail_text or "") + (f"\n{exc}" if exc else "")
2021
+ combined = self._build_combined_log(tail, smcl_path, rc, trace, exc)
762
2022
 
2023
+ # Use SMCL content as primary source for RC detection
2024
+ if not exc or rc in (1, -1):
2025
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
2026
+ if parsed_rc is not None and parsed_rc != 0:
2027
+ rc = parsed_rc
2028
+ elif rc in (-1, 0, 1): # Also check text if rc is generic 1 or unset
2029
+ parsed_rc_text = self._parse_rc_from_text(combined)
2030
+ if parsed_rc_text is not None:
2031
+ rc = parsed_rc_text
2032
+ elif rc == -1:
2033
+ rc = 0 # Default to success if no error trace found
2034
+
763
2035
  success = (rc == 0 and exc is None)
2036
+ stderr_final = None
764
2037
  error = None
765
2038
 
766
2039
  if not success:
767
- # Use robust extractor
768
- msg, context = self._extract_error_and_context(combined, rc)
769
-
2040
+ # Use SMCL as authoritative source for error extraction
2041
+ if smcl_content:
2042
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
2043
+ else:
2044
+ # Fallback to combined log
2045
+ msg, context = self._extract_error_and_context(combined, rc)
2046
+
770
2047
  error = ErrorEnvelope(
771
2048
  message=msg,
772
2049
  context=context,
773
2050
  rc=rc,
774
- command=code,
2051
+ command=command,
775
2052
  log_path=log_path,
776
- snippet=combined[-800:] # Keep snippet for backward compat
2053
+ snippet=smcl_content[-800:] if smcl_content else combined[-800:],
2054
+ smcl_output=smcl_content,
777
2055
  )
2056
+ stderr_final = context
778
2057
 
779
2058
  duration = time.time() - start_time
780
- code_preview = code.replace("\n", "\\n")
781
2059
  logger.info(
782
2060
  "stata.run(stream) rc=%s success=%s trace=%s duration_ms=%.2f code_preview=%s",
783
2061
  rc,
784
2062
  success,
785
2063
  trace,
786
2064
  duration * 1000,
787
- code_preview[:120],
2065
+ code.replace("\n", "\\n")[:120],
788
2066
  )
789
2067
 
790
2068
  result = CommandResponse(
791
2069
  command=code,
792
2070
  rc=rc,
793
2071
  stdout="",
794
- stderr=None,
2072
+ stderr=stderr_final,
795
2073
  log_path=log_path,
796
2074
  success=success,
797
2075
  error=error,
2076
+ smcl_output=smcl_content,
798
2077
  )
799
2078
 
800
2079
  if notify_progress is not None:
@@ -802,69 +2081,25 @@ class StataClient:
802
2081
 
803
2082
  return result
804
2083
 
805
- def _count_do_file_lines(self, path: str) -> int:
806
- try:
807
- with open(path, "r", encoding="utf-8", errors="replace") as f:
808
- lines = f.read().splitlines()
809
- except Exception:
810
- return 0
811
-
812
- total = 0
813
- for line in lines:
814
- s = line.strip()
815
- if not s:
816
- continue
817
- if s.startswith("*"):
818
- continue
819
- if s.startswith("//"):
820
- continue
821
- total += 1
822
- return total
823
-
824
2084
  async def run_do_file_streaming(
825
- self,
826
- path: str,
827
- *,
828
- notify_log: Callable[[str], Awaitable[None]],
829
- notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
830
- echo: bool = True,
831
- trace: bool = False,
832
- max_output_lines: Optional[int] = None,
833
- cwd: Optional[str] = None,
834
- auto_cache_graphs: bool = False,
835
- on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
836
- ) -> CommandResponse:
837
- if cwd is not None and not os.path.isdir(cwd):
838
- return CommandResponse(
839
- command=f'do "{path}"',
840
- rc=601,
841
- stdout="",
842
- stderr=None,
843
- success=False,
844
- error=ErrorEnvelope(
845
- message=f"cwd not found: {cwd}",
846
- rc=601,
847
- command=path,
848
- ),
849
- )
850
-
851
- effective_path = path
852
- if cwd is not None and not os.path.isabs(path):
853
- effective_path = os.path.abspath(os.path.join(cwd, path))
854
-
855
- if not os.path.exists(effective_path):
856
- return CommandResponse(
857
- command=f'do "{effective_path}"',
858
- rc=601,
859
- stdout="",
860
- stderr=None,
861
- success=False,
862
- error=ErrorEnvelope(
863
- message=f"Do-file not found: {effective_path}",
864
- rc=601,
865
- command=effective_path,
866
- ),
867
- )
2085
+ self,
2086
+ path: str,
2087
+ *,
2088
+ notify_log: Callable[[str], Awaitable[None]],
2089
+ notify_progress: Optional[Callable[[float, Optional[float], Optional[str]], Awaitable[None]]] = None,
2090
+ echo: bool = True,
2091
+ trace: bool = False,
2092
+ max_output_lines: Optional[int] = None,
2093
+ cwd: Optional[str] = None,
2094
+ auto_cache_graphs: bool = False,
2095
+ on_graph_cached: Optional[Callable[[str, bool], Awaitable[None]]] = None,
2096
+ emit_graph_ready: bool = False,
2097
+ graph_ready_task_id: Optional[str] = None,
2098
+ graph_ready_format: str = "svg",
2099
+ ) -> CommandResponse:
2100
+ effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
2101
+ if error_response is not None:
2102
+ return error_response
868
2103
 
869
2104
  total_lines = self._count_do_file_lines(effective_path)
870
2105
  executed_lines = 0
@@ -893,174 +2128,145 @@ class StataClient:
893
2128
  if not self._initialized:
894
2129
  self.init()
895
2130
 
2131
+ auto_cache_graphs = auto_cache_graphs or emit_graph_ready
2132
+
896
2133
  start_time = time.time()
897
2134
  exc: Optional[Exception] = None
2135
+ smcl_content = ""
2136
+ smcl_path = None
898
2137
 
899
- # Setup streaming graph cache if enabled
900
- graph_cache = None
901
- if auto_cache_graphs:
902
- graph_cache = StreamingGraphCache(self, auto_cache=True)
903
-
904
- graph_cache_callback = self._create_graph_cache_callback(on_graph_cached, notify_log)
905
-
906
- graph_cache.add_cache_callback(graph_cache_callback)
2138
+ graph_cache = self._init_streaming_graph_cache(auto_cache_graphs, on_graph_cached, notify_log)
2139
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
907
2140
 
908
- log_file = tempfile.NamedTemporaryFile(
909
- prefix="mcp_stata_",
910
- suffix=".log",
911
- delete=False,
912
- mode="w",
913
- encoding="utf-8",
914
- errors="replace",
915
- buffering=1,
916
- )
917
- log_path = log_file.name
918
- tail = TailBuffer(max_chars=200000 if trace else 20000)
919
- tee = FileTeeIO(log_file, tail)
2141
+ base_dir = cwd or os.path.dirname(effective_path)
2142
+ smcl_path = self._create_smcl_log_path(base_dir=base_dir)
2143
+ smcl_log_name = self._make_smcl_log_name()
920
2144
 
921
2145
  # Inform the MCP client immediately where to read/tail the output.
922
- await notify_log(json.dumps({"event": "log_path", "path": log_path}))
2146
+ await notify_log(json.dumps({"event": "log_path", "path": smcl_path}))
923
2147
 
924
2148
  rc = -1
925
- path_for_stata = effective_path.replace("\\", "/")
926
- command = f'do "{path_for_stata}"'
2149
+ graph_ready_initial = self._capture_graph_state(graph_cache, emit_graph_ready)
2150
+
2151
+ # Increment AFTER capture
2152
+ self._increment_command_idx()
2153
+
2154
+ graph_poll_state = [0.0]
2155
+
2156
+ async def on_chunk_for_graphs(_chunk: str) -> None:
2157
+ # Background the graph check so we don't block SMCL streaming or task completion
2158
+ asyncio.create_task(
2159
+ self._maybe_cache_graphs_on_chunk(
2160
+ graph_cache=graph_cache,
2161
+ emit_graph_ready=emit_graph_ready,
2162
+ notify_log=notify_log,
2163
+ graph_ready_task_id=graph_ready_task_id,
2164
+ graph_ready_format=graph_ready_format,
2165
+ graph_ready_initial=graph_ready_initial,
2166
+ last_check=graph_poll_state,
2167
+ )
2168
+ )
927
2169
 
928
- # Capture initial graph state BEFORE execution starts
2170
+ on_chunk_callback = on_chunk_for_progress
929
2171
  if graph_cache:
930
- try:
931
- graph_cache._initial_graphs = set(self.list_graphs())
932
- logger.debug(f"Initial graph state captured: {graph_cache._initial_graphs}")
933
- except Exception as e:
934
- logger.debug(f"Failed to capture initial graph state: {e}")
935
- graph_cache._initial_graphs = set()
936
-
937
- def _run_blocking() -> None:
938
- nonlocal rc, exc
939
- with self._exec_lock:
940
- # Set execution flag to prevent recursive Stata calls
941
- self._is_executing = True
942
- try:
943
- from sfi import Scalar, SFIToolkit # Import SFI tools
944
- with self._temp_cwd(cwd):
945
- with self._redirect_io_streaming(tee, tee):
946
- try:
947
- if trace:
948
- self.stata.run("set trace on")
949
- ret = self.stata.run(command, echo=echo)
950
- # Some PyStata builds return output as a string rather than printing.
951
- if isinstance(ret, str) and ret:
952
- try:
953
- tee.write(ret)
954
- except Exception:
955
- pass
956
-
957
- # ROBUST DETECTION & OUTPUT
958
- rc = self._read_return_code()
959
-
960
- except Exception as e:
961
- exc = e
962
- if rc == 0: rc = 1
963
- finally:
964
- if trace:
965
- try: self.stata.run("set trace off")
966
- except: pass
967
- finally:
968
- # Clear execution flag
969
- self._is_executing = False
2172
+ async def on_chunk_callback(chunk: str) -> None:
2173
+ await on_chunk_for_progress(chunk)
2174
+ await on_chunk_for_graphs(chunk)
970
2175
 
971
2176
  done = anyio.Event()
972
2177
 
973
- async def _monitor_progress_from_log() -> None:
974
- if notify_progress is None or total_lines <= 0:
975
- return
976
- last_pos = 0
977
- try:
978
- with open(log_path, "r", encoding="utf-8", errors="replace") as f:
979
- while not done.is_set():
980
- f.seek(last_pos)
981
- chunk = f.read()
982
- if chunk:
983
- last_pos = f.tell()
984
- await on_chunk_for_progress(chunk)
985
- await anyio.sleep(0.05)
986
-
987
- f.seek(last_pos)
988
- chunk = f.read()
989
- if chunk:
990
- await on_chunk_for_progress(chunk)
991
- except Exception:
992
- return
993
-
994
- async with anyio.create_task_group() as tg:
995
- tg.start_soon(_monitor_progress_from_log)
996
-
997
- if notify_progress is not None:
998
- if total_lines > 0:
999
- await notify_progress(0, float(total_lines), f"Executing do-file: 0/{total_lines}")
1000
- else:
1001
- await notify_progress(0, None, "Running do-file")
1002
-
1003
- try:
1004
- await anyio.to_thread.run_sync(_run_blocking, abandon_on_cancel=True)
1005
- except get_cancelled_exc_class():
1006
- self._request_break_in()
1007
- await self._wait_for_stata_stop()
1008
- raise
1009
- finally:
1010
- done.set()
1011
- tee.close()
2178
+ try:
2179
+ async with anyio.create_task_group() as tg:
2180
+ async def stream_smcl() -> None:
2181
+ try:
2182
+ await self._stream_smcl_log(
2183
+ smcl_path=smcl_path,
2184
+ notify_log=notify_log,
2185
+ done=done,
2186
+ on_chunk=on_chunk_callback,
2187
+ )
2188
+ except Exception as exc:
2189
+ logger.debug("SMCL streaming failed: %s", exc)
1012
2190
 
1013
- # Robust post-execution graph detection and caching
1014
- if graph_cache and graph_cache.auto_cache:
1015
- try:
1016
- # [Existing graph cache logic kept identical]
1017
- cached_graphs = []
1018
- initial_graphs = getattr(graph_cache, '_initial_graphs', set())
1019
- current_graphs = set(self.list_graphs())
1020
- new_graphs = current_graphs - initial_graphs - graph_cache._cached_graphs
2191
+ tg.start_soon(stream_smcl)
1021
2192
 
1022
- if new_graphs:
1023
- logger.info(f"Detected {len(new_graphs)} new graph(s): {sorted(new_graphs)}")
2193
+ if notify_progress is not None:
2194
+ if total_lines > 0:
2195
+ await notify_progress(0, float(total_lines), f"Executing do-file: 0/{total_lines}")
2196
+ else:
2197
+ await notify_progress(0, None, "Running do-file")
1024
2198
 
1025
- for graph_name in new_graphs:
2199
+ try:
2200
+ run_blocking = lambda: self._run_streaming_blocking(
2201
+ command=command,
2202
+ tee=tee,
2203
+ cwd=cwd,
2204
+ trace=trace,
2205
+ echo=echo,
2206
+ smcl_path=smcl_path,
2207
+ smcl_log_name=smcl_log_name,
2208
+ hold_attr="_hold_name_do",
2209
+ require_smcl_log=True,
2210
+ )
1026
2211
  try:
1027
- cache_result = await anyio.to_thread.run_sync(
1028
- self.cache_graph_on_creation,
1029
- graph_name
2212
+ rc, exc = await anyio.to_thread.run_sync(
2213
+ run_blocking,
2214
+ abandon_on_cancel=True,
1030
2215
  )
1031
- if cache_result:
1032
- cached_graphs.append(graph_name)
1033
- graph_cache._cached_graphs.add(graph_name)
1034
-
1035
- for callback in graph_cache._cache_callbacks:
1036
- try:
1037
- await anyio.to_thread.run_sync(callback, graph_name, cache_result)
1038
- except Exception: pass
1039
- except Exception as e:
1040
- logger.error(f"Error caching graph {graph_name}: {e}")
1041
-
1042
- # Notify progress if graphs were cached
1043
- if cached_graphs and notify_progress:
1044
- await notify_progress(
1045
- float(total_lines) if total_lines > 0 else 1,
1046
- float(total_lines) if total_lines > 0 else 1,
1047
- f"Do-file completed. Cached {len(cached_graphs)} graph(s): {', '.join(cached_graphs)}"
1048
- )
1049
- except Exception as e:
1050
- logger.error(f"Post-execution graph detection failed: {e}")
2216
+ except TypeError:
2217
+ rc, exc = await anyio.to_thread.run_sync(run_blocking)
2218
+ except Exception as e:
2219
+ exc = e
2220
+ if rc in (-1, 0):
2221
+ rc = 1
2222
+ except get_cancelled_exc_class():
2223
+ self._request_break_in()
2224
+ await self._wait_for_stata_stop()
2225
+ raise
2226
+ finally:
2227
+ done.set()
2228
+ tee.close()
2229
+ except* Exception as exc_group:
2230
+ logger.debug("SMCL streaming task group failed: %s", exc_group)
1051
2231
 
1052
- tail_text = tail.get_value()
1053
- log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
1054
- if log_tail and len(log_tail) > len(tail_text):
1055
- tail_text = log_tail
1056
- combined = (tail_text or "") + (f"\n{exc}" if exc else "")
2232
+ # Read SMCL content as the authoritative source
2233
+ smcl_content = self._read_smcl_file(smcl_path)
2234
+
2235
+ if graph_cache:
2236
+ asyncio.create_task(
2237
+ self._cache_new_graphs(
2238
+ graph_cache,
2239
+ notify_progress=notify_progress,
2240
+ total_lines=total_lines,
2241
+ completed_label="Do-file",
2242
+ )
2243
+ )
2244
+
2245
+ combined = self._build_combined_log(tail, log_path, rc, trace, exc)
1057
2246
 
2247
+ # Use SMCL content as primary source for RC detection
2248
+ if not exc or rc in (1, -1):
2249
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
2250
+ if parsed_rc is not None and parsed_rc != 0:
2251
+ rc = parsed_rc
2252
+ elif rc in (-1, 0, 1):
2253
+ parsed_rc_text = self._parse_rc_from_text(combined)
2254
+ if parsed_rc_text is not None:
2255
+ rc = parsed_rc_text
2256
+ elif rc == -1:
2257
+ rc = 0 # Default to success if no error found
2258
+
1058
2259
  success = (rc == 0 and exc is None)
2260
+ stderr_final = None
1059
2261
  error = None
1060
2262
 
1061
2263
  if not success:
1062
- # Robust extraction
1063
- msg, context = self._extract_error_and_context(combined, rc)
2264
+ # Use SMCL as authoritative source for error extraction
2265
+ if smcl_content:
2266
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
2267
+ else:
2268
+ # Fallback to combined log
2269
+ msg, context = self._extract_error_and_context(combined, rc)
1064
2270
 
1065
2271
  error = ErrorEnvelope(
1066
2272
  message=msg,
@@ -1068,8 +2274,10 @@ class StataClient:
1068
2274
  rc=rc,
1069
2275
  command=command,
1070
2276
  log_path=log_path,
1071
- snippet=combined[-800:]
2277
+ snippet=smcl_content[-800:] if smcl_content else combined[-800:],
2278
+ smcl_output=smcl_content,
1072
2279
  )
2280
+ stderr_final = context
1073
2281
 
1074
2282
  duration = time.time() - start_time
1075
2283
  logger.info(
@@ -1085,10 +2293,11 @@ class StataClient:
1085
2293
  command=command,
1086
2294
  rc=rc,
1087
2295
  stdout="",
1088
- stderr=None,
2296
+ stderr=stderr_final,
1089
2297
  log_path=log_path,
1090
2298
  success=success,
1091
2299
  error=error,
2300
+ smcl_output=smcl_content,
1092
2301
  )
1093
2302
 
1094
2303
  if notify_progress is not None:
@@ -1110,22 +2319,7 @@ class StataClient:
1110
2319
  """
1111
2320
  result = self._exec_with_capture(code, echo=echo, trace=trace, cwd=cwd)
1112
2321
 
1113
- # Truncate stdout if requested
1114
- if max_output_lines is not None and result.stdout:
1115
- lines = result.stdout.splitlines()
1116
- if len(lines) > max_output_lines:
1117
- truncated_lines = lines[:max_output_lines]
1118
- truncated_lines.append(f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)")
1119
- result = CommandResponse(
1120
- command=result.command,
1121
- rc=result.rc,
1122
- stdout="\n".join(truncated_lines),
1123
- stderr=result.stderr,
1124
- success=result.success,
1125
- error=result.error,
1126
- )
1127
-
1128
- return result
2322
+ return self._truncate_command_output(result, max_output_lines)
1129
2323
 
1130
2324
  def get_data(self, start: int = 0, count: int = 50) -> List[Dict[str, Any]]:
1131
2325
  """Returns valid JSON-serializable data."""
@@ -1135,17 +2329,18 @@ class StataClient:
1135
2329
  if count > self.MAX_DATA_ROWS:
1136
2330
  count = self.MAX_DATA_ROWS
1137
2331
 
1138
- try:
1139
- # Use pystata integration to retrieve data
1140
- df = self.stata.pdataframe_from_data()
2332
+ with self._exec_lock:
2333
+ try:
2334
+ # Use pystata integration to retrieve data
2335
+ df = self.stata.pdataframe_from_data()
1141
2336
 
1142
- # Slice
1143
- sliced = df.iloc[start : start + count]
2337
+ # Slice
2338
+ sliced = df.iloc[start : start + count]
1144
2339
 
1145
- # Convert to dict
1146
- return sliced.to_dict(orient="records")
1147
- except Exception as e:
1148
- return [{"error": f"Failed to retrieve data: {e}"}]
2340
+ # Convert to dict
2341
+ return sliced.to_dict(orient="records")
2342
+ except Exception as e:
2343
+ return [{"error": f"Failed to retrieve data: {e}"}]
1149
2344
 
1150
2345
  def list_variables(self) -> List[Dict[str, str]]:
1151
2346
  """Returns list of variables with labels."""
@@ -1155,17 +2350,18 @@ class StataClient:
1155
2350
  # We can use sfi to be efficient
1156
2351
  from sfi import Data # type: ignore[import-not-found]
1157
2352
  vars_info = []
1158
- for i in range(Data.getVarCount()):
1159
- var_index = i # 0-based
1160
- name = Data.getVarName(var_index)
1161
- label = Data.getVarLabel(var_index)
1162
- type_str = Data.getVarType(var_index) # Returns int
1163
-
1164
- vars_info.append({
1165
- "name": name,
1166
- "label": label,
1167
- "type": str(type_str),
1168
- })
2353
+ with self._exec_lock:
2354
+ for i in range(Data.getVarCount()):
2355
+ var_index = i # 0-based
2356
+ name = Data.getVarName(var_index)
2357
+ label = Data.getVarLabel(var_index)
2358
+ type_str = Data.getVarType(var_index) # Returns int
2359
+
2360
+ vars_info.append({
2361
+ "name": name,
2362
+ "label": label,
2363
+ "type": str(type_str),
2364
+ })
1169
2365
  return vars_info
1170
2366
 
1171
2367
  def get_dataset_state(self) -> Dict[str, Any]:
@@ -1175,24 +2371,28 @@ class StataClient:
1175
2371
 
1176
2372
  from sfi import Data, Macro # type: ignore[import-not-found]
1177
2373
 
1178
- n = int(Data.getObsTotal())
1179
- k = int(Data.getVarCount())
2374
+ with self._exec_lock:
2375
+ n = int(Data.getObsTotal())
2376
+ k = int(Data.getVarCount())
1180
2377
 
1181
- frame = "default"
1182
- sortlist = ""
1183
- changed = False
1184
- try:
1185
- frame = str(Macro.getCValue("frame") or "default")
1186
- except Exception:
1187
2378
  frame = "default"
1188
- try:
1189
- sortlist = str(Macro.getCValue("sortlist") or "")
1190
- except Exception:
1191
2379
  sortlist = ""
1192
- try:
1193
- changed = bool(int(float(Macro.getCValue("changed") or "0")))
1194
- except Exception:
1195
2380
  changed = False
2381
+ try:
2382
+ frame = str(Macro.getGlobal("frame") or "default")
2383
+ except Exception:
2384
+ logger.debug("Failed to get 'frame' macro", exc_info=True)
2385
+ frame = "default"
2386
+ try:
2387
+ sortlist = str(Macro.getGlobal("sortlist") or "")
2388
+ except Exception:
2389
+ logger.debug("Failed to get 'sortlist' macro", exc_info=True)
2390
+ sortlist = ""
2391
+ try:
2392
+ changed = bool(int(float(Macro.getGlobal("changed") or "0")))
2393
+ except Exception:
2394
+ logger.debug("Failed to get 'changed' macro", exc_info=True)
2395
+ changed = False
1196
2396
 
1197
2397
  return {"frame": frame, "n": n, "k": k, "sortlist": sortlist, "changed": changed}
1198
2398
 
@@ -1206,11 +2406,12 @@ class StataClient:
1206
2406
  from sfi import Data # type: ignore[import-not-found]
1207
2407
 
1208
2408
  out: Dict[str, int] = {}
1209
- for i in range(int(Data.getVarCount())):
1210
- try:
1211
- out[str(Data.getVarName(i))] = i
1212
- except Exception:
1213
- continue
2409
+ with self._exec_lock:
2410
+ for i in range(int(Data.getVarCount())):
2411
+ try:
2412
+ out[str(Data.getVarName(i))] = i
2413
+ except Exception:
2414
+ continue
1214
2415
  return out
1215
2416
 
1216
2417
  def list_variables_rich(self) -> List[Dict[str, Any]]:
@@ -1340,6 +2541,96 @@ class StataClient:
1340
2541
  "truncated_cells": truncated_cells,
1341
2542
  }
1342
2543
 
2544
+ def get_arrow_stream(
2545
+ self,
2546
+ *,
2547
+ offset: int,
2548
+ limit: int,
2549
+ vars: List[str],
2550
+ include_obs_no: bool,
2551
+ obs_indices: Optional[List[int]] = None,
2552
+ ) -> bytes:
2553
+ """
2554
+ Returns an Apache Arrow IPC stream (as bytes) for the requested data page.
2555
+ Uses Polars if available (faster), falls back to Pandas.
2556
+ """
2557
+ if not self._initialized:
2558
+ self.init()
2559
+
2560
+ import pyarrow as pa
2561
+ from sfi import Data # type: ignore[import-not-found]
2562
+
2563
+ use_polars = _get_polars_available()
2564
+ if use_polars:
2565
+ import polars as pl
2566
+ else:
2567
+ import pandas as pd
2568
+
2569
+ state = self.get_dataset_state()
2570
+ n = int(state.get("n", 0) or 0)
2571
+ k = int(state.get("k", 0) or 0)
2572
+ if k == 0 and n == 0:
2573
+ raise RuntimeError("No data in memory")
2574
+
2575
+ var_map = self._get_var_index_map()
2576
+ for v in vars:
2577
+ if v not in var_map:
2578
+ raise ValueError(f"Invalid variable: {v}")
2579
+
2580
+ # Determine observations to fetch
2581
+ if obs_indices is None:
2582
+ start = offset
2583
+ end = min(offset + limit, n)
2584
+ obs_list = list(range(start, end)) if start < n else []
2585
+ else:
2586
+ start = offset
2587
+ end = min(offset + limit, len(obs_indices))
2588
+ obs_list = obs_indices[start:end]
2589
+
2590
+ try:
2591
+ if not obs_list:
2592
+ # Empty schema-only table
2593
+ if use_polars:
2594
+ schema_cols = {}
2595
+ if include_obs_no:
2596
+ schema_cols["_n"] = pl.Int64
2597
+ for v in vars:
2598
+ schema_cols[v] = pl.Utf8
2599
+ table = pl.DataFrame(schema=schema_cols).to_arrow()
2600
+ else:
2601
+ columns = {}
2602
+ if include_obs_no:
2603
+ columns["_n"] = pa.array([], type=pa.int64())
2604
+ for v in vars:
2605
+ columns[v] = pa.array([], type=pa.string())
2606
+ table = pa.table(columns)
2607
+ else:
2608
+ # Fetch all data in one C-call
2609
+ raw_data = Data.get(var=vars, obs=obs_list, valuelabel=False)
2610
+
2611
+ if use_polars:
2612
+ df = pl.DataFrame(raw_data, schema=vars, orient="row")
2613
+ if include_obs_no:
2614
+ obs_nums = [i + 1 for i in obs_list]
2615
+ df = df.with_columns(pl.Series("_n", obs_nums, dtype=pl.Int64))
2616
+ df = df.select(["_n"] + vars)
2617
+ table = df.to_arrow()
2618
+ else:
2619
+ df = pd.DataFrame(raw_data, columns=vars)
2620
+ if include_obs_no:
2621
+ df.insert(0, "_n", [i + 1 for i in obs_list])
2622
+ table = pa.Table.from_pandas(df, preserve_index=False)
2623
+
2624
+ # Serialize to IPC Stream
2625
+ sink = pa.BufferOutputStream()
2626
+ with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
2627
+ writer.write_table(table)
2628
+
2629
+ return sink.getvalue().to_pybytes()
2630
+
2631
+ except Exception as e:
2632
+ raise RuntimeError(f"Failed to generate Arrow stream: {e}")
2633
+
1343
2634
  _FILTER_IDENT = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\b")
1344
2635
 
1345
2636
  def _extract_filter_vars(self, filter_expr: str) -> List[str]:
@@ -1527,39 +2818,46 @@ class StataClient:
1527
2818
  return self._list_graphs_cache
1528
2819
 
1529
2820
  # Cache miss or expired, fetch fresh data
1530
- try:
1531
- # 'graph dir' returns list in r(list)
1532
- # We need to ensure we run it quietly so we don't spam.
1533
- self.stata.run("quietly graph dir, memory")
2821
+ with self._exec_lock:
2822
+ try:
2823
+ # Preservation of r() results is critical because this can be called
2824
+ # automatically after every user command (e.g., during streaming).
2825
+ import time
2826
+ hold_name = f"_mcp_ghold_{int(time.time() * 1000 % 1000000)}"
2827
+ self.stata.run(f"capture _return hold {hold_name}", echo=False)
2828
+
2829
+ try:
2830
+ self.stata.run("macro define mcp_graph_list \"\"", echo=False)
2831
+ self.stata.run("quietly graph dir, memory", echo=False)
2832
+ from sfi import Macro # type: ignore[import-not-found]
2833
+ self.stata.run("macro define mcp_graph_list `r(list)'", echo=False)
2834
+ graph_list_str = Macro.getGlobal("mcp_graph_list")
2835
+ finally:
2836
+ self.stata.run(f"capture _return restore {hold_name}", echo=False)
1534
2837
 
1535
- # Accessing r-class results in Python can be tricky via pystata's run command.
1536
- # We stash the result in a global macro that python sfi can easily read.
1537
- from sfi import Macro # type: ignore[import-not-found]
1538
- self.stata.run("global mcp_graph_list `r(list)'")
1539
- graph_list_str = Macro.getGlobal("mcp_graph_list")
1540
- raw_list = graph_list_str.split() if graph_list_str else []
2838
+ raw_list = graph_list_str.split() if graph_list_str else []
1541
2839
 
1542
- # Map internal Stata names back to user-facing names when we have an alias.
1543
- reverse = getattr(self, "_graph_name_reverse", {})
1544
- graph_list = [reverse.get(n, n) for n in raw_list]
2840
+ # Map internal Stata names back to user-facing names when we have an alias.
2841
+ reverse = getattr(self, "_graph_name_reverse", {})
2842
+ graph_list = [reverse.get(n, n) for n in raw_list]
1545
2843
 
1546
- result = graph_list
2844
+ result = graph_list
1547
2845
 
1548
- # Update cache
1549
- with self._list_graphs_cache_lock:
1550
- self._list_graphs_cache = result
1551
- self._list_graphs_cache_time = current_time
1552
-
1553
- return result
1554
-
1555
- except Exception as e:
1556
- # On error, return cached result if available, otherwise empty list
1557
- with self._list_graphs_cache_lock:
1558
- if self._list_graphs_cache is not None:
1559
- logger.warning(f"list_graphs failed, returning cached result: {e}")
1560
- return self._list_graphs_cache
1561
- logger.warning(f"list_graphs failed, no cache available: {e}")
1562
- return []
2846
+ # Update cache
2847
+ with self._list_graphs_cache_lock:
2848
+ self._list_graphs_cache = result
2849
+ self._list_graphs_cache_time = time.time()
2850
+
2851
+ return result
2852
+
2853
+ except Exception as e:
2854
+ # On error, return cached result if available, otherwise empty list
2855
+ with self._list_graphs_cache_lock:
2856
+ if self._list_graphs_cache is not None:
2857
+ logger.warning(f"list_graphs failed, returning cached result: {e}")
2858
+ return self._list_graphs_cache
2859
+ logger.warning(f"list_graphs failed, no cache available: {e}")
2860
+ return []
1563
2861
 
1564
2862
  def list_graphs_structured(self) -> GraphListResponse:
1565
2863
  names = self.list_graphs()
@@ -1583,8 +2881,9 @@ class StataClient:
1583
2881
  import tempfile
1584
2882
 
1585
2883
  fmt = (format or "pdf").strip().lower()
1586
- if fmt not in {"pdf", "png"}:
1587
- raise ValueError(f"Unsupported graph export format: {format}. Allowed: pdf, png.")
2884
+ if fmt not in {"pdf", "png", "svg"}:
2885
+ raise ValueError(f"Unsupported graph export format: {format}. Allowed: pdf, png, svg.")
2886
+
1588
2887
 
1589
2888
  if not filename:
1590
2889
  suffix = f".{fmt}"
@@ -1608,9 +2907,9 @@ class StataClient:
1608
2907
  gph_path_for_stata = gph_path.replace("\\", "/")
1609
2908
  # Make the target graph current, then save without name() (which isn't accepted there)
1610
2909
  if graph_name:
1611
- self._exec_no_capture(f'graph display "{graph_name}"', echo=False)
1612
- save_cmd = f'graph save "{gph_path_for_stata}", replace'
1613
- save_resp = self._exec_no_capture(save_cmd, echo=False)
2910
+ self._exec_no_capture_silent(f'quietly graph display "{graph_name}"', echo=False)
2911
+ save_cmd = f'quietly graph save "{gph_path_for_stata}", replace'
2912
+ save_resp = self._exec_no_capture_silent(save_cmd, echo=False)
1614
2913
  if not save_resp.success:
1615
2914
  msg = save_resp.error.message if save_resp.error else f"graph save failed (rc={save_resp.rc})"
1616
2915
  raise RuntimeError(msg)
@@ -1618,8 +2917,8 @@ class StataClient:
1618
2917
  # 2) Prepare a do-file to export PNG externally
1619
2918
  user_filename_fwd = user_filename.replace("\\", "/")
1620
2919
  do_lines = [
1621
- f'graph use "{gph_path_for_stata}"',
1622
- f'graph export "{user_filename_fwd}", replace as(png)',
2920
+ f'quietly graph use "{gph_path_for_stata}"',
2921
+ f'quietly graph export "{user_filename_fwd}", replace as(png)',
1623
2922
  "exit",
1624
2923
  ]
1625
2924
  with tempfile.NamedTemporaryFile(prefix="mcp_stata_export_", suffix=".do", delete=False, mode="w", encoding="ascii") as do_tmp:
@@ -1670,20 +2969,21 @@ class StataClient:
1670
2969
  # Stata prefers forward slashes in its command parser on Windows
1671
2970
  filename_for_stata = user_filename.replace("\\", "/")
1672
2971
 
1673
- cmd = "graph export"
1674
2972
  if graph_name:
1675
2973
  resolved = self._resolve_graph_name_for_stata(graph_name)
1676
- cmd += f' "{filename_for_stata}", name("{resolved}") replace as({fmt})'
1677
- else:
1678
- cmd += f' "{filename_for_stata}", replace as({fmt})'
2974
+ # Use display + export without name() for maximum compatibility.
2975
+ # name(NAME) often fails in PyStata for non-active graphs (r(693)).
2976
+ self._exec_no_capture_silent(f'quietly graph display "{resolved}"', echo=False)
2977
+
2978
+ cmd = f'quietly graph export "{filename_for_stata}", replace as({fmt})'
1679
2979
 
1680
2980
  # Avoid stdout/stderr redirection for graph export because PyStata's
1681
2981
  # output thread can crash on Windows when we swap stdio handles.
1682
- resp = self._exec_no_capture(cmd, echo=False)
2982
+ resp = self._exec_no_capture_silent(cmd, echo=False)
1683
2983
  if not resp.success:
1684
2984
  # Retry once after a short pause in case Stata had a transient file handle issue
1685
2985
  time.sleep(0.2)
1686
- resp_retry = self._exec_no_capture(cmd, echo=False)
2986
+ resp_retry = self._exec_no_capture_silent(cmd, echo=False)
1687
2987
  if not resp_retry.success:
1688
2988
  msg = resp_retry.error.message if resp_retry.error else f"graph export failed (rc={resp_retry.rc})"
1689
2989
  raise RuntimeError(msg)
@@ -1716,14 +3016,15 @@ class StataClient:
1716
3016
  if not self._initialized:
1717
3017
  self.init()
1718
3018
 
1719
- # Try to locate the .sthlp help file
1720
- # We use 'capture' to avoid crashing if not found
1721
- self.stata.run(f"capture findfile {topic}.sthlp")
3019
+ with self._exec_lock:
3020
+ # Try to locate the .sthlp help file
3021
+ # We use 'capture' to avoid crashing if not found
3022
+ self.stata.run(f"capture findfile {topic}.sthlp")
1722
3023
 
1723
- # Retrieve the found path from r(fn)
1724
- from sfi import Macro # type: ignore[import-not-found]
1725
- self.stata.run("global mcp_help_file `r(fn)'")
1726
- fn = Macro.getGlobal("mcp_help_file")
3024
+ # Retrieve the found path from r(fn)
3025
+ from sfi import Macro # type: ignore[import-not-found]
3026
+ self.stata.run("global mcp_help_file `r(fn)'")
3027
+ fn = Macro.getGlobal("mcp_help_file")
1727
3028
 
1728
3029
  if fn and os.path.exists(fn):
1729
3030
  try:
@@ -1737,73 +3038,77 @@ class StataClient:
1737
3038
  logger.warning("SMCL to Markdown failed, falling back to plain text: %s", parse_err)
1738
3039
  return self._smcl_to_text(smcl)
1739
3040
  except Exception as e:
1740
- return f"Error reading help file at {fn}: {e}"
3041
+ logger.warning("Help file read failed for %s: %s", topic, e)
1741
3042
 
1742
- # Fallback to URL if file not found
1743
- return f"Help file for '{topic}' not found. Please consult: https://www.stata.com/help.cgi?{topic}"
3043
+ # If no help file found, return a fallback message
3044
+ return f"Help file for '{topic}' not found."
3045
+
3046
+ def get_stored_results(self, force_fresh: bool = False) -> Dict[str, Any]:
3047
+ """Returns e() and r() results using SFI for maximum reliability."""
3048
+ if not force_fresh and self._last_results is not None:
3049
+ return self._last_results
1744
3050
 
1745
- def get_stored_results(self) -> Dict[str, Any]:
1746
- """Returns e() and r() results."""
1747
3051
  if not self._initialized:
1748
3052
  self.init()
1749
3053
 
1750
- results = {"r": {}, "e": {}}
1751
-
1752
- # We parse 'return list' output as there is no direct bulk export of stored results
1753
- raw_r_resp = self.run_command_structured("return list", echo=True)
1754
- raw_e_resp = self.run_command_structured("ereturn list", echo=True)
1755
- raw_r = raw_r_resp.stdout if raw_r_resp.success else (raw_r_resp.error.snippet if raw_r_resp.error else "")
1756
- raw_e = raw_e_resp.stdout if raw_e_resp.success else (raw_e_resp.error.snippet if raw_e_resp.error else "")
1757
-
1758
- # Simple parser
1759
- def parse_list(text):
1760
- data = {}
1761
- # We don't strictly need to track sections if we check patterns
1762
- for line in text.splitlines():
1763
- line = line.strip()
1764
- if not line:
1765
- continue
1766
-
1767
- # scalars: r(name) = value
1768
- if "=" in line and ("r(" in line or "e(" in line):
1769
- try:
1770
- name_part, val_part = line.split("=", 1)
1771
- name_part = name_part.strip() # "r(mean)"
1772
- val_part = val_part.strip() # "6165.2..."
1773
-
1774
- # Extract just the name inside r(...) if desired,
1775
- # or keep full key "r(mean)".
1776
- # User likely wants "mean" inside "r" dict.
1777
-
1778
- if "(" in name_part and name_part.endswith(")"):
1779
- # r(mean) -> mean
1780
- start = name_part.find("(") + 1
1781
- end = name_part.find(")")
1782
- key = name_part[start:end]
1783
- data[key] = val_part
1784
- except Exception:
1785
- pass
1786
-
1787
- # macros: r(name) : "value"
1788
- elif ":" in line and ("r(" in line or "e(" in line):
1789
- try:
1790
- name_part, val_part = line.split(":", 1)
1791
- name_part = name_part.strip()
1792
- val_part = val_part.strip().strip('"')
1793
-
1794
- if "(" in name_part and name_part.endswith(")"):
1795
- start = name_part.find("(") + 1
1796
- end = name_part.find(")")
1797
- key = name_part[start:end]
1798
- data[key] = val_part
1799
- except Exception:
1800
- pass
1801
- return data
1802
-
1803
- results["r"] = parse_list(raw_r)
1804
- results["e"] = parse_list(raw_e)
1805
-
1806
- return results
3054
+ with self._exec_lock:
3055
+ # We must be extremely careful not to clobber r()/e() while fetching their names.
3056
+ # We use a hold to peek at the results.
3057
+ hold_name = f"mcp_peek_{uuid.uuid4().hex[:8]}"
3058
+ self.stata.run(f"capture _return hold {hold_name}", echo=False)
3059
+
3060
+ try:
3061
+ from sfi import Scalar, Macro
3062
+ results = {"r": {}, "e": {}}
3063
+
3064
+ for rclass in ["r", "e"]:
3065
+ # Restore with 'hold' to peek at results without losing them from the hold
3066
+ # Note: Stata 18+ supports 'restore ..., hold' which is ideal.
3067
+ self.stata.run(f"capture _return restore {hold_name}, hold", echo=False)
3068
+
3069
+ # Fetch names using backtick expansion (which we verified works better than colon)
3070
+ # and avoid leading underscores which were causing syntax errors with 'global'
3071
+ self.stata.run(f"macro define mcp_scnames `: {rclass}(scalars)'", echo=False)
3072
+ self.stata.run(f"macro define mcp_macnames `: {rclass}(macros)'", echo=False)
3073
+
3074
+ # 1. Capture Scalars
3075
+ names_str = Macro.getGlobal("mcp_scnames")
3076
+ if names_str:
3077
+ for name in names_str.split():
3078
+ try:
3079
+ val = Scalar.getValue(f"{rclass}({name})")
3080
+ results[rclass][name] = val
3081
+ except Exception:
3082
+ pass
3083
+
3084
+ # 2. Capture Macros (strings)
3085
+ macros_str = Macro.getGlobal("mcp_macnames")
3086
+ if macros_str:
3087
+ for name in macros_str.split():
3088
+ try:
3089
+ # Restore/Hold again to be safe before fetching each macro
3090
+ self.stata.run(f"capture _return restore {hold_name}, hold", echo=False)
3091
+ # Capture the string value into a macro
3092
+ self.stata.run(f"macro define mcp_mval `{rclass}({name})'", echo=False)
3093
+ val = Macro.getGlobal("mcp_mval")
3094
+ results[rclass][name] = val
3095
+ except Exception:
3096
+ pass
3097
+
3098
+ # Cleanup
3099
+ self.stata.run("macro drop mcp_scnames mcp_macnames mcp_mval", echo=False)
3100
+ self.stata.run(f"capture _return restore {hold_name}", echo=False) # Restore one last time to leave Stata in correct state
3101
+
3102
+ self._last_results = results
3103
+ return results
3104
+ except Exception as e:
3105
+ logger.error(f"SFI-based get_stored_results failed: {e}")
3106
+ # Try to clean up hold if we failed
3107
+ try:
3108
+ self.stata.run(f"capture _return drop {hold_name}", echo=False)
3109
+ except Exception:
3110
+ pass
3111
+ return {"r": {}, "e": {}}
1807
3112
 
1808
3113
  def invalidate_graph_cache(self, graph_name: str = None) -> None:
1809
3114
  """Invalidate cache for specific graph or all graphs.
@@ -1953,47 +3258,32 @@ class StataClient:
1953
3258
 
1954
3259
  # Additional validation by attempting to display the graph
1955
3260
  resolved = self._resolve_graph_name_for_stata(graph_name)
1956
- cmd = f'graph display {resolved}'
1957
- resp = self._exec_no_capture(cmd, echo=False)
3261
+ cmd = f'quietly graph display {resolved}'
3262
+ resp = self._exec_no_capture_silent(cmd, echo=False)
1958
3263
  return resp.success
1959
3264
  except Exception:
1960
3265
  return False
1961
3266
 
1962
3267
  def _is_cache_valid(self, graph_name: str, cache_path: str) -> bool:
1963
- """Check if cached content is still valid."""
3268
+ """Check if cached content is still valid using internal signatures."""
1964
3269
  try:
1965
- # Get current graph content hash
1966
- import tempfile
1967
- import os
1968
-
1969
- temp_dir = tempfile.gettempdir()
1970
- temp_file = os.path.join(temp_dir, f"temp_{graph_name}_{os.getpid()}.svg")
1971
-
1972
- resolved = self._resolve_graph_name_for_stata(graph_name)
1973
- export_cmd = f'graph export "{temp_file.replace("\\\\", "/")}", name({resolved}) replace as(svg)'
1974
- resp = self._exec_no_capture(export_cmd, echo=False)
1975
-
1976
- if resp.success and os.path.exists(temp_file):
1977
- with open(temp_file, 'rb') as f:
1978
- current_data = f.read()
1979
- os.remove(temp_file)
3270
+ if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:
3271
+ return False
1980
3272
 
1981
- current_hash = self._get_content_hash(current_data)
1982
- cached_hash = self._preemptive_cache.get(f"{graph_name}_hash")
3273
+ current_sig = self._get_graph_signature(graph_name)
3274
+ cached_sig = self._preemptive_cache.get(f"{graph_name}_sig")
3275
+
3276
+ # If we have a signature match, it's valid for the current command session
3277
+ if cached_sig and cached_sig == current_sig:
3278
+ return True
1983
3279
 
1984
- return cached_hash == current_hash
3280
+ # Otherwise it's invalid (needs refresh for new command)
3281
+ return False
1985
3282
  except Exception:
1986
- pass
1987
-
1988
- return False # Assume invalid if we can't verify
1989
-
1990
- def export_graphs_all(self, use_base64: bool = False) -> GraphExportResponse:
1991
- """Exports all graphs to file paths (default) or base64-encoded strings.
3283
+ return False
1992
3284
 
1993
- Args:
1994
- use_base64: If True, returns base64-encoded images. If False (default),
1995
- returns file paths to exported SVG files.
1996
- """
3285
+ def export_graphs_all(self) -> GraphExportResponse:
3286
+ """Exports all graphs to file paths."""
1997
3287
  exports: List[GraphExport] = []
1998
3288
  graph_names = self.list_graphs(force_refresh=True)
1999
3289
 
@@ -2003,7 +3293,6 @@ class StataClient:
2003
3293
  import tempfile
2004
3294
  import os
2005
3295
  import threading
2006
- import base64
2007
3296
  import uuid
2008
3297
  import time
2009
3298
  import logging
@@ -2027,15 +3316,15 @@ class StataClient:
2027
3316
  svg_path_for_stata = svg_path.replace("\\", "/")
2028
3317
 
2029
3318
  try:
2030
- export_cmd = f'graph export "{svg_path_for_stata}", name({resolved}) replace as(svg)'
2031
- export_resp = self._exec_no_capture(export_cmd, echo=False)
3319
+ export_cmd = f'quietly graph export "{svg_path_for_stata}", name({resolved}) replace as(svg)'
3320
+ export_resp = self._exec_no_capture_silent(export_cmd, echo=False)
2032
3321
 
2033
3322
  if not export_resp.success:
2034
- display_cmd = f'graph display {resolved}'
2035
- display_resp = self._exec_no_capture(display_cmd, echo=False)
3323
+ display_cmd = f'quietly graph display {resolved}'
3324
+ display_resp = self._exec_no_capture_silent(display_cmd, echo=False)
2036
3325
  if display_resp.success:
2037
- export_cmd2 = f'graph export "{svg_path_for_stata}", replace as(svg)'
2038
- export_resp = self._exec_no_capture(export_cmd2, echo=False)
3326
+ export_cmd2 = f'quietly graph export "{svg_path_for_stata}", replace as(svg)'
3327
+ export_resp = self._exec_no_capture_silent(export_cmd2, echo=False)
2039
3328
  else:
2040
3329
  export_resp = display_resp
2041
3330
 
@@ -2077,12 +3366,7 @@ class StataClient:
2077
3366
 
2078
3367
  for name, cached_path in cached_graphs.items():
2079
3368
  try:
2080
- if use_base64:
2081
- with open(cached_path, "rb") as f:
2082
- svg_b64 = base64.b64encode(f.read()).decode("ascii")
2083
- exports.append(GraphExport(name=name, image_base64=svg_b64))
2084
- else:
2085
- exports.append(GraphExport(name=name, file_path=cached_path))
3369
+ exports.append(GraphExport(name=name, file_path=cached_path))
2086
3370
  except Exception as e:
2087
3371
  cache_errors.append(f"Failed to read cached graph {name}: {e}")
2088
3372
  # Fall back to uncached processing
@@ -2125,24 +3409,16 @@ class StataClient:
2125
3409
  self._cache_sizes[name] = item_size
2126
3410
  self._total_cache_size += item_size
2127
3411
 
2128
- if use_base64:
2129
- svg_b64 = base64.b64encode(result).decode("ascii")
2130
- exports.append(GraphExport(name=name, image_base64=svg_b64))
2131
- else:
2132
- exports.append(GraphExport(name=name, file_path=cache_path))
3412
+ exports.append(GraphExport(name=name, file_path=cache_path))
2133
3413
  except Exception as e:
2134
3414
  cache_errors.append(f"Failed to cache graph {name}: {e}")
2135
3415
  # Still return the result even if caching fails
2136
- if use_base64:
2137
- svg_b64 = base64.b64encode(result).decode("ascii")
2138
- exports.append(GraphExport(name=name, image_base64=svg_b64))
2139
- else:
2140
- # Create temp file for immediate use
2141
- safe_name = self._sanitize_filename(name)
2142
- temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{uuid.uuid4().hex[:8]}.svg")
2143
- with open(temp_path, 'wb') as f:
2144
- f.write(result)
2145
- exports.append(GraphExport(name=name, file_path=temp_path))
3416
+ # Create temp file for immediate use
3417
+ safe_name = self._sanitize_filename(name)
3418
+ temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{uuid.uuid4().hex[:8]}.svg")
3419
+ with open(temp_path, 'wb') as f:
3420
+ f.write(result)
3421
+ exports.append(GraphExport(name=name, file_path=temp_path))
2146
3422
 
2147
3423
  # Log errors if any occurred
2148
3424
  if cache_errors:
@@ -2197,29 +3473,21 @@ class StataClient:
2197
3473
  del self._preemptive_cache[hash_key]
2198
3474
 
2199
3475
  try:
2200
- # Sanitize graph name for file system
2201
- safe_name = self._sanitize_filename(graph_name)
3476
+ # Include signature in filename to force client-side refresh
3477
+ sig = self._get_graph_signature(graph_name)
3478
+ safe_name = self._sanitize_filename(sig)
2202
3479
  cache_path = os.path.join(self._preemptive_cache_dir, f"{safe_name}.svg")
2203
3480
  cache_path_for_stata = cache_path.replace("\\", "/")
2204
3481
 
2205
3482
  resolved_graph_name = self._resolve_graph_name_for_stata(graph_name)
2206
- graph_name_q = self._stata_quote(resolved_graph_name)
3483
+ # Use display + export without name() for maximum compatibility.
3484
+ # name(NAME) often fails in PyStata for non-active graphs (r(693)).
3485
+ # Quoting the name helps with spaces/special characters.
3486
+ display_cmd = f'quietly graph display "{resolved_graph_name}"'
3487
+ self._exec_no_capture_silent(display_cmd, echo=False)
2207
3488
 
2208
- export_cmd = f'graph export "{cache_path_for_stata}", name({graph_name_q}) replace as(svg)'
2209
- resp = self._exec_no_capture(export_cmd, echo=False)
2210
-
2211
- # Fallback: some graph names (spaces, slashes, backslashes) can confuse
2212
- # Stata's parser in name() even when the graph exists. In that case,
2213
- # make the graph current, then export without name().
2214
- if not resp.success:
2215
- try:
2216
- display_cmd = f'graph display {graph_name_q}'
2217
- display_resp = self._exec_no_capture(display_cmd, echo=False)
2218
- if display_resp.success:
2219
- export_cmd2 = f'graph export "{cache_path_for_stata}", replace as(svg)'
2220
- resp = self._exec_no_capture(export_cmd2, echo=False)
2221
- except Exception:
2222
- pass
3489
+ export_cmd = f'quietly graph export "{cache_path_for_stata}", replace as(svg)'
3490
+ resp = self._exec_no_capture_silent(export_cmd, echo=False)
2223
3491
 
2224
3492
  if resp.success and os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
2225
3493
  # Read the data to compute hash
@@ -2232,9 +3500,20 @@ class StataClient:
2232
3500
  self._evict_cache_if_needed(item_size)
2233
3501
 
2234
3502
  with self._cache_lock:
3503
+ # Clear any old versions of this graph from the path cache
3504
+ # (Optional but keeps it clean)
3505
+ old_path = self._preemptive_cache.get(graph_name)
3506
+ if old_path and old_path != cache_path:
3507
+ try:
3508
+ os.remove(old_path)
3509
+ except Exception:
3510
+ pass
3511
+
2235
3512
  self._preemptive_cache[graph_name] = cache_path
2236
3513
  # Store content hash for validation
2237
3514
  self._preemptive_cache[f"{graph_name}_hash"] = self._get_content_hash(data)
3515
+ # Store signature for fast validation
3516
+ self._preemptive_cache[f"{graph_name}_sig"] = self._get_graph_signature(graph_name)
2238
3517
  # Update tracking
2239
3518
  self._cache_access_times[graph_name] = time.time()
2240
3519
  self._cache_sizes[graph_name] = item_size
@@ -2253,105 +3532,58 @@ class StataClient:
2253
3532
  return False
2254
3533
 
2255
3534
  def run_do_file(self, path: str, echo: bool = True, trace: bool = False, max_output_lines: Optional[int] = None, cwd: Optional[str] = None) -> CommandResponse:
2256
- if cwd is not None and not os.path.isdir(cwd):
2257
- return CommandResponse(
2258
- command=f'do "{path}"',
2259
- rc=601,
2260
- stdout="",
2261
- stderr=None,
2262
- success=False,
2263
- error=ErrorEnvelope(
2264
- message=f"cwd not found: {cwd}",
2265
- rc=601,
2266
- command=path,
2267
- ),
2268
- )
2269
-
2270
- effective_path = path
2271
- if cwd is not None and not os.path.isabs(path):
2272
- effective_path = os.path.abspath(os.path.join(cwd, path))
2273
-
2274
- if not os.path.exists(effective_path):
2275
- return CommandResponse(
2276
- command=f'do "{effective_path}"',
2277
- rc=601,
2278
- stdout="",
2279
- stderr=None,
2280
- success=False,
2281
- error=ErrorEnvelope(
2282
- message=f"Do-file not found: {effective_path}",
2283
- rc=601,
2284
- command=effective_path,
2285
- ),
2286
- )
3535
+ effective_path, command, error_response = self._resolve_do_file_path(path, cwd)
3536
+ if error_response is not None:
3537
+ return error_response
2287
3538
 
2288
3539
  if not self._initialized:
2289
3540
  self.init()
2290
3541
 
2291
3542
  start_time = time.time()
2292
3543
  exc: Optional[Exception] = None
2293
- path_for_stata = effective_path.replace("\\", "/")
2294
- command = f'do "{path_for_stata}"'
3544
+ smcl_content = ""
3545
+ smcl_path = None
2295
3546
 
2296
- log_file = tempfile.NamedTemporaryFile(
2297
- prefix="mcp_stata_",
2298
- suffix=".log",
2299
- delete=False,
2300
- mode="w",
2301
- encoding="utf-8",
2302
- errors="replace",
2303
- buffering=1,
2304
- )
2305
- log_path = log_file.name
2306
- tail = TailBuffer(max_chars=200000 if trace else 20000)
2307
- tee = FileTeeIO(log_file, tail)
3547
+ _log_file, log_path, tail, tee = self._create_streaming_log(trace=trace)
3548
+ base_dir = cwd or os.path.dirname(effective_path)
3549
+ smcl_path = self._create_smcl_log_path(base_dir=base_dir)
3550
+ smcl_log_name = self._make_smcl_log_name()
2308
3551
 
2309
3552
  rc = -1
3553
+ try:
3554
+ rc, exc = self._run_streaming_blocking(
3555
+ command=command,
3556
+ tee=tee,
3557
+ cwd=cwd,
3558
+ trace=trace,
3559
+ echo=echo,
3560
+ smcl_path=smcl_path,
3561
+ smcl_log_name=smcl_log_name,
3562
+ hold_attr="_hold_name_do_sync",
3563
+ require_smcl_log=True,
3564
+ )
3565
+ except Exception as e:
3566
+ exc = e
3567
+ rc = 1
3568
+ finally:
3569
+ tee.close()
2310
3570
 
2311
- with self._exec_lock:
2312
- try:
2313
- from sfi import Scalar, SFIToolkit # Import SFI tools
2314
- with self._temp_cwd(cwd):
2315
- with self._redirect_io_streaming(tee, tee):
2316
- try:
2317
- if trace:
2318
- self.stata.run("set trace on")
2319
- ret = self.stata.run(command, echo=echo)
2320
- # Some PyStata builds return output as a string rather than printing.
2321
- if isinstance(ret, str) and ret:
2322
- try:
2323
- tee.write(ret)
2324
- except Exception:
2325
- pass
2326
-
2327
- except Exception as e:
2328
- exc = e
2329
- rc = 1
2330
- finally:
2331
- if trace:
2332
- try:
2333
- self.stata.run("set trace off")
2334
- except Exception:
2335
- pass
2336
- except Exception as e:
2337
- # Outer catch in case imports or locks fail
2338
- exc = e
2339
- rc = 1
2340
-
2341
- tee.close()
3571
+ # Read SMCL content as the authoritative source
3572
+ smcl_content = self._read_smcl_file(smcl_path)
2342
3573
 
2343
- tail_text = tail.get_value()
2344
- log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
2345
- if log_tail and len(log_tail) > len(tail_text):
2346
- tail_text = log_tail
2347
- combined = (tail_text or "") + (f"\n{exc}" if exc else "")
3574
+ combined = self._build_combined_log(tail, log_path, rc, trace, exc)
2348
3575
 
2349
- # Parse RC from log tail if no exception occurred
3576
+ # Use SMCL content as primary source for RC detection if not already captured
2350
3577
  if rc == -1 and not exc:
2351
- parsed_rc = self._parse_rc_from_text(combined)
2352
- rc = parsed_rc if parsed_rc is not None else 0
2353
- elif exc:
2354
- # Try to parse RC from exception message
3578
+ parsed_rc = self._parse_rc_from_smcl(smcl_content)
3579
+ if parsed_rc is not None:
3580
+ rc = parsed_rc
3581
+ else:
3582
+ # Fallback to text parsing
3583
+ parsed_rc = self._parse_rc_from_text(combined)
3584
+ rc = parsed_rc if parsed_rc is not None else 0
3585
+ elif exc and rc == 1:
3586
+ # Try to parse more specific RC from exception message
2355
3587
  parsed_rc = self._parse_rc_from_text(str(exc))
2356
3588
  if parsed_rc is not None:
2357
3589
  rc = parsed_rc
@@ -2360,15 +3592,20 @@ class StataClient:
2360
3592
  error = None
2361
3593
 
2362
3594
  if not success:
2363
- # Robust extraction
2364
- msg, context = self._extract_error_and_context(combined, rc)
3595
+ # Use SMCL as authoritative source for error extraction
3596
+ if smcl_content:
3597
+ msg, context = self._extract_error_from_smcl(smcl_content, rc)
3598
+ else:
3599
+ # Fallback to combined log
3600
+ msg, context = self._extract_error_and_context(combined, rc)
2365
3601
 
2366
3602
  error = ErrorEnvelope(
2367
3603
  message=msg,
2368
3604
  rc=rc,
2369
3605
  snippet=context,
2370
3606
  command=command,
2371
- log_path=log_path
3607
+ log_path=log_path,
3608
+ smcl_output=smcl_content,
2372
3609
  )
2373
3610
 
2374
3611
  duration = time.time() - start_time
@@ -2389,6 +3626,7 @@ class StataClient:
2389
3626
  log_path=log_path,
2390
3627
  success=success,
2391
3628
  error=error,
3629
+ smcl_output=smcl_content,
2392
3630
  )
2393
3631
 
2394
3632
  def load_data(self, source: str, clear: bool = True, max_output_lines: Optional[int] = None) -> CommandResponse:
@@ -2407,40 +3645,8 @@ class StataClient:
2407
3645
  cmd = f"sysuse {src}{clear_suffix}"
2408
3646
 
2409
3647
  result = self._exec_with_capture(cmd, echo=True, trace=False)
2410
-
2411
- # Truncate stdout if requested
2412
- if max_output_lines is not None and result.stdout:
2413
- lines = result.stdout.splitlines()
2414
- if len(lines) > max_output_lines:
2415
- truncated_lines = lines[:max_output_lines]
2416
- truncated_lines.append(f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)")
2417
- result = CommandResponse(
2418
- command=result.command,
2419
- rc=result.rc,
2420
- stdout="\n".join(truncated_lines),
2421
- stderr=result.stderr,
2422
- success=result.success,
2423
- error=result.error,
2424
- )
2425
-
2426
- return result
3648
+ return self._truncate_command_output(result, max_output_lines)
2427
3649
 
2428
3650
  def codebook(self, varname: str, trace: bool = False, max_output_lines: Optional[int] = None) -> CommandResponse:
2429
3651
  result = self._exec_with_capture(f"codebook {varname}", trace=trace)
2430
-
2431
- # Truncate stdout if requested
2432
- if max_output_lines is not None and result.stdout:
2433
- lines = result.stdout.splitlines()
2434
- if len(lines) > max_output_lines:
2435
- truncated_lines = lines[:max_output_lines]
2436
- truncated_lines.append(f"\n... (output truncated: showing {max_output_lines} of {len(lines)} lines)")
2437
- result = CommandResponse(
2438
- command=result.command,
2439
- rc=result.rc,
2440
- stdout="\n".join(truncated_lines),
2441
- stderr=result.stderr,
2442
- success=result.success,
2443
- error=result.error,
2444
- )
2445
-
2446
- return result
3652
+ return self._truncate_command_output(result, max_output_lines)