PyPI - mcp-stata - Versions diffs - 1.6.2__py3-none-any.whl → 1.7.3__py3-none-any.whl - Mend - Supply Chain Defender

mcp-stata 1.6.2__py3-none-any.whl → 1.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcp-stata might be problematic. Click here for more details.

Files changed (10) hide show

mcp_stata/discovery.py +315 -77
mcp_stata/server.py +7 -0
mcp_stata/stata_client.py +328 -77
mcp_stata/ui_http.py +61 -6
{mcp_stata-1.6.2.dist-info → mcp_stata-1.7.3.dist-info}/METADATA +68 -2
mcp_stata-1.7.3.dist-info/RECORD +14 -0
mcp_stata-1.6.2.dist-info/RECORD +0 -14
{mcp_stata-1.6.2.dist-info → mcp_stata-1.7.3.dist-info}/WHEEL +0 -0
{mcp_stata-1.6.2.dist-info → mcp_stata-1.7.3.dist-info}/entry_points.txt +0 -0
{mcp_stata-1.6.2.dist-info → mcp_stata-1.7.3.dist-info}/licenses/LICENSE +0 -0

mcp_stata/stata_client.py CHANGED Viewed

@@ -6,13 +6,15 @@ import re
 import subprocess
 import sys
 import threading
+from importlib.metadata import PackageNotFoundError, version
 import tempfile
 import time
 from contextlib import contextmanager
 from io import StringIO
-from typing import Any, Awaitable, Callable, Dict, List, Optional
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
 import anyio
+from anyio import get_cancelled_exc_class
 from .discovery import find_stata_path
 from .models import (
@@ -32,6 +34,74 @@ from .graph_detector import StreamingGraphCache
 logger = logging.getLogger("mcp_stata")
+# ============================================================================
+# MODULE-LEVEL DISCOVERY CACHE
+# ============================================================================
+# This cache ensures Stata discovery runs exactly once per process lifetime
+_discovery_lock = threading.Lock()
+_discovery_result: Optional[Tuple[str, str]] = None  # (path, edition)
+_discovery_attempted = False
+_discovery_error: Optional[Exception] = None
+def _get_discovered_stata() -> Tuple[str, str]:
+    """
+    Get the discovered Stata path and edition, running discovery only once.
+    Returns:
+        Tuple of (stata_executable_path, edition)
+    Raises:
+        RuntimeError: If Stata discovery fails
+    """
+    global _discovery_result, _discovery_attempted, _discovery_error
+    with _discovery_lock:
+        # If we've already successfully discovered Stata, return cached result
+        if _discovery_result is not None:
+            return _discovery_result
+        # If we've already attempted and failed, re-raise the cached error
+        if _discovery_attempted and _discovery_error is not None:
+            raise RuntimeError(f"Stata binary not found: {_discovery_error}") from _discovery_error
+        # This is the first attempt - run discovery
+        _discovery_attempted = True
+        try:
+            # Log environment state once at first discovery
+            env_path = os.getenv("STATA_PATH")
+            if env_path:
+                logger.info("STATA_PATH env provided (raw): %s", env_path)
+            else:
+                logger.info("STATA_PATH env not set; attempting auto-discovery")
+            try:
+                pkg_version = version("mcp-stata")
+            except PackageNotFoundError:
+                pkg_version = "unknown"
+            logger.info("mcp-stata version: %s", pkg_version)
+            # Run discovery
+            stata_exec_path, edition = find_stata_path()
+            # Cache the successful result
+            _discovery_result = (stata_exec_path, edition)
+            logger.info("Discovery found Stata at: %s (%s)", stata_exec_path, edition)
+            return _discovery_result
+        except FileNotFoundError as e:
+            _discovery_error = e
+            raise RuntimeError(f"Stata binary not found: {e}") from e
+        except PermissionError as e:
+            _discovery_error = e
+            raise RuntimeError(
+                f"Stata binary is not executable: {e}. "
+                "Point STATA_PATH directly to the Stata binary (e.g., .../Contents/MacOS/stata-mp)."
+            ) from e
 class StataClient:
     _initialized = False
     _exec_lock: threading.Lock
@@ -100,6 +170,62 @@ class StataClient:
                 logger.error(f"Failed to notify about graph cache: {e}")
         return graph_cache_callback
+    def _request_break_in(self) -> None:
+        """
+        Attempt to interrupt a running Stata command when cancellation is requested.
+        Uses the Stata sfi.breakIn hook when available; errors are swallowed because
+        cancellation should never crash the host process.
+        """
+        try:
+            import sfi  # type: ignore[import-not-found]
+            break_fn = getattr(sfi, "breakIn", None) or getattr(sfi, "break_in", None)
+            if callable(break_fn):
+                try:
+                    break_fn()
+                    logger.info("Sent breakIn() to Stata for cancellation")
+                except Exception as e:  # pragma: no cover - best-effort
+                    logger.warning(f"Failed to send breakIn() to Stata: {e}")
+            else:  # pragma: no cover - environment without Stata runtime
+                logger.debug("sfi.breakIn not available; cannot interrupt Stata")
+        except Exception as e:  # pragma: no cover - import failure or other
+            logger.debug(f"Unable to import sfi for cancellation: {e}")
+    async def _wait_for_stata_stop(self, timeout: float = 2.0) -> bool:
+        """
+        After requesting a break, poll the Stata interface so it can surface BreakError
+        and return control. This is best-effort and time-bounded.
+        """
+        deadline = time.monotonic() + timeout
+        try:
+            import sfi  # type: ignore[import-not-found]
+            toolkit = getattr(sfi, "SFIToolkit", None)
+            poll = getattr(toolkit, "pollnow", None) or getattr(toolkit, "pollstd", None)
+            BreakError = getattr(sfi, "BreakError", None)
+        except Exception:  # pragma: no cover
+            return False
+        if not callable(poll):
+            return False
+        last_exc: Optional[Exception] = None
+        while time.monotonic() < deadline:
+            try:
+                poll()
+            except Exception as e:  # pragma: no cover - depends on Stata runtime
+                last_exc = e
+                if BreakError is not None and isinstance(e, BreakError):
+                    logger.info("Stata BreakError detected; cancellation acknowledged by Stata")
+                    return True
+                # If Stata already stopped, break on any other exception.
+                break
+            await anyio.sleep(0.05)
+        if last_exc:
+            logger.debug(f"Cancellation poll exited with {last_exc}")
+        return False
     @contextmanager
     def _temp_cwd(self, cwd: Optional[str]):
@@ -114,24 +240,15 @@ class StataClient:
             os.chdir(prev)
     def init(self):
-        """Initializes usage of pystata."""
+        """Initializes usage of pystata using cached discovery results."""
         if self._initialized:
             return
         try:
             import stata_setup
-            try:
-                stata_exec_path, edition = find_stata_path()
-            except FileNotFoundError as e:
-                raise RuntimeError(f"Stata binary not found: {e}") from e
-            except PermissionError as e:
-                raise RuntimeError(
-                    f"Stata binary is not executable: {e}. "
-                    "Point STATA_PATH directly to the Stata binary (e.g., .../Contents/MacOS/stata-mp)."
-                ) from e
-            logger.info(f"Discovery found Stata at: {stata_exec_path} ({edition})")
+            # Get discovered Stata path (cached from first call)
+            stata_exec_path, edition = _get_discovered_stata()
             candidates = []
@@ -171,6 +288,7 @@ class StataClient:
                 try:
                     stata_setup.config(path, edition)
                     success = True
+                    logger.debug("stata_setup.config succeeded with path: %s", path)
                     break
                 except Exception:
                     continue
@@ -187,14 +305,6 @@ class StataClient:
             from pystata import stata  # type: ignore[import-not-found]
             self.stata = stata
             self._initialized = True
-            # Ensure a clean graph state for a fresh client. PyStata's backend is
-            # effectively global, so graph memory can otherwise leak across tests
-            # and separate StataClient instances.
-            try:
-                self.stata.run("capture graph drop _all", quietly=True)
-            except Exception:
-                pass
             # Initialize list_graphs TTL cache
             self._list_graphs_cache = None
@@ -205,11 +315,14 @@ class StataClient:
             # internal Stata graph names.
             self._graph_name_aliases: Dict[str, str] = {}
             self._graph_name_reverse: Dict[str, str] = {}
+            logger.info("StataClient initialized successfully with %s (%s)", stata_exec_path, edition)
-        except ImportError:
-            # Fallback for when stata_setup isn't in PYTHONPATH yet?
-            # Usually users must have it installed. We rely on discovery logic.
-            raise RuntimeError("Could not import `stata_setup`. Ensure pystata is installed.")
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import stata_setup or pystata: {e}. "
+                "Ensure they are installed (pip install pystata stata-setup)."
+            ) from e
     def _make_valid_stata_name(self, name: str) -> str:
         """Create a valid Stata name (<=32 chars, [A-Za-z_][A-Za-z0-9_]*)."""
@@ -295,6 +408,73 @@ class StataClient:
                 return None
         return None
+    def _read_log_tail(self, path: str, max_chars: int) -> str:
+        try:
+            with open(path, "rb") as f:
+                f.seek(0, os.SEEK_END)
+                size = f.tell()
+                if size <= 0:
+                    return ""
+                read_size = min(size, max_chars)
+                f.seek(-read_size, os.SEEK_END)
+                data = f.read(read_size)
+            return data.decode("utf-8", errors="replace")
+        except Exception:
+            return ""
+    def _select_stata_error_message(self, text: str, fallback: str) -> str:
+        if not text:
+            return fallback
+        ignore_patterns = (
+            r"^r\(\d+\);?$",
+            r"^end of do-file$",
+            r"^execution terminated$",
+            r"^[-=*]{3,}.*$",
+        )
+        rc_pattern = r"^r\(\d+\);?$"
+        error_patterns = (
+            r"\btype mismatch\b",
+            r"\bnot found\b",
+            r"\bnot allowed\b",
+            r"\bno observations\b",
+            r"\bconformability error\b",
+            r"\binvalid\b",
+            r"\bsyntax error\b",
+            r"\berror\b",
+        )
+        lines = text.splitlines()
+        for raw in reversed(lines):
+            line = raw.strip()
+            if not line:
+                continue
+            if any(re.search(pat, line, re.IGNORECASE) for pat in error_patterns):
+                return line
+        for i in range(len(lines) - 1, -1, -1):
+            line = lines[i].strip()
+            if not line:
+                continue
+            if re.match(rc_pattern, line, re.IGNORECASE):
+                for j in range(i - 1, -1, -1):
+                    prev_line = lines[j].strip()
+                    if not prev_line:
+                        continue
+                    if prev_line.startswith((".", ">", "-", "=")):
+                        continue
+                    if any(re.match(pat, prev_line, re.IGNORECASE) for pat in ignore_patterns):
+                        continue
+                    return prev_line
+                return line
+        for raw in reversed(lines):
+            line = raw.strip()
+            if not line:
+                continue
+            if line.startswith((".", ">", "-", "=")):
+                continue
+            if any(re.match(pat, line, re.IGNORECASE) for pat in ignore_patterns):
+                continue
+            return line
+        return fallback
     def _smcl_to_text(self, smcl: str) -> str:
         """Convert simple SMCL markup into plain text for LLM-friendly help."""
         # First, keep inline directive content if present (e.g., {bf:word} -> word)
@@ -320,7 +500,10 @@ class StataClient:
         rc_final = rc_hint if (rc_hint is not None and rc_hint != 0) else (rc if rc not in (-1, None) else rc_hint)
         line_no = self._parse_line_from_text(combined) if combined else None
         snippet = combined[-800:] if combined else None
-        message = (stderr or (str(exc) if exc else "") or stdout or "Stata error").strip()
+        fallback = (stderr or (str(exc) if exc else "") or stdout or "Stata error").strip()
+        if fallback == "Stata error" and rc_final is not None:
+            fallback = f"Stata error r({rc_final})"
+        message = self._select_stata_error_message(combined, fallback)
         return ErrorEnvelope(
             message=message,
             rc=rc_final,
@@ -527,7 +710,7 @@ class StataClient:
             buffering=1,
         )
         log_path = log_file.name
-        tail = TailBuffer(max_chars=8000)
+        tail = TailBuffer(max_chars=200000 if trace else 20000)
         tee = FileTeeIO(log_file, tail)
         # Inform the MCP client immediately where to read/tail the output.
@@ -538,33 +721,42 @@ class StataClient:
         def _run_blocking() -> None:
             nonlocal rc, exc
             with self._exec_lock:
-                with self._temp_cwd(cwd):
-                    with self._redirect_io_streaming(tee, tee):
-                        try:
-                            if trace:
-                                self.stata.run("set trace on")
-                            ret = self.stata.run(code, echo=echo)
-                            # Some PyStata builds return output as a string rather than printing.
-                            if isinstance(ret, str) and ret:
-                                try:
-                                    tee.write(ret)
-                                except Exception:
-                                    pass
-                        except Exception as e:
-                            exc = e
-                        finally:
-                            rc = self._read_return_code()
-                            if trace:
-                                try:
-                                    self.stata.run("set trace off")
-                                except Exception:
-                                    pass
+                self._is_executing = True
+                try:
+                    with self._temp_cwd(cwd):
+                        with self._redirect_io_streaming(tee, tee):
+                            try:
+                                if trace:
+                                    self.stata.run("set trace on")
+                                ret = self.stata.run(code, echo=echo)
+                                # Some PyStata builds return output as a string rather than printing.
+                                if isinstance(ret, str) and ret:
+                                    try:
+                                        tee.write(ret)
+                                    except Exception:
+                                        pass
+                            except Exception as e:
+                                exc = e
+                            finally:
+                                rc = self._read_return_code()
+                                if trace:
+                                    try:
+                                        self.stata.run("set trace off")
+                                    except Exception:
+                                        pass
+                finally:
+                    self._is_executing = False
         try:
             if notify_progress is not None:
                 await notify_progress(0, None, "Running Stata command")
-            await anyio.to_thread.run_sync(_run_blocking)
+            await anyio.to_thread.run_sync(_run_blocking, abandon_on_cancel=True)
+        except get_cancelled_exc_class():
+            # Best-effort cancellation: signal Stata to break, wait briefly, then propagate.
+            self._request_break_in()
+            await self._wait_for_stata_stop()
+            raise
         finally:
             tee.close()
@@ -583,6 +775,9 @@ class StataClient:
                 logger.warning(f"Failed to cache detected graphs: {e}")
         tail_text = tail.get_value()
+        log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
+        if log_tail and len(log_tail) > len(tail_text):
+            tail_text = log_tail
         combined = (tail_text or "") + (f"\n{exc}" if exc else "")
         rc_hint = self._parse_rc_from_text(combined) if combined else None
         if exc is None and rc_hint is not None and rc_hint != 0:
@@ -596,14 +791,10 @@ class StataClient:
             rc_hint = self._parse_rc_from_text(combined) if combined else None
             rc_final = rc_hint if (rc_hint is not None and rc_hint != 0) else (rc if rc not in (-1, None) else rc_hint)
             line_no = self._parse_line_from_text(combined) if combined else None
-            message = "Stata error"
-            if tail_text and tail_text.strip():
-                for line in reversed(tail_text.splitlines()):
-                    if line.strip():
-                        message = line.strip()
-                        break
-            elif exc is not None:
-                message = str(exc).strip() or message
+            fallback = (str(exc).strip() if exc is not None else "") or "Stata error"
+            if fallback == "Stata error" and rc_final is not None:
+                fallback = f"Stata error r({rc_final})"
+            message = self._select_stata_error_message(combined, fallback)
             error = ErrorEnvelope(
                 message=message,
@@ -754,7 +945,7 @@ class StataClient:
             buffering=1,
         )
         log_path = log_file.name
-        tail = TailBuffer(max_chars=8000)
+        tail = TailBuffer(max_chars=200000 if trace else 20000)
         tee = FileTeeIO(log_file, tail)
         # Inform the MCP client immediately where to read/tail the output.
@@ -838,7 +1029,11 @@ class StataClient:
                     await notify_progress(0, None, "Running do-file")
             try:
-                await anyio.to_thread.run_sync(_run_blocking)
+                await anyio.to_thread.run_sync(_run_blocking, abandon_on_cancel=True)
+            except get_cancelled_exc_class():
+                self._request_break_in()
+                await self._wait_for_stata_stop()
+                raise
             finally:
                 done.set()
                 tee.close()
@@ -916,6 +1111,9 @@ class StataClient:
                 logger.error(f"Post-execution graph detection failed: {e}")
         tail_text = tail.get_value()
+        log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
+        if log_tail and len(log_tail) > len(tail_text):
+            tail_text = log_tail
         combined = (tail_text or "") + (f"\n{exc}" if exc else "")
         rc_hint = self._parse_rc_from_text(combined) if combined else None
         if exc is None and rc_hint is not None and rc_hint != 0:
@@ -929,14 +1127,10 @@ class StataClient:
             rc_hint = self._parse_rc_from_text(combined) if combined else None
             rc_final = rc_hint if (rc_hint is not None and rc_hint != 0) else (rc if rc not in (-1, None) else rc_hint)
             line_no = self._parse_line_from_text(combined) if combined else None
-            message = "Stata error"
-            if tail_text and tail_text.strip():
-                for line in reversed(tail_text.splitlines()):
-                    if line.strip():
-                        message = line.strip()
-                        break
-            elif exc is not None:
-                message = str(exc).strip() or message
+            fallback = (str(exc).strip() if exc is not None else "") or "Stata error"
+            if fallback == "Stata error" and rc_final is not None:
+                fallback = f"Stata error r({rc_final})"
+            message = self._select_stata_error_message(combined, fallback)
             error = ErrorEnvelope(
                 message=message,
@@ -1299,6 +1493,65 @@ class StataClient:
         return indices
+    def apply_sort(self, sort_spec: List[str]) -> None:
+        """
+        Apply sorting to the dataset using gsort.
+        Args:
+            sort_spec: List of variables to sort by, with optional +/- prefix.
+                      e.g., ["-price", "+mpg"] sorts by price descending, then mpg ascending.
+                      No prefix is treated as ascending (+).
+        Raises:
+            ValueError: If sort_spec is invalid or contains invalid variables
+            RuntimeError: If no data in memory or sort command fails
+        """
+        if not self._initialized:
+            self.init()
+        state = self.get_dataset_state()
+        if int(state.get("k", 0) or 0) == 0 and int(state.get("n", 0) or 0) == 0:
+            raise RuntimeError("No data in memory")
+        if not sort_spec or not isinstance(sort_spec, list):
+            raise ValueError("sort_spec must be a non-empty list")
+        # Validate all variables exist
+        var_map = self._get_var_index_map()
+        for spec in sort_spec:
+            if not isinstance(spec, str) or not spec:
+                raise ValueError(f"Invalid sort specification: {spec!r}")
+            # Extract variable name (remove +/- prefix if present)
+            varname = spec.lstrip("+-")
+            if not varname:
+                raise ValueError(f"Invalid sort specification: {spec!r}")
+            if varname not in var_map:
+                raise ValueError(f"Variable not found: {varname}")
+        # Build gsort command
+        # gsort uses - for descending, + or nothing for ascending
+        gsort_args = []
+        for spec in sort_spec:
+            if spec.startswith("-") or spec.startswith("+"):
+                gsort_args.append(spec)
+            else:
+                # No prefix means ascending, add + explicitly for clarity
+                gsort_args.append(f"+{spec}")
+        cmd = f"gsort {' '.join(gsort_args)}"
+        try:
+            result = self.run_command_structured(cmd, echo=False)
+            if not result.success:
+                error_msg = result.error.message if result.error else "Sort failed"
+                raise RuntimeError(f"Failed to sort dataset: {error_msg}")
+        except Exception as e:
+            if isinstance(e, RuntimeError):
+                raise
+            raise RuntimeError(f"Failed to sort dataset: {e}")
     def get_variable_details(self, varname: str) -> str:
         """Returns codebook/summary for a specific variable."""
         resp = self.run_command_structured(f"codebook {varname}", echo=True)
@@ -2121,7 +2374,7 @@ class StataClient:
             buffering=1,
         )
         log_path = log_file.name
-        tail = TailBuffer(max_chars=8000)
+        tail = TailBuffer(max_chars=200000 if trace else 20000)
         tee = FileTeeIO(log_file, tail)
         rc = -1
@@ -2152,6 +2405,9 @@ class StataClient:
         tee.close()
         tail_text = tail.get_value()
+        log_tail = self._read_log_tail(log_path, 200000 if trace else 20000)
+        if log_tail and len(log_tail) > len(tail_text):
+            tail_text = log_tail
         combined = (tail_text or "") + (f"\n{exc}" if exc else "")
         rc_hint = self._parse_rc_from_text(combined) if combined else None
         if exc is None and rc_hint is not None and rc_hint != 0:
@@ -2166,14 +2422,10 @@ class StataClient:
             rc_hint = self._parse_rc_from_text(combined) if combined else None
             rc_final = rc_hint if (rc_hint is not None and rc_hint != 0) else (rc if rc not in (-1, None) else rc_hint)
             line_no = self._parse_line_from_text(combined) if combined else None
-            message = "Stata error"
-            if tail_text and tail_text.strip():
-                for line in reversed(tail_text.splitlines()):
-                    if line.strip():
-                        message = line.strip()
-                        break
-            elif exc is not None:
-                message = str(exc).strip() or message
+            fallback = (str(exc).strip() if exc is not None else "") or "Stata error"
+            if fallback == "Stata error" and rc_final is not None:
+                fallback = f"Stata error r({rc_final})"
+            message = self._select_stata_error_message(combined, fallback)
             error = ErrorEnvelope(
                 message=message,
@@ -2258,4 +2510,3 @@ class StataClient:
                 )
         return result