PyPI - mcpbr - Versions diffs - 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

mcpbr/benchmarks/__init__.py +12 -0
mcpbr/benchmarks/adversarial.py +341 -0
mcpbr/benchmarks/custom.py +607 -0
mcpbr/benchmarks/longbench.py +623 -0
mcpbr/benchmarks/mmmu.py +353 -0
mcpbr/config.py +4 -0
mcpbr/config_migration.py +470 -0
mcpbr/config_wizard.py +647 -0
mcpbr/custom_metrics.py +405 -0
mcpbr/dashboard.py +619 -0
mcpbr/dataset_streaming.py +491 -0
mcpbr/dataset_versioning.py +222 -0
mcpbr/docker_cache.py +539 -0
mcpbr/docker_prewarm.py +369 -0
mcpbr/dry_run.py +532 -0
mcpbr/failure_analysis.py +558 -0
mcpbr/few_shot.py +367 -0
mcpbr/formatting.py +444 -0
mcpbr/gpu_support.py +157 -0
mcpbr/harness.py +38 -4
mcpbr/latency_metrics.py +317 -0
mcpbr/resource_limits.py +487 -0
mcpbr/result_streaming.py +519 -0
mcpbr/sampling.py +193 -0
mcpbr/task_batching.py +403 -0
mcpbr/task_scheduler.py +468 -0
{mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
{mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
{mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
{mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
{mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
{mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0

mcpbr/formatting.py ADDED Viewed

@@ -0,0 +1,444 @@
+"""Color and formatting options for CLI output.
+Provides configurable themes and formatting utilities for consistent CLI output
+across the mcpbr tool. Supports the NO_COLOR convention (https://no-color.org/)
+and configurable themes via the MCPBR_THEME environment variable or CLI flags.
+"""
+from __future__ import annotations
+import os
+import sys
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
+from rich.table import Table
+from rich.text import Text
+class Theme(Enum):
+    """Available output themes.
+    Attributes:
+        DEFAULT: Rich colors with bold styles for maximum readability.
+        MINIMAL: Subdued colors for less visual noise.
+        PLAIN: No formatting or color at all.
+    """
+    DEFAULT = "default"
+    MINIMAL = "minimal"
+    PLAIN = "plain"
+@dataclass(frozen=True)
+class ThemeConfig:
+    """Style configuration for a theme.
+    Each field is a Rich markup style string used to format the corresponding
+    message category (e.g., ``"bold green"`` for success messages).
+    Attributes:
+        success_style: Style for success messages.
+        error_style: Style for error messages.
+        warning_style: Style for warning messages.
+        info_style: Style for informational messages.
+        header_style: Style for section headers.
+        dim_style: Style for secondary/dimmed text.
+        highlight_style: Style for highlighted/emphasized text.
+    """
+    success_style: str = "bold green"
+    error_style: str = "bold red"
+    warning_style: str = "bold yellow"
+    info_style: str = "bold blue"
+    header_style: str = "bold magenta"
+    dim_style: str = "dim"
+    highlight_style: str = "bold cyan"
+THEME_CONFIGS: dict[Theme, ThemeConfig] = {
+    Theme.DEFAULT: ThemeConfig(
+        success_style="bold green",
+        error_style="bold red",
+        warning_style="bold yellow",
+        info_style="bold blue",
+        header_style="bold magenta",
+        dim_style="dim",
+        highlight_style="bold cyan",
+    ),
+    Theme.MINIMAL: ThemeConfig(
+        success_style="green",
+        error_style="red",
+        warning_style="yellow",
+        info_style="blue",
+        header_style="magenta",
+        dim_style="dim",
+        highlight_style="cyan",
+    ),
+    Theme.PLAIN: ThemeConfig(
+        success_style="",
+        error_style="",
+        warning_style="",
+        info_style="",
+        header_style="",
+        dim_style="",
+        highlight_style="",
+    ),
+}
+def _resolve_theme(theme_name: str | None = None) -> Theme:
+    """Resolve a theme name string to a Theme enum value.
+    Checks the provided name first, then the MCPBR_THEME environment variable,
+    and falls back to ``Theme.DEFAULT``.
+    Args:
+        theme_name: Optional theme name (case-insensitive). One of
+            ``"default"``, ``"minimal"``, or ``"plain"``.
+    Returns:
+        The resolved Theme enum value.
+    Raises:
+        ValueError: If the theme name is not recognized.
+    """
+    name = theme_name or os.environ.get("MCPBR_THEME")
+    if name is None:
+        return Theme.DEFAULT
+    try:
+        return Theme(name.strip().lower())
+    except ValueError:
+        valid = ", ".join(t.value for t in Theme)
+        raise ValueError(f"Unknown theme '{name}'. Valid themes: {valid}") from None
+def detect_color_support(force_color: bool | None = None) -> bool:
+    """Determine whether the current environment supports color output.
+    Resolution order:
+        1. ``force_color`` parameter (explicit override).
+        2. ``NO_COLOR`` environment variable -- if set (any value), colors are
+           disabled per https://no-color.org/.
+        3. ``MCPBR_THEME`` environment variable -- if set to ``"plain"``, colors
+           are disabled.
+        4. Terminal detection -- colors are enabled when stdout is a TTY.
+    Args:
+        force_color: Explicit override. ``True`` forces colors on, ``False``
+            forces them off, ``None`` uses auto-detection.
+    Returns:
+        ``True`` if color output should be used, ``False`` otherwise.
+    """
+    if force_color is not None:
+        return force_color
+    # NO_COLOR convention: any value (including empty string) disables color
+    if "NO_COLOR" in os.environ:
+        return False
+    # MCPBR_THEME=plain disables color
+    theme_env = os.environ.get("MCPBR_THEME", "").strip().lower()
+    if theme_env == "plain":
+        return False
+    # Auto-detect: color only when stdout is a TTY
+    return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+class OutputFormatter:
+    """Formatted output for CLI messages.
+    Provides methods to print and format success, error, warning, info, and
+    header messages using Rich markup styles. Also supports table and progress
+    bar rendering.
+    Args:
+        theme: The theme to use for formatting. Defaults to ``Theme.DEFAULT``.
+        force_color: Explicit color override. ``True`` forces colors on,
+            ``False`` forces them off, ``None`` uses auto-detection.
+        console: Optional Rich Console instance. If not provided, one is
+            created based on color support settings.
+    """
+    def __init__(
+        self,
+        theme: Theme = Theme.DEFAULT,
+        force_color: bool | None = None,
+        console: Console | None = None,
+    ) -> None:
+        self._theme = theme
+        self._config = THEME_CONFIGS[theme]
+        self._color_enabled = detect_color_support(force_color)
+        if console is not None:
+            self._console = console
+        else:
+            # When color is disabled, use no_color=True so Rich strips markup
+            self._console = Console(no_color=not self._color_enabled)
+    @property
+    def theme(self) -> Theme:
+        """The active theme."""
+        return self._theme
+    @property
+    def config(self) -> ThemeConfig:
+        """The active theme configuration."""
+        return self._config
+    @property
+    def color_enabled(self) -> bool:
+        """Whether color output is enabled."""
+        return self._color_enabled
+    @property
+    def console(self) -> Console:
+        """The underlying Rich console."""
+        return self._console
+    # ------------------------------------------------------------------
+    # Print methods (write directly to console)
+    # ------------------------------------------------------------------
+    def success(self, message: str) -> None:
+        """Print a success message.
+        Args:
+            message: The message text.
+        """
+        self._print_styled(message, self._config.success_style, prefix="[ok]")
+    def error(self, message: str) -> None:
+        """Print an error message.
+        Args:
+            message: The message text.
+        """
+        self._print_styled(message, self._config.error_style, prefix="[error]")
+    def warning(self, message: str) -> None:
+        """Print a warning message.
+        Args:
+            message: The message text.
+        """
+        self._print_styled(message, self._config.warning_style, prefix="[warn]")
+    def info(self, message: str) -> None:
+        """Print an informational message.
+        Args:
+            message: The message text.
+        """
+        self._print_styled(message, self._config.info_style, prefix="[info]")
+    def header(self, message: str) -> None:
+        """Print a section header.
+        Args:
+            message: The header text.
+        """
+        self._print_styled(message, self._config.header_style)
+    # ------------------------------------------------------------------
+    # Format methods (return styled strings without printing)
+    # ------------------------------------------------------------------
+    def format_success(self, message: str) -> str:
+        """Return a Rich-markup formatted success string.
+        Args:
+            message: The message text.
+        Returns:
+            Formatted string with Rich markup tags, or plain text when
+            colors are disabled.
+        """
+        return self._format_styled(message, self._config.success_style, prefix="[ok]")
+    def format_error(self, message: str) -> str:
+        """Return a Rich-markup formatted error string.
+        Args:
+            message: The message text.
+        Returns:
+            Formatted string with Rich markup tags, or plain text when
+            colors are disabled.
+        """
+        return self._format_styled(message, self._config.error_style, prefix="[error]")
+    def format_warning(self, message: str) -> str:
+        """Return a Rich-markup formatted warning string.
+        Args:
+            message: The message text.
+        Returns:
+            Formatted string with Rich markup tags, or plain text when
+            colors are disabled.
+        """
+        return self._format_styled(message, self._config.warning_style, prefix="[warn]")
+    def format_info(self, message: str) -> str:
+        """Return a Rich-markup formatted info string.
+        Args:
+            message: The message text.
+        Returns:
+            Formatted string with Rich markup tags, or plain text when
+            colors are disabled.
+        """
+        return self._format_styled(message, self._config.info_style, prefix="[info]")
+    def format_header(self, message: str) -> str:
+        """Return a Rich-markup formatted header string.
+        Args:
+            message: The message text.
+        Returns:
+            Formatted string with Rich markup tags, or plain text when
+            colors are disabled.
+        """
+        return self._format_styled(message, self._config.header_style)
+    # ------------------------------------------------------------------
+    # Table rendering
+    # ------------------------------------------------------------------
+    def table(
+        self,
+        title: str,
+        columns: list[str],
+        rows: list[list[Any]],
+    ) -> None:
+        """Print a formatted Rich table.
+        Args:
+            title: Table title displayed above the table.
+            columns: List of column header names.
+            rows: List of rows, where each row is a list of cell values.
+                Values are converted to strings automatically.
+        """
+        tbl = Table(title=title, show_header=True, header_style=self._config.header_style)
+        for col in columns:
+            tbl.add_column(col)
+        for row in rows:
+            tbl.add_row(*(str(cell) for cell in row))
+        self._console.print(tbl)
+    # ------------------------------------------------------------------
+    # Progress bar
+    # ------------------------------------------------------------------
+    def progress_bar(self) -> Progress:
+        """Return a configured Rich Progress instance.
+        Returns:
+            A ``rich.progress.Progress`` object with spinner, description,
+            bar, completion count, elapsed time, and remaining time columns.
+        """
+        return Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+            TimeRemainingColumn(),
+            console=self._console,
+        )
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _print_styled(self, message: str, style: str, prefix: str = "") -> None:
+        """Print a message with a Rich style and optional prefix.
+        Uses ``rich.text.Text`` objects throughout to prevent Rich from
+        interpreting bracket-style prefixes (e.g. ``[ok]``) as markup tags.
+        Args:
+            message: The message text.
+            style: Rich style string (e.g., ``"bold green"``).
+            prefix: Optional prefix tag like ``"[ok]"`` or ``"[error]"``.
+        """
+        text = Text()
+        if not self._color_enabled or not style:
+            if prefix:
+                text.append(f"{prefix} ")
+            text.append(message)
+        else:
+            if prefix:
+                text.append(f"{prefix} ", style=style)
+            text.append(message, style=style)
+        self._console.print(text)
+    def _format_styled(self, message: str, style: str, prefix: str = "") -> str:
+        """Return a message formatted with Rich markup.
+        When colors are disabled or the style is empty, returns plain text.
+        Args:
+            message: The message text.
+            style: Rich style string.
+            prefix: Optional prefix tag.
+        Returns:
+            Formatted string.
+        """
+        if not self._color_enabled or not style:
+            return f"{prefix} {message}" if prefix else message
+        if prefix:
+            return f"[{style}]{prefix} {message}[/{style}]"
+        return f"[{style}]{message}[/{style}]"
+def get_formatter(
+    theme: str | None = None,
+    no_color: bool = False,
+    console: Console | None = None,
+) -> OutputFormatter:
+    """Factory function to create a configured OutputFormatter.
+    This is the primary entry point for obtaining a formatter instance.
+    It resolves the theme from the provided argument, the ``MCPBR_THEME``
+    environment variable, or the default theme. It also respects the
+    ``NO_COLOR`` environment variable and the ``no_color`` parameter.
+    Args:
+        theme: Theme name (``"default"``, ``"minimal"``, or ``"plain"``).
+            Falls back to the ``MCPBR_THEME`` environment variable, then
+            ``"default"``.
+        no_color: If ``True``, forces color off regardless of other settings.
+        console: Optional Rich Console instance to use.
+    Returns:
+        A configured ``OutputFormatter`` instance.
+    Raises:
+        ValueError: If the theme name is not recognized.
+    """
+    resolved_theme = _resolve_theme(theme)
+    force_color: bool | None = None
+    if no_color:
+        force_color = False
+    return OutputFormatter(theme=resolved_theme, force_color=force_color, console=console)

mcpbr/gpu_support.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""GPU support for Docker containers used in ML benchmark evaluations.
+Provides detection of available GPUs (NVIDIA), Docker GPU runtime checks,
+and Docker container configuration for GPU access.
+"""
+import logging
+import subprocess
+import docker
+import docker.types
+logger = logging.getLogger(__name__)
+def detect_gpus() -> dict:
+    """Detect available GPUs on the host system.
+    Checks for NVIDIA GPUs via nvidia-smi and verifies the Docker GPU runtime
+    is available.
+    Returns:
+        Dictionary with GPU detection results:
+            - nvidia_available (bool): Whether NVIDIA GPUs were detected.
+            - gpu_count (int): Number of GPUs found.
+            - gpu_names (list[str]): Names of detected GPUs.
+            - driver_version (str): NVIDIA driver version, or empty string.
+            - docker_runtime_available (bool): Whether Docker NVIDIA runtime is available.
+    """
+    info: dict = {
+        "nvidia_available": False,
+        "gpu_count": 0,
+        "gpu_names": [],
+        "driver_version": "",
+        "docker_runtime_available": False,
+    }
+    # Detect NVIDIA GPUs via nvidia-smi
+    try:
+        result = subprocess.run(
+            [
+                "nvidia-smi",
+                "--query-gpu=name,driver_version",
+                "--format=csv,noheader,nounits",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            lines = result.stdout.strip().splitlines()
+            gpu_names = []
+            driver_version = ""
+            for line in lines:
+                parts = [p.strip() for p in line.split(",")]
+                if len(parts) >= 2:
+                    gpu_names.append(parts[0])
+                    driver_version = parts[1]
+                elif len(parts) == 1:
+                    gpu_names.append(parts[0])
+            info["nvidia_available"] = True
+            info["gpu_count"] = len(gpu_names)
+            info["gpu_names"] = gpu_names
+            info["driver_version"] = driver_version
+    except FileNotFoundError:
+        logger.debug("nvidia-smi not found; no NVIDIA GPUs detected.")
+    except subprocess.TimeoutExpired:
+        logger.warning("nvidia-smi timed out while detecting GPUs.")
+    except Exception as e:
+        logger.debug(f"GPU detection failed: {e}")
+    # Check Docker GPU runtime availability
+    info["docker_runtime_available"] = check_gpu_runtime()
+    return info
+def get_docker_gpu_config(gpu_enabled: bool) -> dict:
+    """Return Docker container creation kwargs for GPU access.
+    When gpu_enabled is True, returns a dictionary containing a DeviceRequest
+    that grants access to all available NVIDIA GPUs. This dict can be merged
+    into the kwargs passed to ``docker.containers.run()`` or
+    ``docker.containers.create()``.
+    Args:
+        gpu_enabled: Whether to enable GPU access in the container.
+    Returns:
+        Dictionary of Docker container kwargs. Empty dict if gpu_enabled is False.
+        When True, contains ``device_requests`` with a DeviceRequest for all GPUs.
+    """
+    if not gpu_enabled:
+        return {}
+    return {
+        "device_requests": [
+            docker.types.DeviceRequest(
+                count=-1,
+                capabilities=[["gpu"]],
+            )
+        ],
+    }
+def check_gpu_runtime() -> bool:
+    """Check if Docker has the NVIDIA runtime available.
+    Queries the Docker daemon info for registered runtimes and checks
+    whether the ``nvidia`` runtime is among them.
+    Returns:
+        True if the NVIDIA Docker runtime is available, False otherwise.
+    """
+    try:
+        client = docker.from_env()
+        docker_info = client.info()
+        runtimes = docker_info.get("Runtimes", {})
+        return "nvidia" in runtimes
+    except docker.errors.DockerException as e:
+        logger.debug(f"Could not query Docker for GPU runtime: {e}")
+        return False
+    except Exception as e:
+        logger.debug(f"Unexpected error checking Docker GPU runtime: {e}")
+        return False
+def format_gpu_info(info: dict) -> str:
+    """Format GPU detection info as a human-readable string.
+    Args:
+        info: Dictionary returned by ``detect_gpus()``.
+    Returns:
+        Human-readable multi-line string describing the GPU environment.
+    """
+    lines: list[str] = []
+    if not info.get("nvidia_available"):
+        lines.append("No NVIDIA GPUs detected.")
+    else:
+        gpu_count = info.get("gpu_count", 0)
+        lines.append(f"NVIDIA GPUs detected: {gpu_count}")
+        gpu_names = info.get("gpu_names", [])
+        for i, name in enumerate(gpu_names):
+            lines.append(f"  GPU {i}: {name}")
+        driver_version = info.get("driver_version", "")
+        if driver_version:
+            lines.append(f"Driver version: {driver_version}")
+    runtime_available = info.get("docker_runtime_available", False)
+    lines.append(f"Docker NVIDIA runtime: {'available' if runtime_available else 'not available'}")
+    return "\n".join(lines)

mcpbr/harness.py CHANGED Viewed

@@ -418,6 +418,7 @@ async def _run_mcp_evaluation(
     start_time = time.time()
     env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
     try:
         # Track Docker environment creation time
         docker_start = time.time()
@@ -480,10 +481,15 @@ async def _run_mcp_evaluation(
         return result
     except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -499,6 +505,11 @@ async def _run_mcp_evaluation(
     except Exception as e:
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -562,6 +573,7 @@ async def _run_baseline_evaluation(
     start_time = time.time()
     env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
     try:
         # Track Docker environment creation time
         docker_start = time.time()
@@ -622,10 +634,15 @@ async def _run_baseline_evaluation(
         return result
     except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -641,6 +658,11 @@ async def _run_baseline_evaluation(
     except Exception as e:
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -1182,6 +1204,18 @@ async def run_evaluation(
                 progress.stop()
     finally:
         await docker_manager.cleanup_all()
+        # Force-shutdown the default executor to prevent asyncio.run() from
+        # hanging during cleanup. Docker SDK background threads (urllib3
+        # connection pool) may linger after client.close(), causing
+        # executor.shutdown(wait=True) to block indefinitely.
+        try:
+            loop = asyncio.get_running_loop()
+            executor = getattr(loop, "_default_executor", None)
+            if executor is not None:
+                executor.shutdown(wait=False, cancel_futures=True)
+                loop._default_executor = None
+        except RuntimeError as exc:
+            console.print(f"[yellow]Default executor shutdown skipped: {exc}[/yellow]")
     # Check if we're in comparison mode
     if config.comparison_mode:

mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl