mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +20 -1
- mcpbr/config.py +37 -1
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_env.py +2 -1
- mcpbr/docker_prewarm.py +370 -0
- mcpbr/dry_run.py +533 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +2 -1
- mcpbr/graceful_degradation.py +277 -0
- mcpbr/harness.py +38 -4
- mcpbr/languages.py +228 -0
- mcpbr/logging_config.py +207 -0
- mcpbr/models.py +66 -0
- mcpbr/preflight.py +2 -1
- mcpbr/pricing.py +72 -0
- mcpbr/providers.py +316 -3
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sdk.py +264 -0
- mcpbr/smoke_test.py +2 -1
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/graceful_degradation.py
ADDED

@@ -0,0 +1,277 @@
+"""Graceful degradation for benchmark evaluation.
+
+Provides fault-tolerant execution of benchmark tasks with failure isolation,
+classification, checkpointing, and configurable error handling policies.
+"""
+
+import asyncio
+import json
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+
+class FailureType(Enum):
+    """Classification of task failure types."""
+
+    TRANSIENT = "transient"
+    PERMANENT = "permanent"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class TaskFailure:
+    """Record of a single task failure.
+
+    Attributes:
+        task_id: Identifier of the failed task.
+        error: Error message describing the failure.
+        failure_type: Classification of the failure.
+        timestamp: ISO 8601 timestamp of when the failure occurred.
+        retryable: Whether the task could be retried.
+    """
+
+    task_id: str
+    error: str
+    failure_type: FailureType
+    timestamp: str
+    retryable: bool = True
+
+
+@dataclass
+class ExecutionCheckpoint:
+    """Checkpoint of execution state for crash recovery and resumption.
+
+    Tracks which tasks have completed, failed, or been skipped during
+    an evaluation run. Can be serialized to/from JSON for persistence.
+
+    Attributes:
+        completed_tasks: List of task IDs that completed successfully.
+        failed_tasks: List of TaskFailure records for failed tasks.
+        skipped_tasks: List of task IDs that were skipped.
+    """
+
+    completed_tasks: list[str] = field(default_factory=list)
+    failed_tasks: list[TaskFailure] = field(default_factory=list)
+    skipped_tasks: list[str] = field(default_factory=list)
+
+    def save(self, path: Path) -> None:
+        """Save checkpoint to a JSON file.
+
+        Args:
+            path: File path to write the checkpoint to.
+        """
+        data = {
+            "completed": self.completed_tasks,
+            "failed": [
+                {
+                    "task_id": f.task_id,
+                    "error": f.error,
+                    "type": f.failure_type.value,
+                    "timestamp": f.timestamp,
+                    "retryable": f.retryable,
+                }
+                for f in self.failed_tasks
+            ],
+            "skipped": self.skipped_tasks,
+        }
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(json.dumps(data, indent=2))
+
+    @classmethod
+    def load(cls, path: Path) -> "ExecutionCheckpoint":
+        """Load checkpoint from a JSON file.
+
+        Args:
+            path: File path to read the checkpoint from.
+
+        Returns:
+            ExecutionCheckpoint populated from the file.
+        """
+        data = json.loads(path.read_text())
+        return cls(
+            completed_tasks=data["completed"],
+            failed_tasks=[
+                TaskFailure(
+                    task_id=f["task_id"],
+                    error=f["error"],
+                    failure_type=FailureType(f["type"]),
+                    timestamp=f["timestamp"],
+                    retryable=f.get("retryable", True),
+                )
+                for f in data["failed"]
+            ],
+            skipped_tasks=data["skipped"],
+        )
+
+
+# Exception types considered transient (may succeed on retry)
+_TRANSIENT_ERRORS = (
+    TimeoutError,
+    asyncio.TimeoutError,
+    ConnectionError,
+    ConnectionResetError,
+    ConnectionRefusedError,
+    ConnectionAbortedError,
+    OSError,
+    IOError,
+)
+
+# Exception types considered permanent (will not succeed on retry)
+_PERMANENT_ERRORS = (
+    ValueError,
+    TypeError,
+    KeyError,
+    IndexError,
+    AttributeError,
+    NotImplementedError,
+    SyntaxError,
+    ImportError,
+)
+
+
+def classify_failure(error: Exception) -> FailureType:
+    """Classify an error as transient, permanent, or unknown.
+
+    Transient errors are those that may succeed on retry (timeouts,
+    connection issues, resource exhaustion). Permanent errors are
+    programming or configuration errors that will not resolve on retry.
+
+    Args:
+        error: The exception to classify.
+
+    Returns:
+        FailureType indicating the classification.
+    """
+    if isinstance(error, _TRANSIENT_ERRORS):
+        return FailureType.TRANSIENT
+    if isinstance(error, _PERMANENT_ERRORS):
+        return FailureType.PERMANENT
+    return FailureType.UNKNOWN
+
+
+class GracefulExecutor:
+    """Executor that provides graceful degradation for benchmark tasks.
+
+    Isolates task failures so that one failing task does not prevent
+    other tasks from executing. Supports configurable error policies
+    including continue-on-error and max-failure thresholds.
+
+    Args:
+        continue_on_error: If True, continue executing tasks after failures.
+            If False, stop on the first failure.
+        max_failures: Maximum number of failures before stopping execution.
+            None means no limit (continue until all tasks are processed).
+        checkpoint_dir: Directory to save execution checkpoints for crash recovery.
+            None means no checkpointing.
+    """
+
+    def __init__(
+        self,
+        continue_on_error: bool = True,
+        max_failures: int | None = None,
+        checkpoint_dir: Path | None = None,
+    ) -> None:
+        """Initialize GracefulExecutor.
+
+        Args:
+            continue_on_error: Whether to continue after task failures.
+            max_failures: Maximum failures before halting. None for unlimited.
+            checkpoint_dir: Directory for saving checkpoint files.
+        """
+        self.continue_on_error = continue_on_error
+        self.max_failures = max_failures
+        self.checkpoint_dir = checkpoint_dir
+        self.checkpoint = ExecutionCheckpoint()
+
+    async def execute_task(self, task_id: str, coro: Any) -> Any | None:
+        """Execute a single task with failure isolation.
+
+        Wraps the coroutine execution in error handling that records
+        failures without propagating them (when continue_on_error is True).
+
+        Args:
+            task_id: Identifier for the task being executed.
+            coro: Awaitable coroutine to execute.
+
+        Returns:
+            The result of the coroutine, or None if the task failed.
+        """
+        try:
+            result = await coro
+            self.checkpoint.completed_tasks.append(task_id)
+            self._save_checkpoint()
+            return result
+        except Exception as e:
+            failure_type = classify_failure(e)
+            failure = TaskFailure(
+                task_id=task_id,
+                error=str(e),
+                failure_type=failure_type,
+                timestamp=datetime.now(timezone.utc).isoformat(),
+                retryable=failure_type == FailureType.TRANSIENT,
+            )
+            self.checkpoint.failed_tasks.append(failure)
+            self._save_checkpoint()
+            return None
+
+    def should_continue(self) -> bool:
+        """Determine whether execution should continue.
+
+        Considers the continue_on_error flag and the max_failures threshold.
+
+        Returns:
+            True if execution should continue, False if it should stop.
+        """
+        failure_count = len(self.checkpoint.failed_tasks)
+
+        # If any failure occurred and continue_on_error is False, stop
+        if not self.continue_on_error and failure_count > 0:
+            return False
+
+        # If max_failures is set and we've reached it, stop
+        if self.max_failures is not None and failure_count >= self.max_failures:
+            return False
+
+        return True
+
+    def get_partial_report(self) -> dict[str, Any]:
+        """Generate a report of execution progress including partial results.
+
+        Returns:
+            Dictionary with execution statistics and failure details.
+        """
+        completed_count = len(self.checkpoint.completed_tasks)
+        failed_count = len(self.checkpoint.failed_tasks)
+        skipped_count = len(self.checkpoint.skipped_tasks)
+        total_tasks = completed_count + failed_count + skipped_count
+
+        success_rate = completed_count / total_tasks if total_tasks > 0 else 0.0
+
+        failures = [
+            {
+                "task_id": f.task_id,
+                "error": f.error,
+                "failure_type": f.failure_type.value,
+                "timestamp": f.timestamp,
+                "retryable": f.retryable,
+            }
+            for f in self.checkpoint.failed_tasks
+        ]
+
+        return {
+            "total_tasks": total_tasks,
+            "completed_count": completed_count,
+            "failed_count": failed_count,
+            "skipped_count": skipped_count,
+            "success_rate": success_rate,
+            "failures": failures,
+        }
+
+    def _save_checkpoint(self) -> None:
+        """Save checkpoint to disk if checkpoint_dir is configured."""
+        if self.checkpoint_dir is not None:
+            checkpoint_path = self.checkpoint_dir / "checkpoint.json"
+            self.checkpoint.save(checkpoint_path)
mcpbr/harness.py
CHANGED
@@ -418,6 +418,7 @@ async def _run_mcp_evaluation(
 
     start_time = time.time()
     env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
     try:
         # Track Docker environment creation time
         docker_start = time.time()
@@ -480,10 +481,15 @@ async def _run_mcp_evaluation(
         return result
 
     except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -499,6 +505,11 @@ async def _run_mcp_evaluation(
     except Exception as e:
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -562,6 +573,7 @@ async def _run_baseline_evaluation(
 
     start_time = time.time()
     env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
    try:
         # Track Docker environment creation time
         docker_start = time.time()
@@ -622,10 +634,15 @@ async def _run_baseline_evaluation(
         return result
 
     except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -641,6 +658,11 @@ async def _run_baseline_evaluation(
     except Exception as e:
         end_time = time.time()
         runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
         cost = calculate_cost(config.model, 0, 0)
         return {
             "resolved": False,
@@ -1182,6 +1204,18 @@ async def run_evaluation(
             progress.stop()
     finally:
         await docker_manager.cleanup_all()
+        # Force-shutdown the default executor to prevent asyncio.run() from
+        # hanging during cleanup. Docker SDK background threads (urllib3
+        # connection pool) may linger after client.close(), causing
+        # executor.shutdown(wait=True) to block indefinitely.
+        try:
+            loop = asyncio.get_running_loop()
+            executor = getattr(loop, "_default_executor", None)
+            if executor is not None:
+                executor.shutdown(wait=False, cancel_futures=True)
+                loop._default_executor = None
+        except RuntimeError as exc:
+            console.print(f"[yellow]Default executor shutdown skipped: {exc}[/yellow]")
 
     # Check if we're in comparison mode
     if config.comparison_mode:
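
The executor-shutdown logic in the last hunk can be read on its own. A standalone sketch of the same pattern (illustrative; loop._default_executor is a private asyncio attribute, so this mirrors the diff's workaround rather than a public API):

import asyncio


def blocking_call() -> int:
    return 42


async def main() -> None:
    loop = asyncio.get_running_loop()
    # Any run_in_executor(None, ...) call lazily creates the default ThreadPoolExecutor.
    await loop.run_in_executor(None, blocking_call)
    executor = getattr(loop, "_default_executor", None)
    if executor is not None:
        # wait=False plus cancel_futures=True lets asyncio.run() exit even if a
        # worker thread is stuck (e.g. a lingering urllib3 connection pool).
        executor.shutdown(wait=False, cancel_futures=True)
        loop._default_executor = None


asyncio.run(main())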
mcpbr/languages.py
ADDED
@@ -0,0 +1,228 @@
+"""Multi-language support for code generation benchmarks.
+
+This module provides:
+- Language enum defining supported programming languages.
+- LanguageConfig dataclass with per-language Docker, run, compile, and test settings.
+- detect_language() to identify the language from a filename or code snippet.
+- get_language_config() to retrieve configuration for a given language.
+- get_supported_languages() to list all supported language names.
+- CrossLanguageMetrics for comparing benchmark performance across languages.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+
+class Language(Enum):
+    """Supported programming languages for code generation benchmarks."""
+
+    PYTHON = "python"
+    JAVASCRIPT = "javascript"
+    TYPESCRIPT = "typescript"
+    JAVA = "java"
+    GO = "go"
+
+
+@dataclass
+class LanguageConfig:
+    """Configuration for running and testing code in a specific language.
+
+    Attributes:
+        name: Human-readable language name.
+        language: The Language enum member.
+        file_extension: File extension including the dot (e.g., ".py").
+        docker_image: Docker image used to run code in this language.
+        run_command: Command template to run a file. Use {file} as placeholder.
+        test_framework: Name of the default test framework for this language.
+        compile_command: Optional command template to compile. None for interpreted languages.
+    """
+
+    name: str
+    language: Language
+    file_extension: str
+    docker_image: str
+    run_command: str
+    test_framework: str
+    compile_command: str | None = None
+
+
+LANGUAGE_CONFIGS: dict[Language, LanguageConfig] = {
+    Language.PYTHON: LanguageConfig(
+        name="Python",
+        language=Language.PYTHON,
+        file_extension=".py",
+        docker_image="python:3.11-slim",
+        run_command="python {file}",
+        test_framework="pytest",
+    ),
+    Language.JAVASCRIPT: LanguageConfig(
+        name="JavaScript",
+        language=Language.JAVASCRIPT,
+        file_extension=".js",
+        docker_image="node:20-slim",
+        run_command="node {file}",
+        test_framework="jest",
+    ),
+    Language.TYPESCRIPT: LanguageConfig(
+        name="TypeScript",
+        language=Language.TYPESCRIPT,
+        file_extension=".ts",
+        docker_image="node:20-slim",
+        run_command="npx ts-node {file}",
+        test_framework="jest",
+        compile_command="npx tsc {file}",
+    ),
+    Language.JAVA: LanguageConfig(
+        name="Java",
+        language=Language.JAVA,
+        file_extension=".java",
+        docker_image="eclipse-temurin:17-jdk-jammy",
+        run_command="java {file}",  # Requires Java 11+ single-file source execution
+        test_framework="junit",
+        compile_command="javac {file}",
+    ),
+    Language.GO: LanguageConfig(
+        name="Go",
+        language=Language.GO,
+        file_extension=".go",
+        docker_image="golang:1.21-alpine",
+        run_command="go run {file}",
+        test_framework="go test",
+        compile_command="go build {file}",
+    ),
+}
+
+# Map file extensions to languages for filename-based detection.
+_EXTENSION_MAP: dict[str, Language] = {
+    config.file_extension: lang for lang, config in LANGUAGE_CONFIGS.items()
+}
+
+# Ordered list of (pattern, language) tuples for code content detection.
+# More specific patterns come first to avoid false positives.
+_CODE_PATTERNS: list[tuple[re.Pattern[str], Language]] = [
+    # Go: package declaration is highly distinctive
+    (re.compile(r"^package\s+\w+", re.MULTILINE), Language.GO),
+    (re.compile(r"\bfunc\s+\w+\s*\("), Language.GO),
+    # Java: class declaration with access modifier
+    (re.compile(r"\bpublic\s+class\s+\w+"), Language.JAVA),
+    (re.compile(r"\bpublic\s+static\s+void\s+main"), Language.JAVA),
+    # TypeScript: type annotations on const/let/var, or interface keyword
+    (re.compile(r"\b(?:const|let|var)\s+\w+\s*:\s*\w+"), Language.TYPESCRIPT),
+    (re.compile(r"\binterface\s+\w+\s*\{"), Language.TYPESCRIPT),
+    # JavaScript: const/let/var without type annotations, require(), console.log
+    (re.compile(r"\brequire\s*\(\s*['\"]"), Language.JAVASCRIPT),
+    (re.compile(r"\bconsole\.log\s*\("), Language.JAVASCRIPT),
+    (re.compile(r"\b(?:const|let|var)\s+\w+\s*="), Language.JAVASCRIPT),
+    # Python: def/class with colon, import, print()
+    (re.compile(r"^def\s+\w+\s*\(.*\)\s*:", re.MULTILINE), Language.PYTHON),
+    (re.compile(r"^import\s+\w+", re.MULTILINE), Language.PYTHON),
+    (re.compile(r"\bprint\s*\("), Language.PYTHON),
+]
+
+
+def detect_language(code: str | None = None, filename: str | None = None) -> Language | None:
+    """Detect the programming language from a filename or code snippet.
+
+    Filename-based detection takes priority over code content analysis.
+
+    Args:
+        code: Source code string to analyze.
+        filename: Filename (with or without path) to check extension.
+
+    Returns:
+        The detected Language, or None if detection fails.
+    """
+    # Try filename-based detection first (higher confidence).
+    if filename:
+        _, ext = os.path.splitext(filename)
+        if ext in _EXTENSION_MAP:
+            return _EXTENSION_MAP[ext]
+
+    # Fall back to code content analysis.
+    if code:
+        for pattern, language in _CODE_PATTERNS:
+            if pattern.search(code):
+                return language
+
+    return None
+
+
+def get_language_config(language: Language) -> LanguageConfig:
+    """Get the configuration for a given language.
+
+    Args:
+        language: A Language enum member.
+
+    Returns:
+        The LanguageConfig for the specified language.
+    """
+    return LANGUAGE_CONFIGS[language]
+
+
+def get_supported_languages() -> list[str]:
+    """Return a list of all supported language name strings.
+
+    Returns:
+        List of language value strings (e.g., ["python", "javascript", ...]).
+    """
+    return [lang.value for lang in Language]
+
+
+@dataclass
+class CrossLanguageMetrics:
+    """Aggregated benchmark metrics across multiple programming languages.
+
+    Attributes:
+        language_scores: Mapping of language name to its pass rate (resolved ratio).
+        best_language: The language with the highest pass rate.
+        worst_language: The language with the lowest pass rate.
+        average_score: The mean pass rate across all languages.
+    """
+
+    language_scores: dict[str, float]
+    best_language: str
+    worst_language: str
+    average_score: float
+
+    @classmethod
+    def from_results(cls, results: dict[str, list[dict[str, Any]]]) -> CrossLanguageMetrics:
+        """Compute cross-language metrics from per-language result lists.
+
+        Each result dict is expected to have a ``"resolved"`` boolean key.
+        The pass rate for a language is the fraction of results where
+        ``resolved`` is ``True``.
+
+        Args:
+            results: Mapping of language name to list of result dicts.
+
+        Returns:
+            A CrossLanguageMetrics instance with computed scores.
+
+        Raises:
+            ValueError: If results is empty or any language has an empty result list.
+        """
+        if not results:
+            raise ValueError("results must not be empty")
+
+        language_scores: dict[str, float] = {}
+        for lang_name, lang_results in results.items():
+            if not lang_results:
+                raise ValueError(f"Result list for language '{lang_name}' must not be empty")
+            resolved_count = sum(1 for r in lang_results if r.get("resolved", False))
+            language_scores[lang_name] = resolved_count / len(lang_results)
+
+        best_language = max(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+        worst_language = min(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+        average_score = sum(language_scores.values()) / len(language_scores)
+
+        return cls(
+            language_scores=language_scores,
+            best_language=best_language,
+            worst_language=worst_language,
+            average_score=average_score,
+        )
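
A short example of the new language helpers (the inputs below are made up; result dicts only need the "resolved" key that from_results() reads):

from mcpbr.languages import CrossLanguageMetrics, detect_language, get_language_config

# The filename extension takes priority over code content, so this resolves to TypeScript.
lang = detect_language(code='console.log("hi")', filename="solution.ts")
assert lang is not None
cfg = get_language_config(lang)
print(cfg.docker_image, cfg.run_command)  # node:20-slim npx ts-node {file}

metrics = CrossLanguageMetrics.from_results(
    {
        "python": [{"resolved": True}, {"resolved": False}],
        "go": [{"resolved": True}, {"resolved": True}],
    }
)
print(metrics.best_language, round(metrics.average_score, 2))  # go 0.75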