mcpbr 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/__init__.py CHANGED
@@ -3,4 +3,23 @@
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
  """

- __version__ = "0.3.23"
+ __version__ = "0.6.0"
+
+ from .sdk import (
+     BenchmarkResult,
+     MCPBenchmark,
+     get_version,
+     list_benchmarks,
+     list_models,
+     list_providers,
+ )
+
+ __all__ = [
+     "__version__",
+     "BenchmarkResult",
+     "MCPBenchmark",
+     "get_version",
+     "list_benchmarks",
+     "list_models",
+     "list_providers",
+ ]
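
For orientation, a minimal sketch of how the re-exported surface might be exercised from user code. Only the names re-exported above are confirmed by this diff; the zero-argument calls to the list_* helpers are assumptions about their signatures, not documented API.

# Sketch only: __version__ and __all__ come from the diff above; the helper
# calls assume the list_* functions take no arguments (not shown in the diff).
import mcpbr

print(mcpbr.__version__)   # "0.6.0"
print(mcpbr.__all__)       # names re-exported from mcpbr.sdk

providers = mcpbr.list_providers()    # assumed signature
benchmarks = mcpbr.list_benchmarks()  # assumed signature
models = mcpbr.list_models()          # assumed signature
print(providers, benchmarks, models)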
mcpbr/config.py CHANGED
@@ -12,7 +12,7 @@ from .config_inheritance import load_config_with_inheritance
  from .env_expansion import expand_env_vars, load_dotenv_file, validate_config_security
  from .models import DEFAULT_MODEL

- VALID_PROVIDERS = ("anthropic",)
+ VALID_PROVIDERS = ("anthropic", "openai", "gemini", "qwen")
  VALID_HARNESSES = ("claude-code",)
  VALID_BENCHMARKS = (
      "swe-bench-lite",
@@ -431,6 +431,42 @@ class HarnessConfig(BaseModel):
          description="Infrastructure configuration (local or azure)",
      )

+     continue_on_error: bool = Field(
+         default=True,
+         description="Continue evaluation when individual tasks fail instead of stopping",
+     )
+
+     max_failures: int | None = Field(
+         default=None,
+         description="Maximum number of task failures before halting evaluation (None for unlimited)",
+     )
+
+     checkpoint_interval: int = Field(
+         default=1,
+         description="Save execution checkpoint every N completed tasks",
+     )
+
+     resume_from_checkpoint: Path | None = Field(
+         default=None,
+         description="Path to a checkpoint file to resume evaluation from",
+     )
+
+     @field_validator("checkpoint_interval")
+     @classmethod
+     def validate_checkpoint_interval(cls, v: int) -> int:
+         """Validate checkpoint_interval is at least 1."""
+         if v < 1:
+             raise ValueError("checkpoint_interval must be at least 1")
+         return v
+
+     @field_validator("max_failures")
+     @classmethod
+     def validate_max_failures(cls, v: int | None) -> int | None:
+         """Validate max_failures is positive if set."""
+         if v is not None and v < 1:
+             raise ValueError("max_failures must be at least 1")
+         return v
+
      @field_validator("provider")
      @classmethod
      def validate_provider(cls, v: str) -> str:
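
To illustrate the validation rules attached to the new fields, here is a standalone sketch that mirrors only these four fields in a throwaway Pydantic model. It is not the real HarnessConfig, which has additional required fields not shown in this hunk.

# Standalone sketch: copies only the four new fields and their validators to
# show which values are accepted or rejected. Model name is illustrative.
from pathlib import Path

from pydantic import BaseModel, Field, ValidationError, field_validator


class ErrorHandlingSettings(BaseModel):
    continue_on_error: bool = Field(default=True)
    max_failures: int | None = Field(default=None)
    checkpoint_interval: int = Field(default=1)
    resume_from_checkpoint: Path | None = Field(default=None)

    @field_validator("checkpoint_interval")
    @classmethod
    def validate_checkpoint_interval(cls, v: int) -> int:
        if v < 1:
            raise ValueError("checkpoint_interval must be at least 1")
        return v

    @field_validator("max_failures")
    @classmethod
    def validate_max_failures(cls, v: int | None) -> int | None:
        if v is not None and v < 1:
            raise ValueError("max_failures must be at least 1")
        return v


ErrorHandlingSettings(max_failures=3, checkpoint_interval=10)  # accepted

try:
    ErrorHandlingSettings(checkpoint_interval=0)
except ValidationError as exc:
    print(exc)  # rejected: checkpoint_interval must be at least 1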
mcpbr/docker_env.py CHANGED
@@ -13,11 +13,12 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any

- import docker
  from docker.models.containers import Container
  from docker.models.networks import Network
  from docker.models.volumes import Volume

+ import docker
+
  MCPBR_LABEL = "mcpbr"
  MCPBR_INSTANCE_LABEL = "mcpbr.instance"
  MCPBR_SESSION_LABEL = "mcpbr.session"
mcpbr/docker_prewarm.py CHANGED
@@ -11,12 +11,13 @@ import time
  from dataclasses import dataclass, field
  from typing import Any, Callable

- import docker
  import docker.errors
  from rich.console import Console
  from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
  from rich.table import Table

+ import docker
+
  from .docker_env import SWEBENCH_IMAGE_REGISTRY, get_swebench_image_name

  logger = logging.getLogger(__name__)
mcpbr/dry_run.py CHANGED
@@ -13,11 +13,12 @@ import os
  import shutil
  from dataclasses import dataclass, field

- import docker
  from rich.console import Console
  from rich.panel import Panel
  from rich.table import Table

+ import docker
+
  from .benchmarks import create_benchmark
  from .config import HarnessConfig
  from .config_validator import ConfigValidator, ValidationResult
mcpbr/gpu_support.py CHANGED
@@ -7,9 +7,10 @@ and Docker container configuration for GPU access.
  import logging
  import subprocess

- import docker
  import docker.types

+ import docker
+
  logger = logging.getLogger(__name__)


@@ -0,0 +1,277 @@
+ """Graceful degradation for benchmark evaluation.
+
+ Provides fault-tolerant execution of benchmark tasks with failure isolation,
+ classification, checkpointing, and configurable error handling policies.
+ """
+
+ import asyncio
+ import json
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+
+
+ class FailureType(Enum):
+     """Classification of task failure types."""
+
+     TRANSIENT = "transient"
+     PERMANENT = "permanent"
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class TaskFailure:
+     """Record of a single task failure.
+
+     Attributes:
+         task_id: Identifier of the failed task.
+         error: Error message describing the failure.
+         failure_type: Classification of the failure.
+         timestamp: ISO 8601 timestamp of when the failure occurred.
+         retryable: Whether the task could be retried.
+     """
+
+     task_id: str
+     error: str
+     failure_type: FailureType
+     timestamp: str
+     retryable: bool = True
+
+
+ @dataclass
+ class ExecutionCheckpoint:
+     """Checkpoint of execution state for crash recovery and resumption.
+
+     Tracks which tasks have completed, failed, or been skipped during
+     an evaluation run. Can be serialized to/from JSON for persistence.
+
+     Attributes:
+         completed_tasks: List of task IDs that completed successfully.
+         failed_tasks: List of TaskFailure records for failed tasks.
+         skipped_tasks: List of task IDs that were skipped.
+     """
+
+     completed_tasks: list[str] = field(default_factory=list)
+     failed_tasks: list[TaskFailure] = field(default_factory=list)
+     skipped_tasks: list[str] = field(default_factory=list)
+
+     def save(self, path: Path) -> None:
+         """Save checkpoint to a JSON file.
+
+         Args:
+             path: File path to write the checkpoint to.
+         """
+         data = {
+             "completed": self.completed_tasks,
+             "failed": [
+                 {
+                     "task_id": f.task_id,
+                     "error": f.error,
+                     "type": f.failure_type.value,
+                     "timestamp": f.timestamp,
+                     "retryable": f.retryable,
+                 }
+                 for f in self.failed_tasks
+             ],
+             "skipped": self.skipped_tasks,
+         }
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(json.dumps(data, indent=2))
+
+     @classmethod
+     def load(cls, path: Path) -> "ExecutionCheckpoint":
+         """Load checkpoint from a JSON file.
+
+         Args:
+             path: File path to read the checkpoint from.
+
+         Returns:
+             ExecutionCheckpoint populated from the file.
+         """
+         data = json.loads(path.read_text())
+         return cls(
+             completed_tasks=data["completed"],
+             failed_tasks=[
+                 TaskFailure(
+                     task_id=f["task_id"],
+                     error=f["error"],
+                     failure_type=FailureType(f["type"]),
+                     timestamp=f["timestamp"],
+                     retryable=f.get("retryable", True),
+                 )
+                 for f in data["failed"]
+             ],
+             skipped_tasks=data["skipped"],
+         )
+
+
+ # Exception types considered transient (may succeed on retry)
+ _TRANSIENT_ERRORS = (
+     TimeoutError,
+     asyncio.TimeoutError,
+     ConnectionError,
+     ConnectionResetError,
+     ConnectionRefusedError,
+     ConnectionAbortedError,
+     OSError,
+     IOError,
+ )
+
+ # Exception types considered permanent (will not succeed on retry)
+ _PERMANENT_ERRORS = (
+     ValueError,
+     TypeError,
+     KeyError,
+     IndexError,
+     AttributeError,
+     NotImplementedError,
+     SyntaxError,
+     ImportError,
+ )
+
+
+ def classify_failure(error: Exception) -> FailureType:
+     """Classify an error as transient, permanent, or unknown.
+
+     Transient errors are those that may succeed on retry (timeouts,
+     connection issues, resource exhaustion). Permanent errors are
+     programming or configuration errors that will not resolve on retry.
+
+     Args:
+         error: The exception to classify.
+
+     Returns:
+         FailureType indicating the classification.
+     """
+     if isinstance(error, _TRANSIENT_ERRORS):
+         return FailureType.TRANSIENT
+     if isinstance(error, _PERMANENT_ERRORS):
+         return FailureType.PERMANENT
+     return FailureType.UNKNOWN
+
+
+ class GracefulExecutor:
+     """Executor that provides graceful degradation for benchmark tasks.
+
+     Isolates task failures so that one failing task does not prevent
+     other tasks from executing. Supports configurable error policies
+     including continue-on-error and max-failure thresholds.
+
+     Args:
+         continue_on_error: If True, continue executing tasks after failures.
+             If False, stop on the first failure.
+         max_failures: Maximum number of failures before stopping execution.
+             None means no limit (continue until all tasks are processed).
+         checkpoint_dir: Directory to save execution checkpoints for crash recovery.
+             None means no checkpointing.
+     """
+
+     def __init__(
+         self,
+         continue_on_error: bool = True,
+         max_failures: int | None = None,
+         checkpoint_dir: Path | None = None,
+     ) -> None:
+         """Initialize GracefulExecutor.
+
+         Args:
+             continue_on_error: Whether to continue after task failures.
+             max_failures: Maximum failures before halting. None for unlimited.
+             checkpoint_dir: Directory for saving checkpoint files.
+         """
+         self.continue_on_error = continue_on_error
+         self.max_failures = max_failures
+         self.checkpoint_dir = checkpoint_dir
+         self.checkpoint = ExecutionCheckpoint()
+
+     async def execute_task(self, task_id: str, coro: Any) -> Any | None:
+         """Execute a single task with failure isolation.
+
+         Wraps the coroutine execution in error handling that records
+         failures without propagating them (when continue_on_error is True).
+
+         Args:
+             task_id: Identifier for the task being executed.
+             coro: Awaitable coroutine to execute.
+
+         Returns:
+             The result of the coroutine, or None if the task failed.
+         """
+         try:
+             result = await coro
+             self.checkpoint.completed_tasks.append(task_id)
+             self._save_checkpoint()
+             return result
+         except Exception as e:
+             failure_type = classify_failure(e)
+             failure = TaskFailure(
+                 task_id=task_id,
+                 error=str(e),
+                 failure_type=failure_type,
+                 timestamp=datetime.now(timezone.utc).isoformat(),
+                 retryable=failure_type == FailureType.TRANSIENT,
+             )
+             self.checkpoint.failed_tasks.append(failure)
+             self._save_checkpoint()
+             return None
+
+     def should_continue(self) -> bool:
+         """Determine whether execution should continue.
+
+         Considers the continue_on_error flag and the max_failures threshold.
+
+         Returns:
+             True if execution should continue, False if it should stop.
+         """
+         failure_count = len(self.checkpoint.failed_tasks)
+
+         # If any failure occurred and continue_on_error is False, stop
+         if not self.continue_on_error and failure_count > 0:
+             return False
+
+         # If max_failures is set and we've reached it, stop
+         if self.max_failures is not None and failure_count >= self.max_failures:
+             return False
+
+         return True
+
+     def get_partial_report(self) -> dict[str, Any]:
+         """Generate a report of execution progress including partial results.
+
+         Returns:
+             Dictionary with execution statistics and failure details.
+         """
+         completed_count = len(self.checkpoint.completed_tasks)
+         failed_count = len(self.checkpoint.failed_tasks)
+         skipped_count = len(self.checkpoint.skipped_tasks)
+         total_tasks = completed_count + failed_count + skipped_count
+
+         success_rate = completed_count / total_tasks if total_tasks > 0 else 0.0
+
+         failures = [
+             {
+                 "task_id": f.task_id,
+                 "error": f.error,
+                 "failure_type": f.failure_type.value,
+                 "timestamp": f.timestamp,
+                 "retryable": f.retryable,
+             }
+             for f in self.checkpoint.failed_tasks
+         ]
+
+         return {
+             "total_tasks": total_tasks,
+             "completed_count": completed_count,
+             "failed_count": failed_count,
+             "skipped_count": skipped_count,
+             "success_rate": success_rate,
+             "failures": failures,
+         }
+
+     def _save_checkpoint(self) -> None:
+         """Save checkpoint to disk if checkpoint_dir is configured."""
+         if self.checkpoint_dir is not None:
+             checkpoint_path = self.checkpoint_dir / "checkpoint.json"
+             self.checkpoint.save(checkpoint_path)
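
A sketch of how an evaluation loop might drive the executor added above. The class and method names follow the hunk; the import path is hypothetical because the new file's name does not appear in this diff, and fake_task is a stand-in for a real benchmark task.

# Sketch: assumes GracefulExecutor from the new module above is in scope.
import asyncio
from pathlib import Path

# from mcpbr.graceful_degradation import GracefulExecutor  # hypothetical path, not shown in this diff


async def fake_task(task_id: str) -> str:
    # Stand-in for a real benchmark task; one task fails to show isolation.
    if task_id == "task-2":
        raise TimeoutError("container did not respond")
    return f"{task_id}: resolved"


async def main() -> None:
    executor = GracefulExecutor(
        continue_on_error=True,
        max_failures=5,
        checkpoint_dir=Path("./checkpoints"),
    )
    for task_id in ["task-1", "task-2", "task-3"]:
        if not executor.should_continue():
            break
        await executor.execute_task(task_id, fake_task(task_id))

    report = executor.get_partial_report()
    print(report["completed_count"], report["failed_count"], report["success_rate"])


asyncio.run(main())

The TimeoutError raised by task-2 is classified as transient and recorded in the checkpoint rather than aborting the run; the other tasks still execute.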
mcpbr/languages.py ADDED
@@ -0,0 +1,228 @@
+ """Multi-language support for code generation benchmarks.
+
+ This module provides:
+ - Language enum defining supported programming languages.
+ - LanguageConfig dataclass with per-language Docker, run, compile, and test settings.
+ - detect_language() to identify the language from a filename or code snippet.
+ - get_language_config() to retrieve configuration for a given language.
+ - get_supported_languages() to list all supported language names.
+ - CrossLanguageMetrics for comparing benchmark performance across languages.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import re
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Any
+
+
+ class Language(Enum):
+     """Supported programming languages for code generation benchmarks."""
+
+     PYTHON = "python"
+     JAVASCRIPT = "javascript"
+     TYPESCRIPT = "typescript"
+     JAVA = "java"
+     GO = "go"
+
+
+ @dataclass
+ class LanguageConfig:
+     """Configuration for running and testing code in a specific language.
+
+     Attributes:
+         name: Human-readable language name.
+         language: The Language enum member.
+         file_extension: File extension including the dot (e.g., ".py").
+         docker_image: Docker image used to run code in this language.
+         run_command: Command template to run a file. Use {file} as placeholder.
+         test_framework: Name of the default test framework for this language.
+         compile_command: Optional command template to compile. None for interpreted languages.
+     """
+
+     name: str
+     language: Language
+     file_extension: str
+     docker_image: str
+     run_command: str
+     test_framework: str
+     compile_command: str | None = None
+
+
+ LANGUAGE_CONFIGS: dict[Language, LanguageConfig] = {
+     Language.PYTHON: LanguageConfig(
+         name="Python",
+         language=Language.PYTHON,
+         file_extension=".py",
+         docker_image="python:3.11-slim",
+         run_command="python {file}",
+         test_framework="pytest",
+     ),
+     Language.JAVASCRIPT: LanguageConfig(
+         name="JavaScript",
+         language=Language.JAVASCRIPT,
+         file_extension=".js",
+         docker_image="node:20-slim",
+         run_command="node {file}",
+         test_framework="jest",
+     ),
+     Language.TYPESCRIPT: LanguageConfig(
+         name="TypeScript",
+         language=Language.TYPESCRIPT,
+         file_extension=".ts",
+         docker_image="node:20-slim",
+         run_command="npx ts-node {file}",
+         test_framework="jest",
+         compile_command="npx tsc {file}",
+     ),
+     Language.JAVA: LanguageConfig(
+         name="Java",
+         language=Language.JAVA,
+         file_extension=".java",
+         docker_image="eclipse-temurin:17-jdk-jammy",
+         run_command="java {file}",  # Requires Java 11+ single-file source execution
+         test_framework="junit",
+         compile_command="javac {file}",
+     ),
+     Language.GO: LanguageConfig(
+         name="Go",
+         language=Language.GO,
+         file_extension=".go",
+         docker_image="golang:1.21-alpine",
+         run_command="go run {file}",
+         test_framework="go test",
+         compile_command="go build {file}",
+     ),
+ }
+
+ # Map file extensions to languages for filename-based detection.
+ _EXTENSION_MAP: dict[str, Language] = {
+     config.file_extension: lang for lang, config in LANGUAGE_CONFIGS.items()
+ }
+
+ # Ordered list of (pattern, language) tuples for code content detection.
+ # More specific patterns come first to avoid false positives.
+ _CODE_PATTERNS: list[tuple[re.Pattern[str], Language]] = [
+     # Go: package declaration is highly distinctive
+     (re.compile(r"^package\s+\w+", re.MULTILINE), Language.GO),
+     (re.compile(r"\bfunc\s+\w+\s*\("), Language.GO),
+     # Java: class declaration with access modifier
+     (re.compile(r"\bpublic\s+class\s+\w+"), Language.JAVA),
+     (re.compile(r"\bpublic\s+static\s+void\s+main"), Language.JAVA),
+     # TypeScript: type annotations on const/let/var, or interface keyword
+     (re.compile(r"\b(?:const|let|var)\s+\w+\s*:\s*\w+"), Language.TYPESCRIPT),
+     (re.compile(r"\binterface\s+\w+\s*\{"), Language.TYPESCRIPT),
+     # JavaScript: const/let/var without type annotations, require(), console.log
+     (re.compile(r"\brequire\s*\(\s*['\"]"), Language.JAVASCRIPT),
+     (re.compile(r"\bconsole\.log\s*\("), Language.JAVASCRIPT),
+     (re.compile(r"\b(?:const|let|var)\s+\w+\s*="), Language.JAVASCRIPT),
+     # Python: def/class with colon, import, print()
+     (re.compile(r"^def\s+\w+\s*\(.*\)\s*:", re.MULTILINE), Language.PYTHON),
+     (re.compile(r"^import\s+\w+", re.MULTILINE), Language.PYTHON),
+     (re.compile(r"\bprint\s*\("), Language.PYTHON),
+ ]
+
+
+ def detect_language(code: str | None = None, filename: str | None = None) -> Language | None:
+     """Detect the programming language from a filename or code snippet.
+
+     Filename-based detection takes priority over code content analysis.
+
+     Args:
+         code: Source code string to analyze.
+         filename: Filename (with or without path) to check extension.
+
+     Returns:
+         The detected Language, or None if detection fails.
+     """
+     # Try filename-based detection first (higher confidence).
+     if filename:
+         _, ext = os.path.splitext(filename)
+         if ext in _EXTENSION_MAP:
+             return _EXTENSION_MAP[ext]
+
+     # Fall back to code content analysis.
+     if code:
+         for pattern, language in _CODE_PATTERNS:
+             if pattern.search(code):
+                 return language
+
+     return None
+
+
+ def get_language_config(language: Language) -> LanguageConfig:
+     """Get the configuration for a given language.
+
+     Args:
+         language: A Language enum member.
+
+     Returns:
+         The LanguageConfig for the specified language.
+     """
+     return LANGUAGE_CONFIGS[language]
+
+
+ def get_supported_languages() -> list[str]:
+     """Return a list of all supported language name strings.
+
+     Returns:
+         List of language value strings (e.g., ["python", "javascript", ...]).
+     """
+     return [lang.value for lang in Language]
+
+
+ @dataclass
+ class CrossLanguageMetrics:
+     """Aggregated benchmark metrics across multiple programming languages.
+
+     Attributes:
+         language_scores: Mapping of language name to its pass rate (resolved ratio).
+         best_language: The language with the highest pass rate.
+         worst_language: The language with the lowest pass rate.
+         average_score: The mean pass rate across all languages.
+     """
+
+     language_scores: dict[str, float]
+     best_language: str
+     worst_language: str
+     average_score: float
+
+     @classmethod
+     def from_results(cls, results: dict[str, list[dict[str, Any]]]) -> CrossLanguageMetrics:
+         """Compute cross-language metrics from per-language result lists.
+
+         Each result dict is expected to have a ``"resolved"`` boolean key.
+         The pass rate for a language is the fraction of results where
+         ``resolved`` is ``True``.
+
+         Args:
+             results: Mapping of language name to list of result dicts.
+
+         Returns:
+             A CrossLanguageMetrics instance with computed scores.
+
+         Raises:
+             ValueError: If results is empty or any language has an empty result list.
+         """
+         if not results:
+             raise ValueError("results must not be empty")
+
+         language_scores: dict[str, float] = {}
+         for lang_name, lang_results in results.items():
+             if not lang_results:
+                 raise ValueError(f"Result list for language '{lang_name}' must not be empty")
+             resolved_count = sum(1 for r in lang_results if r.get("resolved", False))
+             language_scores[lang_name] = resolved_count / len(lang_results)
+
+         best_language = max(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+         worst_language = min(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+         average_score = sum(language_scores.values()) / len(language_scores)
+
+         return cls(
+             language_scores=language_scores,
+             best_language=best_language,
+             worst_language=worst_language,
+             average_score=average_score,
+         )
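
A short usage sketch of the public helpers added in mcpbr/languages.py; the result data passed to CrossLanguageMetrics.from_results is made up for illustration.

# Example usage of the helpers defined in mcpbr/languages.py (added above).
from mcpbr.languages import (
    CrossLanguageMetrics,
    Language,
    detect_language,
    get_language_config,
    get_supported_languages,
)

print(get_supported_languages())  # ["python", "javascript", "typescript", "java", "go"]

# Filename-based detection takes priority over content analysis.
print(detect_language(filename="solver.go"))                  # Language.GO
print(detect_language(code="def main():\n    print('hi')"))   # Language.PYTHON

config = get_language_config(Language.TYPESCRIPT)
print(config.docker_image, config.run_command)  # node:20-slim  npx ts-node {file}

# Cross-language comparison from per-language result lists; each result dict
# carries a "resolved" boolean, as documented in from_results(). Data is illustrative.
metrics = CrossLanguageMetrics.from_results(
    {
        "python": [{"resolved": True}, {"resolved": True}, {"resolved": False}],
        "go": [{"resolved": True}, {"resolved": False}],
    }
)
print(metrics.best_language, metrics.worst_language, round(metrics.average_score, 3))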