mcpbr 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/__init__.py CHANGED
@@ -3,4 +3,23 @@
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
  """

- __version__ = "0.3.23"
+ __version__ = "0.6.0"
+
+ from .sdk import (
+     BenchmarkResult,
+     MCPBenchmark,
+     get_version,
+     list_benchmarks,
+     list_models,
+     list_providers,
+ )
+
+ __all__ = [
+     "__version__",
+     "BenchmarkResult",
+     "MCPBenchmark",
+     "get_version",
+     "list_benchmarks",
+     "list_models",
+     "list_providers",
+ ]
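
For orientation, a minimal sketch of how the re-exported surface might be exercised from user code. Only the names re-exported above are confirmed by this diff; the zero-argument calls to the list_* helpers are assumptions about their signatures, not documented API.

# Sketch only: __version__ and __all__ come from the diff above; the helper
# calls assume the list_* functions take no arguments (not shown in the diff).
import mcpbr

print(mcpbr.__version__)   # "0.6.0"
print(mcpbr.__all__)       # names re-exported from mcpbr.sdk

providers = mcpbr.list_providers()    # assumed signature
benchmarks = mcpbr.list_benchmarks()  # assumed signature
models = mcpbr.list_models()          # assumed signature
print(providers, benchmarks, models)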
mcpbr/config.py CHANGED
@@ -12,7 +12,7 @@ from .config_inheritance import load_config_with_inheritance
  from .env_expansion import expand_env_vars, load_dotenv_file, validate_config_security
  from .models import DEFAULT_MODEL

- VALID_PROVIDERS = ("anthropic",)
+ VALID_PROVIDERS = ("anthropic", "openai", "gemini", "qwen")
  VALID_HARNESSES = ("claude-code",)
  VALID_BENCHMARKS = (
      "swe-bench-lite",
@@ -431,6 +431,42 @@ class HarnessConfig(BaseModel):
          description="Infrastructure configuration (local or azure)",
      )

+     continue_on_error: bool = Field(
+         default=True,
+         description="Continue evaluation when individual tasks fail instead of stopping",
+     )
+
+     max_failures: int | None = Field(
+         default=None,
+         description="Maximum number of task failures before halting evaluation (None for unlimited)",
+     )
+
+     checkpoint_interval: int = Field(
+         default=1,
+         description="Save execution checkpoint every N completed tasks",
+     )
+
+     resume_from_checkpoint: Path | None = Field(
+         default=None,
+         description="Path to a checkpoint file to resume evaluation from",
+     )
+
+     @field_validator("checkpoint_interval")
+     @classmethod
+     def validate_checkpoint_interval(cls, v: int) -> int:
+         """Validate checkpoint_interval is at least 1."""
+         if v < 1:
+             raise ValueError("checkpoint_interval must be at least 1")
+         return v
+
+     @field_validator("max_failures")
+     @classmethod
+     def validate_max_failures(cls, v: int | None) -> int | None:
+         """Validate max_failures is positive if set."""
+         if v is not None and v < 1:
+             raise ValueError("max_failures must be at least 1")
+         return v
+
      @field_validator("provider")
      @classmethod
      def validate_provider(cls, v: str) -> str:
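
To illustrate the validation rules attached to the new fields, here is a standalone sketch that mirrors only these four fields in a throwaway Pydantic model. It is not the real HarnessConfig, which has additional required fields not shown in this hunk.

# Standalone sketch: copies only the four new fields and their validators to
# show which values are accepted or rejected. Model name is illustrative.
from pathlib import Path

from pydantic import BaseModel, Field, ValidationError, field_validator


class ErrorHandlingSettings(BaseModel):
    continue_on_error: bool = Field(default=True)
    max_failures: int | None = Field(default=None)
    checkpoint_interval: int = Field(default=1)
    resume_from_checkpoint: Path | None = Field(default=None)

    @field_validator("checkpoint_interval")
    @classmethod
    def validate_checkpoint_interval(cls, v: int) -> int:
        if v < 1:
            raise ValueError("checkpoint_interval must be at least 1")
        return v

    @field_validator("max_failures")
    @classmethod
    def validate_max_failures(cls, v: int | None) -> int | None:
        if v is not None and v < 1:
            raise ValueError("max_failures must be at least 1")
        return v


ErrorHandlingSettings(max_failures=3, checkpoint_interval=10)  # accepted

try:
    ErrorHandlingSettings(checkpoint_interval=0)
except ValidationError as exc:
    print(exc)  # rejected: checkpoint_interval must be at least 1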
mcpbr/docker_env.py CHANGED
@@ -13,11 +13,12 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any

- import docker
  from docker.models.containers import Container
  from docker.models.networks import Network
  from docker.models.volumes import Volume

+ import docker
+
  MCPBR_LABEL = "mcpbr"
  MCPBR_INSTANCE_LABEL = "mcpbr.instance"
  MCPBR_SESSION_LABEL = "mcpbr.session"
mcpbr/docker_prewarm.py CHANGED
@@ -11,12 +11,13 @@ import time
  from dataclasses import dataclass, field
  from typing import Any, Callable

- import docker
  import docker.errors
  from rich.console import Console
  from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
  from rich.table import Table

+ import docker
+
  from .docker_env import SWEBENCH_IMAGE_REGISTRY, get_swebench_image_name

  logger = logging.getLogger(__name__)
mcpbr/dry_run.py CHANGED
@@ -13,11 +13,12 @@ import os
  import shutil
  from dataclasses import dataclass, field

- import docker
  from rich.console import Console
  from rich.panel import Panel
  from rich.table import Table

+ import docker
+
  from .benchmarks import create_benchmark
  from .config import HarnessConfig
  from .config_validator import ConfigValidator, ValidationResult
mcpbr/gpu_support.py CHANGED
@@ -7,9 +7,10 @@ and Docker container configuration for GPU access.
  import logging
  import subprocess

- import docker
  import docker.types

+ import docker
+
  logger = logging.getLogger(__name__)


@@ -0,0 +1,277 @@
+ """Graceful degradation for benchmark evaluation.
+
+ Provides fault-tolerant execution of benchmark tasks with failure isolation,
+ classification, checkpointing, and configurable error handling policies.
+ """
+
+ import asyncio
+ import json
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+
+
+ class FailureType(Enum):
+     """Classification of task failure types."""
+
+     TRANSIENT = "transient"
+     PERMANENT = "permanent"
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class TaskFailure:
+     """Record of a single task failure.
+
+     Attributes:
+         task_id: Identifier of the failed task.
+         error: Error message describing the failure.
+         failure_type: Classification of the failure.
+         timestamp: ISO 8601 timestamp of when the failure occurred.
+         retryable: Whether the task could be retried.
+     """
+
+     task_id: str
+     error: str
+     failure_type: FailureType
+     timestamp: str
+     retryable: bool = True
+
+
+ @dataclass
+ class ExecutionCheckpoint:
+     """Checkpoint of execution state for crash recovery and resumption.
+
+     Tracks which tasks have completed, failed, or been skipped during
+     an evaluation run. Can be serialized to/from JSON for persistence.
+
+     Attributes:
+         completed_tasks: List of task IDs that completed successfully.
+         failed_tasks: List of TaskFailure records for failed tasks.
+         skipped_tasks: List of task IDs that were skipped.
+     """
+
+     completed_tasks: list[str] = field(default_factory=list)
+     failed_tasks: list[TaskFailure] = field(default_factory=list)
+     skipped_tasks: list[str] = field(default_factory=list)
+
+     def save(self, path: Path) -> None:
+         """Save checkpoint to a JSON file.
+
+         Args:
+             path: File path to write the checkpoint to.
+         """
+         data = {
+             "completed": self.completed_tasks,
+             "failed": [
+                 {
+                     "task_id": f.task_id,
+                     "error": f.error,
+                     "type": f.failure_type.value,
+                     "timestamp": f.timestamp,
+                     "retryable": f.retryable,
+                 }
+                 for f in self.failed_tasks
+             ],
+             "skipped": self.skipped_tasks,
+         }
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(json.dumps(data, indent=2))
+
+     @classmethod
+     def load(cls, path: Path) -> "ExecutionCheckpoint":
+         """Load checkpoint from a JSON file.
+
+         Args:
+             path: File path to read the checkpoint from.
+
+         Returns:
+             ExecutionCheckpoint populated from the file.
+         """
+         data = json.loads(path.read_text())
+         return cls(
+             completed_tasks=data["completed"],
+             failed_tasks=[
+                 TaskFailure(
+                     task_id=f["task_id"],
+                     error=f["error"],
+                     failure_type=FailureType(f["type"]),
+                     timestamp=f["timestamp"],
+                     retryable=f.get("retryable", True),
+                 )
+                 for f in data["failed"]
+             ],
+             skipped_tasks=data["skipped"],
+         )
+
+
+ # Exception types considered transient (may succeed on retry)
+ _TRANSIENT_ERRORS = (
+     TimeoutError,
+     asyncio.TimeoutError,
+     ConnectionError,
+     ConnectionResetError,
+     ConnectionRefusedError,
+     ConnectionAbortedError,
+     OSError,
+     IOError,
+ )
+
+ # Exception types considered permanent (will not succeed on retry)
+ _PERMANENT_ERRORS = (
+     ValueError,
+     TypeError,
+     KeyError,
+     IndexError,
+     AttributeError,
+     NotImplementedError,
+     SyntaxError,
+     ImportError,
+ )
+
+
+ def classify_failure(error: Exception) -> FailureType:
+     """Classify an error as transient, permanent, or unknown.
+
+     Transient errors are those that may succeed on retry (timeouts,
+     connection issues, resource exhaustion). Permanent errors are
+     programming or configuration errors that will not resolve on retry.
+
+     Args:
+         error: The exception to classify.
+
+     Returns:
+         FailureType indicating the classification.
+     """
+     if isinstance(error, _TRANSIENT_ERRORS):
+         return FailureType.TRANSIENT
+     if isinstance(error, _PERMANENT_ERRORS):
+         return FailureType.PERMANENT
+     return FailureType.UNKNOWN
+
+
+ class GracefulExecutor:
+     """Executor that provides graceful degradation for benchmark tasks.
+
+     Isolates task failures so that one failing task does not prevent
+     other tasks from executing. Supports configurable error policies
+     including continue-on-error and max-failure thresholds.
+
+     Args:
+         continue_on_error: If True, continue executing tasks after failures.
+             If False, stop on the first failure.
+         max_failures: Maximum number of failures before stopping execution.
+             None means no limit (continue until all tasks are processed).
+         checkpoint_dir: Directory to save execution checkpoints for crash recovery.
+             None means no checkpointing.
+     """
+
+     def __init__(
+         self,
+         continue_on_error: bool = True,
+         max_failures: int | None = None,
+         checkpoint_dir: Path | None = None,
+     ) -> None:
+         """Initialize GracefulExecutor.
+
+         Args:
+             continue_on_error: Whether to continue after task failures.
+             max_failures: Maximum failures before halting. None for unlimited.
+             checkpoint_dir: Directory for saving checkpoint files.
+         """
+         self.continue_on_error = continue_on_error
+         self.max_failures = max_failures
+         self.checkpoint_dir = checkpoint_dir
+         self.checkpoint = ExecutionCheckpoint()
+
+     async def execute_task(self, task_id: str, coro: Any) -> Any | None:
+         """Execute a single task with failure isolation.
+
+         Wraps the coroutine execution in error handling that records
+         failures without propagating them (when continue_on_error is True).
+
+         Args:
+             task_id: Identifier for the task being executed.
+             coro: Awaitable coroutine to execute.
+
+         Returns:
+             The result of the coroutine, or None if the task failed.
+         """
+         try:
+             result = await coro
+             self.checkpoint.completed_tasks.append(task_id)
+             self._save_checkpoint()
+             return result
+         except Exception as e:
+             failure_type = classify_failure(e)
+             failure = TaskFailure(
+                 task_id=task_id,
+                 error=str(e),
+                 failure_type=failure_type,
+                 timestamp=datetime.now(timezone.utc).isoformat(),
+                 retryable=failure_type == FailureType.TRANSIENT,
+             )
+             self.checkpoint.failed_tasks.append(failure)
+             self._save_checkpoint()
+             return None
+
+     def should_continue(self) -> bool:
+         """Determine whether execution should continue.
+
+         Considers the continue_on_error flag and the max_failures threshold.
+
+         Returns:
+             True if execution should continue, False if it should stop.
+         """
+         failure_count = len(self.checkpoint.failed_tasks)
+
+         # If any failure occurred and continue_on_error is False, stop
+         if not self.continue_on_error and failure_count > 0:
+             return False
+
+         # If max_failures is set and we've reached it, stop
+         if self.max_failures is not None and failure_count >= self.max_failures:
+             return False
+
+         return True
+
+     def get_partial_report(self) -> dict[str, Any]:
+         """Generate a report of execution progress including partial results.
+
+         Returns:
+             Dictionary with execution statistics and failure details.
+         """
+         completed_count = len(self.checkpoint.completed_tasks)
+         failed_count = len(self.checkpoint.failed_tasks)
+         skipped_count = len(self.checkpoint.skipped_tasks)
+         total_tasks = completed_count + failed_count + skipped_count
+
+         success_rate = completed_count / total_tasks if total_tasks > 0 else 0.0
+
+         failures = [
+             {
+                 "task_id": f.task_id,
+                 "error": f.error,
+                 "failure_type": f.failure_type.value,
+                 "timestamp": f.timestamp,
+                 "retryable": f.retryable,
+             }
+             for f in self.checkpoint.failed_tasks
+         ]
+
+         return {
+             "total_tasks": total_tasks,
+             "completed_count": completed_count,
+             "failed_count": failed_count,
+             "skipped_count": skipped_count,
+             "success_rate": success_rate,
+             "failures": failures,
+         }
+
+     def _save_checkpoint(self) -> None:
+         """Save checkpoint to disk if checkpoint_dir is configured."""
+         if self.checkpoint_dir is not None:
+             checkpoint_path = self.checkpoint_dir / "checkpoint.json"
+             self.checkpoint.save(checkpoint_path)
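
A sketch of how an evaluation loop might drive the executor added above. The class and method names follow the hunk; the import path is hypothetical because the new file's name does not appear in this diff, and fake_task is a stand-in for a real benchmark task.

# Sketch: assumes GracefulExecutor from the new module above is in scope.
import asyncio
from pathlib import Path

# from mcpbr.graceful_degradation import GracefulExecutor  # hypothetical path, not shown in this diff


async def fake_task(task_id: str) -> str:
    # Stand-in for a real benchmark task; one task fails to show isolation.
    if task_id == "task-2":
        raise TimeoutError("container did not respond")
    return f"{task_id}: resolved"


async def main() -> None:
    executor = GracefulExecutor(
        continue_on_error=True,
        max_failures=5,
        checkpoint_dir=Path("./checkpoints"),
    )
    for task_id in ["task-1", "task-2", "task-3"]:
        if not executor.should_continue():
            break
        await executor.execute_task(task_id, fake_task(task_id))

    report = executor.get_partial_report()
    print(report["completed_count"], report["failed_count"], report["success_rate"])


asyncio.run(main())

The TimeoutError raised by task-2 is classified as transient and recorded in the checkpoint rather than aborting the run; the other tasks still execute.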
mcpbr/languages.py ADDED
@@ -0,0 +1,228 @@
+ """Multi-language support for code generation benchmarks.
+
+ This module provides:
+ - Language enum defining supported programming languages.
+ - LanguageConfig dataclass with per-language Docker, run, compile, and test settings.
+ - detect_language() to identify the language from a filename or code snippet.
+ - get_language_config() to retrieve configuration for a given language.
+ - get_supported_languages() to list all supported language names.
+ - CrossLanguageMetrics for comparing benchmark performance across languages.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import re
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Any
+
+
+ class Language(Enum):
+     """Supported programming languages for code generation benchmarks."""
+
+     PYTHON = "python"
+     JAVASCRIPT = "javascript"
+     TYPESCRIPT = "typescript"
+     JAVA = "java"
+     GO = "go"
+
+
+ @dataclass
+ class LanguageConfig:
+     """Configuration for running and testing code in a specific language.
+
+     Attributes:
+         name: Human-readable language name.
+         language: The Language enum member.
+         file_extension: File extension including the dot (e.g., ".py").
+         docker_image: Docker image used to run code in this language.
+         run_command: Command template to run a file. Use {file} as placeholder.
+         test_framework: Name of the default test framework for this language.
+         compile_command: Optional command template to compile. None for interpreted languages.
+     """
+
+     name: str
+     language: Language
+     file_extension: str
+     docker_image: str
+     run_command: str
+     test_framework: str
+     compile_command: str | None = None
+
+
+ LANGUAGE_CONFIGS: dict[Language, LanguageConfig] = {
+     Language.PYTHON: LanguageConfig(
+         name="Python",
+         language=Language.PYTHON,
+         file_extension=".py",
+         docker_image="python:3.11-slim",
+         run_command="python {file}",
+         test_framework="pytest",
+     ),
+     Language.JAVASCRIPT: LanguageConfig(
+         name="JavaScript",
+         language=Language.JAVASCRIPT,
+         file_extension=".js",
+         docker_image="node:20-slim",
+         run_command="node {file}",
+         test_framework="jest",
+     ),
+     Language.TYPESCRIPT: LanguageConfig(
+         name="TypeScript",
+         language=Language.TYPESCRIPT,
+         file_extension=".ts",
+         docker_image="node:20-slim",
+         run_command="npx ts-node {file}",
+         test_framework="jest",
+         compile_command="npx tsc {file}",
+     ),
+     Language.JAVA: LanguageConfig(
+         name="Java",
+         language=Language.JAVA,
+         file_extension=".java",
+         docker_image="eclipse-temurin:17-jdk-jammy",
+         run_command="java {file}",  # Requires Java 11+ single-file source execution
+         test_framework="junit",
+         compile_command="javac {file}",
+     ),
+     Language.GO: LanguageConfig(
+         name="Go",
+         language=Language.GO,
+         file_extension=".go",
+         docker_image="golang:1.21-alpine",
+         run_command="go run {file}",
+         test_framework="go test",
+         compile_command="go build {file}",
+     ),
+ }
+
+ # Map file extensions to languages for filename-based detection.
+ _EXTENSION_MAP: dict[str, Language] = {
+     config.file_extension: lang for lang, config in LANGUAGE_CONFIGS.items()
+ }
+
+ # Ordered list of (pattern, language) tuples for code content detection.
+ # More specific patterns come first to avoid false positives.
+ _CODE_PATTERNS: list[tuple[re.Pattern[str], Language]] = [
+     # Go: package declaration is highly distinctive
+     (re.compile(r"^package\s+\w+", re.MULTILINE), Language.GO),
+     (re.compile(r"\bfunc\s+\w+\s*\("), Language.GO),
+     # Java: class declaration with access modifier
+     (re.compile(r"\bpublic\s+class\s+\w+"), Language.JAVA),
+     (re.compile(r"\bpublic\s+static\s+void\s+main"), Language.JAVA),
+     # TypeScript: type annotations on const/let/var, or interface keyword
+     (re.compile(r"\b(?:const|let|var)\s+\w+\s*:\s*\w+"), Language.TYPESCRIPT),
+     (re.compile(r"\binterface\s+\w+\s*\{"), Language.TYPESCRIPT),
+     # JavaScript: const/let/var without type annotations, require(), console.log
+     (re.compile(r"\brequire\s*\(\s*['\"]"), Language.JAVASCRIPT),
+     (re.compile(r"\bconsole\.log\s*\("), Language.JAVASCRIPT),
+     (re.compile(r"\b(?:const|let|var)\s+\w+\s*="), Language.JAVASCRIPT),
+     # Python: def/class with colon, import, print()
+     (re.compile(r"^def\s+\w+\s*\(.*\)\s*:", re.MULTILINE), Language.PYTHON),
+     (re.compile(r"^import\s+\w+", re.MULTILINE), Language.PYTHON),
+     (re.compile(r"\bprint\s*\("), Language.PYTHON),
+ ]
+
+
+ def detect_language(code: str | None = None, filename: str | None = None) -> Language | None:
+     """Detect the programming language from a filename or code snippet.
+
+     Filename-based detection takes priority over code content analysis.
+
+     Args:
+         code: Source code string to analyze.
+         filename: Filename (with or without path) to check extension.
+
+     Returns:
+         The detected Language, or None if detection fails.
+     """
+     # Try filename-based detection first (higher confidence).
+     if filename:
+         _, ext = os.path.splitext(filename)
+         if ext in _EXTENSION_MAP:
+             return _EXTENSION_MAP[ext]
+
+     # Fall back to code content analysis.
+     if code:
+         for pattern, language in _CODE_PATTERNS:
+             if pattern.search(code):
+                 return language
+
+     return None
+
+
+ def get_language_config(language: Language) -> LanguageConfig:
+     """Get the configuration for a given language.
+
+     Args:
+         language: A Language enum member.
+
+     Returns:
+         The LanguageConfig for the specified language.
+     """
+     return LANGUAGE_CONFIGS[language]
+
+
+ def get_supported_languages() -> list[str]:
+     """Return a list of all supported language name strings.
+
+     Returns:
+         List of language value strings (e.g., ["python", "javascript", ...]).
+     """
+     return [lang.value for lang in Language]
+
+
+ @dataclass
+ class CrossLanguageMetrics:
+     """Aggregated benchmark metrics across multiple programming languages.
+
+     Attributes:
+         language_scores: Mapping of language name to its pass rate (resolved ratio).
+         best_language: The language with the highest pass rate.
+         worst_language: The language with the lowest pass rate.
+         average_score: The mean pass rate across all languages.
+     """
+
+     language_scores: dict[str, float]
+     best_language: str
+     worst_language: str
+     average_score: float
+
+     @classmethod
+     def from_results(cls, results: dict[str, list[dict[str, Any]]]) -> CrossLanguageMetrics:
+         """Compute cross-language metrics from per-language result lists.
+
+         Each result dict is expected to have a ``"resolved"`` boolean key.
+         The pass rate for a language is the fraction of results where
+         ``resolved`` is ``True``.
+
+         Args:
+             results: Mapping of language name to list of result dicts.
+
+         Returns:
+             A CrossLanguageMetrics instance with computed scores.
+
+         Raises:
+             ValueError: If results is empty or any language has an empty result list.
+         """
+         if not results:
+             raise ValueError("results must not be empty")
+
+         language_scores: dict[str, float] = {}
+         for lang_name, lang_results in results.items():
+             if not lang_results:
+                 raise ValueError(f"Result list for language '{lang_name}' must not be empty")
+             resolved_count = sum(1 for r in lang_results if r.get("resolved", False))
+             language_scores[lang_name] = resolved_count / len(lang_results)
+
+         best_language = max(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+         worst_language = min(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+         average_score = sum(language_scores.values()) / len(language_scores)
+
+         return cls(
+             language_scores=language_scores,
+             best_language=best_language,
+             worst_language=worst_language,
+             average_score=average_score,
+         )
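
A short usage sketch of the public helpers added in mcpbr/languages.py; the result data passed to CrossLanguageMetrics.from_results is made up for illustration.

# Example usage of the helpers defined in mcpbr/languages.py (added above).
from mcpbr.languages import (
    CrossLanguageMetrics,
    Language,
    detect_language,
    get_language_config,
    get_supported_languages,
)

print(get_supported_languages())  # ["python", "javascript", "typescript", "java", "go"]

# Filename-based detection takes priority over content analysis.
print(detect_language(filename="solver.go"))                  # Language.GO
print(detect_language(code="def main():\n    print('hi')"))   # Language.PYTHON

config = get_language_config(Language.TYPESCRIPT)
print(config.docker_image, config.run_command)  # node:20-slim  npx ts-node {file}

# Cross-language comparison from per-language result lists; each result dict
# carries a "resolved" boolean, as documented in from_results(). Data is illustrative.
metrics = CrossLanguageMetrics.from_results(
    {
        "python": [{"resolved": True}, {"resolved": True}, {"resolved": False}],
        "go": [{"resolved": True}, {"resolved": False}],
    }
)
print(metrics.best_language, metrics.worst_language, round(metrics.average_score, 3))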