mcpbr 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +20 -1
- mcpbr/config.py +37 -1
- mcpbr/docker_env.py +2 -1
- mcpbr/docker_prewarm.py +2 -1
- mcpbr/dry_run.py +2 -1
- mcpbr/gpu_support.py +2 -1
- mcpbr/graceful_degradation.py +277 -0
- mcpbr/languages.py +228 -0
- mcpbr/logging_config.py +207 -0
- mcpbr/models.py +66 -0
- mcpbr/preflight.py +2 -1
- mcpbr/pricing.py +72 -0
- mcpbr/providers.py +316 -3
- mcpbr/sdk.py +264 -0
- mcpbr/smoke_test.py +2 -1
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/RECORD +27 -23
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py
CHANGED
@@ -3,4 +3,23 @@
 A benchmark runner for evaluating MCP servers against SWE-bench tasks.
 """

-__version__ = "0.5.0"
+__version__ = "0.6.0"
+
+from .sdk import (
+    BenchmarkResult,
+    MCPBenchmark,
+    get_version,
+    list_benchmarks,
+    list_models,
+    list_providers,
+)
+
+__all__ = [
+    "__version__",
+    "BenchmarkResult",
+    "MCPBenchmark",
+    "get_version",
+    "list_benchmarks",
+    "list_models",
+    "list_providers",
+]
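The new top-level re-exports allow programmatic use without the CLI. The sketch below is hypothetical: mcpbr/sdk.py is not shown in this diff, so the call signatures of get_version() and the list_* helpers are assumed to take no required arguments.

import mcpbr

print(mcpbr.__version__)          # "0.6.0"
print(mcpbr.get_version())        # assumed to mirror __version__
print(mcpbr.list_providers())     # assumed, e.g. ["anthropic", "openai", "gemini", "qwen"]
print(mcpbr.list_benchmarks())    # assumed, e.g. ["swe-bench-lite", ...]

# MCPBenchmark and BenchmarkResult are re-exported for programmatic runs;
# their constructors are defined in mcpbr/sdk.py (not shown here).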
mcpbr/config.py
CHANGED
@@ -12,7 +12,7 @@ from .config_inheritance import load_config_with_inheritance
 from .env_expansion import expand_env_vars, load_dotenv_file, validate_config_security
 from .models import DEFAULT_MODEL

-VALID_PROVIDERS = ("anthropic",)
+VALID_PROVIDERS = ("anthropic", "openai", "gemini", "qwen")
 VALID_HARNESSES = ("claude-code",)
 VALID_BENCHMARKS = (
     "swe-bench-lite",

@@ -431,6 +431,42 @@ class HarnessConfig(BaseModel):
         description="Infrastructure configuration (local or azure)",
     )

+    continue_on_error: bool = Field(
+        default=True,
+        description="Continue evaluation when individual tasks fail instead of stopping",
+    )
+
+    max_failures: int | None = Field(
+        default=None,
+        description="Maximum number of task failures before halting evaluation (None for unlimited)",
+    )
+
+    checkpoint_interval: int = Field(
+        default=1,
+        description="Save execution checkpoint every N completed tasks",
+    )
+
+    resume_from_checkpoint: Path | None = Field(
+        default=None,
+        description="Path to a checkpoint file to resume evaluation from",
+    )
+
+    @field_validator("checkpoint_interval")
+    @classmethod
+    def validate_checkpoint_interval(cls, v: int) -> int:
+        """Validate checkpoint_interval is at least 1."""
+        if v < 1:
+            raise ValueError("checkpoint_interval must be at least 1")
+        return v
+
+    @field_validator("max_failures")
+    @classmethod
+    def validate_max_failures(cls, v: int | None) -> int | None:
+        """Validate max_failures is positive if set."""
+        if v is not None and v < 1:
+            raise ValueError("max_failures must be at least 1")
+        return v
+
     @field_validator("provider")
     @classmethod
     def validate_provider(cls, v: str) -> str:
mcpbr/docker_env.py
CHANGED
@@ -13,11 +13,12 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any

-import docker
 from docker.models.containers import Container
 from docker.models.networks import Network
 from docker.models.volumes import Volume

+import docker
+
 MCPBR_LABEL = "mcpbr"
 MCPBR_INSTANCE_LABEL = "mcpbr.instance"
 MCPBR_SESSION_LABEL = "mcpbr.session"
mcpbr/docker_prewarm.py
CHANGED
@@ -11,12 +11,13 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Callable

-import docker
 import docker.errors
 from rich.console import Console
 from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
 from rich.table import Table

+import docker
+
 from .docker_env import SWEBENCH_IMAGE_REGISTRY, get_swebench_image_name

 logger = logging.getLogger(__name__)
mcpbr/dry_run.py
CHANGED
@@ -13,11 +13,12 @@ import os
 import shutil
 from dataclasses import dataclass, field

-import docker
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table

+import docker
+
 from .benchmarks import create_benchmark
 from .config import HarnessConfig
 from .config_validator import ConfigValidator, ValidationResult
mcpbr/graceful_degradation.py
ADDED
@@ -0,0 +1,277 @@
"""Graceful degradation for benchmark evaluation.

Provides fault-tolerant execution of benchmark tasks with failure isolation,
classification, checkpointing, and configurable error handling policies.
"""

import asyncio
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any


class FailureType(Enum):
    """Classification of task failure types."""

    TRANSIENT = "transient"
    PERMANENT = "permanent"
    UNKNOWN = "unknown"


@dataclass
class TaskFailure:
    """Record of a single task failure.

    Attributes:
        task_id: Identifier of the failed task.
        error: Error message describing the failure.
        failure_type: Classification of the failure.
        timestamp: ISO 8601 timestamp of when the failure occurred.
        retryable: Whether the task could be retried.
    """

    task_id: str
    error: str
    failure_type: FailureType
    timestamp: str
    retryable: bool = True


@dataclass
class ExecutionCheckpoint:
    """Checkpoint of execution state for crash recovery and resumption.

    Tracks which tasks have completed, failed, or been skipped during
    an evaluation run. Can be serialized to/from JSON for persistence.

    Attributes:
        completed_tasks: List of task IDs that completed successfully.
        failed_tasks: List of TaskFailure records for failed tasks.
        skipped_tasks: List of task IDs that were skipped.
    """

    completed_tasks: list[str] = field(default_factory=list)
    failed_tasks: list[TaskFailure] = field(default_factory=list)
    skipped_tasks: list[str] = field(default_factory=list)

    def save(self, path: Path) -> None:
        """Save checkpoint to a JSON file.

        Args:
            path: File path to write the checkpoint to.
        """
        data = {
            "completed": self.completed_tasks,
            "failed": [
                {
                    "task_id": f.task_id,
                    "error": f.error,
                    "type": f.failure_type.value,
                    "timestamp": f.timestamp,
                    "retryable": f.retryable,
                }
                for f in self.failed_tasks
            ],
            "skipped": self.skipped_tasks,
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(data, indent=2))

    @classmethod
    def load(cls, path: Path) -> "ExecutionCheckpoint":
        """Load checkpoint from a JSON file.

        Args:
            path: File path to read the checkpoint from.

        Returns:
            ExecutionCheckpoint populated from the file.
        """
        data = json.loads(path.read_text())
        return cls(
            completed_tasks=data["completed"],
            failed_tasks=[
                TaskFailure(
                    task_id=f["task_id"],
                    error=f["error"],
                    failure_type=FailureType(f["type"]),
                    timestamp=f["timestamp"],
                    retryable=f.get("retryable", True),
                )
                for f in data["failed"]
            ],
            skipped_tasks=data["skipped"],
        )


# Exception types considered transient (may succeed on retry)
_TRANSIENT_ERRORS = (
    TimeoutError,
    asyncio.TimeoutError,
    ConnectionError,
    ConnectionResetError,
    ConnectionRefusedError,
    ConnectionAbortedError,
    OSError,
    IOError,
)

# Exception types considered permanent (will not succeed on retry)
_PERMANENT_ERRORS = (
    ValueError,
    TypeError,
    KeyError,
    IndexError,
    AttributeError,
    NotImplementedError,
    SyntaxError,
    ImportError,
)


def classify_failure(error: Exception) -> FailureType:
    """Classify an error as transient, permanent, or unknown.

    Transient errors are those that may succeed on retry (timeouts,
    connection issues, resource exhaustion). Permanent errors are
    programming or configuration errors that will not resolve on retry.

    Args:
        error: The exception to classify.

    Returns:
        FailureType indicating the classification.
    """
    if isinstance(error, _TRANSIENT_ERRORS):
        return FailureType.TRANSIENT
    if isinstance(error, _PERMANENT_ERRORS):
        return FailureType.PERMANENT
    return FailureType.UNKNOWN


class GracefulExecutor:
    """Executor that provides graceful degradation for benchmark tasks.

    Isolates task failures so that one failing task does not prevent
    other tasks from executing. Supports configurable error policies
    including continue-on-error and max-failure thresholds.

    Args:
        continue_on_error: If True, continue executing tasks after failures.
            If False, stop on the first failure.
        max_failures: Maximum number of failures before stopping execution.
            None means no limit (continue until all tasks are processed).
        checkpoint_dir: Directory to save execution checkpoints for crash recovery.
            None means no checkpointing.
    """

    def __init__(
        self,
        continue_on_error: bool = True,
        max_failures: int | None = None,
        checkpoint_dir: Path | None = None,
    ) -> None:
        """Initialize GracefulExecutor.

        Args:
            continue_on_error: Whether to continue after task failures.
            max_failures: Maximum failures before halting. None for unlimited.
            checkpoint_dir: Directory for saving checkpoint files.
        """
        self.continue_on_error = continue_on_error
        self.max_failures = max_failures
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint = ExecutionCheckpoint()

    async def execute_task(self, task_id: str, coro: Any) -> Any | None:
        """Execute a single task with failure isolation.

        Wraps the coroutine execution in error handling that records
        failures without propagating them (when continue_on_error is True).

        Args:
            task_id: Identifier for the task being executed.
            coro: Awaitable coroutine to execute.

        Returns:
            The result of the coroutine, or None if the task failed.
        """
        try:
            result = await coro
            self.checkpoint.completed_tasks.append(task_id)
            self._save_checkpoint()
            return result
        except Exception as e:
            failure_type = classify_failure(e)
            failure = TaskFailure(
                task_id=task_id,
                error=str(e),
                failure_type=failure_type,
                timestamp=datetime.now(timezone.utc).isoformat(),
                retryable=failure_type == FailureType.TRANSIENT,
            )
            self.checkpoint.failed_tasks.append(failure)
            self._save_checkpoint()
            return None

    def should_continue(self) -> bool:
        """Determine whether execution should continue.

        Considers the continue_on_error flag and the max_failures threshold.

        Returns:
            True if execution should continue, False if it should stop.
        """
        failure_count = len(self.checkpoint.failed_tasks)

        # If any failure occurred and continue_on_error is False, stop
        if not self.continue_on_error and failure_count > 0:
            return False

        # If max_failures is set and we've reached it, stop
        if self.max_failures is not None and failure_count >= self.max_failures:
            return False

        return True

    def get_partial_report(self) -> dict[str, Any]:
        """Generate a report of execution progress including partial results.

        Returns:
            Dictionary with execution statistics and failure details.
        """
        completed_count = len(self.checkpoint.completed_tasks)
        failed_count = len(self.checkpoint.failed_tasks)
        skipped_count = len(self.checkpoint.skipped_tasks)
        total_tasks = completed_count + failed_count + skipped_count

        success_rate = completed_count / total_tasks if total_tasks > 0 else 0.0

        failures = [
            {
                "task_id": f.task_id,
                "error": f.error,
                "failure_type": f.failure_type.value,
                "timestamp": f.timestamp,
                "retryable": f.retryable,
            }
            for f in self.checkpoint.failed_tasks
        ]

        return {
            "total_tasks": total_tasks,
            "completed_count": completed_count,
            "failed_count": failed_count,
            "skipped_count": skipped_count,
            "success_rate": success_rate,
            "failures": failures,
        }

    def _save_checkpoint(self) -> None:
        """Save checkpoint to disk if checkpoint_dir is configured."""
        if self.checkpoint_dir is not None:
            checkpoint_path = self.checkpoint_dir / "checkpoint.json"
            self.checkpoint.save(checkpoint_path)
mcpbr/languages.py
ADDED
@@ -0,0 +1,228 @@
"""Multi-language support for code generation benchmarks.

This module provides:
- Language enum defining supported programming languages.
- LanguageConfig dataclass with per-language Docker, run, compile, and test settings.
- detect_language() to identify the language from a filename or code snippet.
- get_language_config() to retrieve configuration for a given language.
- get_supported_languages() to list all supported language names.
- CrossLanguageMetrics for comparing benchmark performance across languages.
"""

from __future__ import annotations

import os
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any


class Language(Enum):
    """Supported programming languages for code generation benchmarks."""

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    JAVA = "java"
    GO = "go"


@dataclass
class LanguageConfig:
    """Configuration for running and testing code in a specific language.

    Attributes:
        name: Human-readable language name.
        language: The Language enum member.
        file_extension: File extension including the dot (e.g., ".py").
        docker_image: Docker image used to run code in this language.
        run_command: Command template to run a file. Use {file} as placeholder.
        test_framework: Name of the default test framework for this language.
        compile_command: Optional command template to compile. None for interpreted languages.
    """

    name: str
    language: Language
    file_extension: str
    docker_image: str
    run_command: str
    test_framework: str
    compile_command: str | None = None


LANGUAGE_CONFIGS: dict[Language, LanguageConfig] = {
    Language.PYTHON: LanguageConfig(
        name="Python",
        language=Language.PYTHON,
        file_extension=".py",
        docker_image="python:3.11-slim",
        run_command="python {file}",
        test_framework="pytest",
    ),
    Language.JAVASCRIPT: LanguageConfig(
        name="JavaScript",
        language=Language.JAVASCRIPT,
        file_extension=".js",
        docker_image="node:20-slim",
        run_command="node {file}",
        test_framework="jest",
    ),
    Language.TYPESCRIPT: LanguageConfig(
        name="TypeScript",
        language=Language.TYPESCRIPT,
        file_extension=".ts",
        docker_image="node:20-slim",
        run_command="npx ts-node {file}",
        test_framework="jest",
        compile_command="npx tsc {file}",
    ),
    Language.JAVA: LanguageConfig(
        name="Java",
        language=Language.JAVA,
        file_extension=".java",
        docker_image="eclipse-temurin:17-jdk-jammy",
        run_command="java {file}",  # Requires Java 11+ single-file source execution
        test_framework="junit",
        compile_command="javac {file}",
    ),
    Language.GO: LanguageConfig(
        name="Go",
        language=Language.GO,
        file_extension=".go",
        docker_image="golang:1.21-alpine",
        run_command="go run {file}",
        test_framework="go test",
        compile_command="go build {file}",
    ),
}

# Map file extensions to languages for filename-based detection.
_EXTENSION_MAP: dict[str, Language] = {
    config.file_extension: lang for lang, config in LANGUAGE_CONFIGS.items()
}

# Ordered list of (pattern, language) tuples for code content detection.
# More specific patterns come first to avoid false positives.
_CODE_PATTERNS: list[tuple[re.Pattern[str], Language]] = [
    # Go: package declaration is highly distinctive
    (re.compile(r"^package\s+\w+", re.MULTILINE), Language.GO),
    (re.compile(r"\bfunc\s+\w+\s*\("), Language.GO),
    # Java: class declaration with access modifier
    (re.compile(r"\bpublic\s+class\s+\w+"), Language.JAVA),
    (re.compile(r"\bpublic\s+static\s+void\s+main"), Language.JAVA),
    # TypeScript: type annotations on const/let/var, or interface keyword
    (re.compile(r"\b(?:const|let|var)\s+\w+\s*:\s*\w+"), Language.TYPESCRIPT),
    (re.compile(r"\binterface\s+\w+\s*\{"), Language.TYPESCRIPT),
    # JavaScript: const/let/var without type annotations, require(), console.log
    (re.compile(r"\brequire\s*\(\s*['\"]"), Language.JAVASCRIPT),
    (re.compile(r"\bconsole\.log\s*\("), Language.JAVASCRIPT),
    (re.compile(r"\b(?:const|let|var)\s+\w+\s*="), Language.JAVASCRIPT),
    # Python: def/class with colon, import, print()
    (re.compile(r"^def\s+\w+\s*\(.*\)\s*:", re.MULTILINE), Language.PYTHON),
    (re.compile(r"^import\s+\w+", re.MULTILINE), Language.PYTHON),
    (re.compile(r"\bprint\s*\("), Language.PYTHON),
]


def detect_language(code: str | None = None, filename: str | None = None) -> Language | None:
    """Detect the programming language from a filename or code snippet.

    Filename-based detection takes priority over code content analysis.

    Args:
        code: Source code string to analyze.
        filename: Filename (with or without path) to check extension.

    Returns:
        The detected Language, or None if detection fails.
    """
    # Try filename-based detection first (higher confidence).
    if filename:
        _, ext = os.path.splitext(filename)
        if ext in _EXTENSION_MAP:
            return _EXTENSION_MAP[ext]

    # Fall back to code content analysis.
    if code:
        for pattern, language in _CODE_PATTERNS:
            if pattern.search(code):
                return language

    return None


def get_language_config(language: Language) -> LanguageConfig:
    """Get the configuration for a given language.

    Args:
        language: A Language enum member.

    Returns:
        The LanguageConfig for the specified language.
    """
    return LANGUAGE_CONFIGS[language]


def get_supported_languages() -> list[str]:
    """Return a list of all supported language name strings.

    Returns:
        List of language value strings (e.g., ["python", "javascript", ...]).
    """
    return [lang.value for lang in Language]


@dataclass
class CrossLanguageMetrics:
    """Aggregated benchmark metrics across multiple programming languages.

    Attributes:
        language_scores: Mapping of language name to its pass rate (resolved ratio).
        best_language: The language with the highest pass rate.
        worst_language: The language with the lowest pass rate.
        average_score: The mean pass rate across all languages.
    """

    language_scores: dict[str, float]
    best_language: str
    worst_language: str
    average_score: float

    @classmethod
    def from_results(cls, results: dict[str, list[dict[str, Any]]]) -> CrossLanguageMetrics:
        """Compute cross-language metrics from per-language result lists.

        Each result dict is expected to have a ``"resolved"`` boolean key.
        The pass rate for a language is the fraction of results where
        ``resolved`` is ``True``.

        Args:
            results: Mapping of language name to list of result dicts.

        Returns:
            A CrossLanguageMetrics instance with computed scores.

        Raises:
            ValueError: If results is empty or any language has an empty result list.
        """
        if not results:
            raise ValueError("results must not be empty")

        language_scores: dict[str, float] = {}
        for lang_name, lang_results in results.items():
            if not lang_results:
                raise ValueError(f"Result list for language '{lang_name}' must not be empty")
            resolved_count = sum(1 for r in lang_results if r.get("resolved", False))
            language_scores[lang_name] = resolved_count / len(lang_results)

        best_language = max(language_scores, key=language_scores.get)  # type: ignore[arg-type]
        worst_language = min(language_scores, key=language_scores.get)  # type: ignore[arg-type]
        average_score = sum(language_scores.values()) / len(language_scores)

        return cls(
            language_scores=language_scores,
            best_language=best_language,
            worst_language=worst_language,
            average_score=average_score,
        )