arbiter-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbiter/__init__.py +3 -0
- arbiter/cli/__init__.py +0 -0
- arbiter/cli/app.py +699 -0
- arbiter/cli/display.py +381 -0
- arbiter/core/__init__.py +0 -0
- arbiter/core/benchmarks.py +804 -0
- arbiter/core/config.py +137 -0
- arbiter/core/discover.py +184 -0
- arbiter/core/judge.py +193 -0
- arbiter/core/leaderboard.py +197 -0
- arbiter/core/metrics.py +367 -0
- arbiter/core/providers/__init__.py +19 -0
- arbiter/core/providers/anthropic_provider.py +133 -0
- arbiter/core/providers/base.py +62 -0
- arbiter/core/providers/factory.py +79 -0
- arbiter/core/providers/google_provider.py +126 -0
- arbiter/core/providers/ollama.py +103 -0
- arbiter/core/providers/openai_provider.py +120 -0
- arbiter/core/runner.py +257 -0
- arbiter/core/swe/__init__.py +1 -0
- arbiter/core/swe/container.py +158 -0
- arbiter/core/swe/runner.py +220 -0
- arbiter/core/swe/sandbox.py +111 -0
- arbiter/core/swe/test_packs.py +548 -0
- arbiter/dashboard/__init__.py +0 -0
- arbiter/dashboard/frontend/dist/assets/index-1tkxJouQ.css +1 -0
- arbiter/dashboard/frontend/dist/assets/index-dHa4zmvw.js +298 -0
- arbiter/dashboard/frontend/dist/index.html +16 -0
- arbiter/dashboard/server.py +426 -0
- arbiter_cli-0.1.0.dist-info/METADATA +299 -0
- arbiter_cli-0.1.0.dist-info/RECORD +35 -0
- arbiter_cli-0.1.0.dist-info/WHEEL +5 -0
- arbiter_cli-0.1.0.dist-info/entry_points.txt +2 -0
- arbiter_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- arbiter_cli-0.1.0.dist-info/top_level.txt +1 -0
arbiter/core/runner.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""Core runner -- executes prompts against multiple models in parallel."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import AsyncIterator, Callable, Optional
|
|
8
|
+
|
|
9
|
+
from arbiter.core.config import resolve_model
|
|
10
|
+
from arbiter.core.metrics import ComparisonResult, ModelMetrics
|
|
11
|
+
from arbiter.core.providers.factory import create_provider
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Callback type for streaming progress updates
|
|
15
|
+
# (model_name, token_text, metrics_snapshot)
|
|
16
|
+
StreamCallback = Callable[[str, str, ModelMetrics], None]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
async def run_single_model(
    model_spec: str,
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
    on_token: Optional[StreamCallback] = None,
) -> ModelMetrics:
    """Stream one model's answer to ``prompt`` and gather its metrics.

    Args:
        model_spec: Model specifier (e.g. "gemma4", "openai:gpt-4o").
        prompt: The prompt to send.
        system: Optional system prompt.
        image_path: Optional image for multimodal models.
        on_token: Optional callback invoked as
            ``on_token(model_name, text, metrics)`` for every non-empty chunk.

    Returns:
        The ModelMetrics for this run. If streaming raises an ``Exception``
        it is not propagated; ``metrics.output`` is set to an
        ``"[ERROR] ..."`` message instead.
    """
    config = resolve_model(model_spec)
    provider, model_name = create_provider(config)
    metrics = ModelMetrics(model=model_name, provider=config.provider)

    metrics.start()
    provider_meta: dict = {}

    try:
        stream = provider.stream_generate(
            model=model_name,
            prompt=prompt,
            system=system,
            image_path=image_path,
        )
        async for chunk in stream:
            text = chunk.text
            if text:
                # The first non-empty chunk marks time-to-first-token.
                if metrics._first_token_time is None:
                    metrics.record_first_token()
                metrics.record_token(text)
                if on_token:
                    on_token(model_name, text, metrics)

            # Merge provider metadata across chunks; it is handed to finish().
            if chunk.meta:
                provider_meta.update(chunk.meta)

            if chunk.done:
                break
    except Exception as exc:
        metrics.output = f"[ERROR] {type(exc).__name__}: {exc}"

    metrics.finish(provider_meta=provider_meta)
    return metrics
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def run_comparison(
    model_specs: list[str],
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
    on_token: Optional[StreamCallback] = None,
    sequential: bool = False,
) -> ComparisonResult:
    """Send one prompt to several models and collect their metrics.

    Args:
        model_specs: List of model specifiers.
        prompt: The prompt to send to all models.
        system: Optional system prompt.
        image_path: Optional image path for multimodal models.
        on_token: Optional callback for streaming updates.
        sequential: When True the models run one after another
            (saves memory); when False they all run concurrently
            (faster but uses more RAM).

    Returns:
        ComparisonResult with metrics for all models.
    """
    # Both strategies share the same positional signature.
    strategy = _run_sequential if sequential else _run_parallel
    return await strategy(model_specs, prompt, system, image_path, on_token)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def _run_parallel(
    model_specs: list[str],
    prompt: str,
    system: Optional[str],
    image_path: Optional[str],
    on_token: Optional[StreamCallback],
) -> ComparisonResult:
    """Run all models concurrently (faster, more memory).

    Args:
        model_specs: List of model specifiers.
        prompt: The prompt to send to all models.
        system: Optional system prompt.
        image_path: Optional image path for multimodal models.
        on_token: Optional callback for streaming updates.

    Returns:
        ComparisonResult whose ``models`` list is in the same order as
        ``model_specs``. A model whose run raised is represented by a
        placeholder ModelMetrics carrying an ``"[ERROR] ..."`` output.
    """
    runs = [
        run_single_model(
            model_spec=spec,
            prompt=prompt,
            system=system,
            image_path=image_path,
            on_token=on_token,
        )
        for spec in model_specs
    ]

    # gather() returns results in the order the awaitables were passed in.
    outcomes = await asyncio.gather(*runs, return_exceptions=True)

    model_metrics = []
    for spec, outcome in zip(model_specs, outcomes):
        # With return_exceptions=True a cancelled run is returned as
        # CancelledError, which derives from BaseException rather than
        # Exception, so check the base class to avoid appending a raw
        # exception object to the metrics list.
        if isinstance(outcome, BaseException):
            outcome = ModelMetrics(
                model=spec,
                provider="unknown",
                output=f"[ERROR] {type(outcome).__name__}: {outcome}",
            )
        model_metrics.append(outcome)

    return ComparisonResult(
        prompt=prompt,
        models=model_metrics,
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
async def _run_sequential(
    model_specs: list[str],
    prompt: str,
    system: Optional[str],
    image_path: Optional[str],
    on_token: Optional[StreamCallback],
) -> ComparisonResult:
    """Run the models one after another, awaiting each before the next starts.

    Intended for low-RAM machines (8GB). Timing is still fair because each
    model gets the machine to itself with no contention.

    A model whose run raises is recorded as a placeholder ModelMetrics with
    an ``"[ERROR] ..."`` output, and the loop continues with the next model.
    """
    collected = []

    for spec in model_specs:
        try:
            entry = await run_single_model(
                model_spec=spec,
                prompt=prompt,
                system=system,
                image_path=image_path,
                on_token=on_token,
            )
        except Exception as exc:
            entry = ModelMetrics(
                model=spec,
                provider="unknown",
                output=f"[ERROR] {type(exc).__name__}: {exc}",
            )
        collected.append(entry)

    stamp = datetime.now(timezone.utc).isoformat()
    return ComparisonResult(prompt=prompt, models=collected, timestamp=stamp)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
async def stream_comparison(
    model_specs: list[str],
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
) -> AsyncIterator[tuple[str, str, ModelMetrics | None]]:
    """Stream a comparison, yielding (event_type, model_name, metrics) tuples.

    Event types:
        "start" - model generation started
        "token" - a token was received
        "done"  - model generation complete
        "error" - model encountered an error (including a failure to
                  resolve the model or create its provider; in that case
                  ``model_name`` is the raw spec)

    All models run concurrently as asyncio tasks that push events into one
    shared queue; this generator drains that queue until every task has
    posted its end-of-stream sentinel.

    This is used by the WebSocket server for real-time dashboard updates.
    """
    merged_queue: asyncio.Queue = asyncio.Queue()

    async def _run_model(spec: str, queue: asyncio.Queue) -> None:
        """Run one model, pushing its events and finally a ``None`` sentinel."""
        try:
            try:
                config = resolve_model(spec)
                provider, model_name = create_provider(config)
                metrics = ModelMetrics(model=model_name, provider=config.provider)
            except Exception as e:
                # Setup failed before streaming began: report it as an error
                # event instead of letting the task die silently.
                failed = ModelMetrics(
                    model=spec,
                    provider="unknown",
                    output=f"[ERROR] {type(e).__name__}: {e}",
                )
                await queue.put(("error", spec, failed))
                return

            await queue.put(("start", model_name, metrics))
            metrics.start()
            last_meta: dict = {}

            try:
                async for chunk in provider.stream_generate(
                    model=model_name, prompt=prompt, system=system, image_path=image_path
                ):
                    if chunk.text:
                        if metrics._first_token_time is None:
                            metrics.record_first_token()
                        metrics.record_token(chunk.text)
                        await queue.put(("token", model_name, metrics))

                    if chunk.meta:
                        last_meta.update(chunk.meta)

                    if chunk.done:
                        break

                metrics.finish(provider_meta=last_meta)
                await queue.put(("done", model_name, metrics))

            except Exception as e:
                metrics.output = f"[ERROR] {type(e).__name__}: {e}"
                metrics.finish()
                await queue.put(("error", model_name, metrics))
        finally:
            # Always post the sentinel, otherwise the consumer loop below
            # would wait forever for this task. The queue is unbounded, so
            # put_nowait cannot raise QueueFull.
            queue.put_nowait(None)

    # Start all models
    tasks: list[asyncio.Task] = [
        asyncio.create_task(_run_model(spec, merged_queue)) for spec in model_specs
    ]

    remaining = len(tasks)
    try:
        while remaining > 0:
            item = await merged_queue.get()
            if item is None:
                remaining -= 1
                continue
            yield item
    finally:
        # Runs on normal completion and when the consumer closes the
        # generator early, so no model task is left running.
        for task in tasks:
            if not task.done():
                task.cancel()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""SWE-bench style testing -- real code execution in Docker containers."""
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Docker container management for safe code execution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import subprocess
|
|
7
|
+
import tempfile
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ContainerResult:
    """Outcome of one test run executed inside a Docker container."""

    exit_code: int      # process return code (-1 is used for a timeout)
    stdout: str         # captured standard output (may be truncated by the producer)
    stderr: str         # captured standard error (may be truncated by the producer)
    tests_passed: int   # number of tests reported as passed
    tests_failed: int   # number of tests reported as failed or errored
    tests_total: int    # tests_passed + tests_failed as computed by the producer
    duration_s: float   # wall-clock duration of the run, in seconds

    @property
    def all_passed(self) -> bool:
        """True only when at least one test passed and none failed."""
        return self.tests_passed > 0 and self.tests_failed == 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def check_docker() -> bool:
    """Report whether ``docker info`` runs successfully.

    Returns:
        True if the command exits with status 0; False if it exits non-zero,
        the ``docker`` executable is not found, or it takes longer than
        10 seconds.
    """
    probe = ["docker", "info"]
    try:
        completed = subprocess.run(
            probe,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return False
    else:
        return completed.returncode == 0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def build_test_image(test_pack_dir: Path, tag: str = "arbiter-test") -> bool:
    """Build a Docker image from a test pack directory.

    If the directory has no ``Dockerfile``, a default Python 3.11 one that
    installs ``requirements.txt`` (best effort) and pytest is written into
    the directory before building.

    Args:
        test_pack_dir: Directory used as the Docker build context.
        tag: Tag passed to ``docker build -t``.

    Returns:
        True if ``docker build`` exits with status 0.
    """
    dockerfile_path = test_pack_dir / "Dockerfile"

    if not dockerfile_path.exists():
        # Generate a default Python Dockerfile
        default_dockerfile = (
            "FROM python:3.11-slim\n"
            "WORKDIR /workspace\n"
            "COPY requirements.txt* ./\n"
            "RUN pip install --no-cache-dir -r requirements.txt 2>/dev/null || true\n"
            "RUN pip install --no-cache-dir pytest\n"
            "COPY . .\n"
        )
        dockerfile_path.write_text(default_dockerfile)

    build_cmd = ["docker", "build", "-t", tag, "."]
    build = subprocess.run(
        build_cmd,
        cwd=str(test_pack_dir),
        capture_output=True,
        text=True,
        timeout=120,
    )
    return build.returncode == 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def run_in_container(
    image: str,
    fix_code: str,
    test_code: str,
    test_command: str = "python -m pytest /workspace/test_fix.py -v --tb=short",
    timeout: int = 60,
) -> ContainerResult:
    """Run a model's fix inside a Docker container against a test suite.

    1. Writes the model's fix to a temp dir as fix.py
    2. Writes the test suite as test_fix.py
    3. Mounts the temp dir into the container at /workspace
    4. Runs ``test_command`` via ``sh -c``
    5. Returns the parsed results

    Network is disabled. Memory limited to 512MB. CPU limited to 1 core.

    Args:
        image: Docker image to run.
        fix_code: Source written to ``fix.py``.
        test_code: Source written to ``test_fix.py``.
        test_command: Shell command executed inside the container.
        timeout: Seconds to wait for ``docker run`` before giving up.

    Returns:
        ContainerResult with the exit code, the tail of stdout/stderr and
        the parsed pass/fail counts. On timeout the result has
        ``exit_code=-1``, ``stderr="Timeout"`` and zero counts.
    """
    import shutil
    import time
    import uuid

    tmpdir = tempfile.mkdtemp(prefix="arbiter-swe-")
    # A unique name lets us stop the container if the docker CLI times out.
    container_name = f"arbiter-swe-{uuid.uuid4().hex}"

    try:
        # Written inside the try so the temp dir is removed even if a write
        # fails. UTF-8 is explicit because model output is often non-ASCII
        # and the locale default encoding may not be able to represent it.
        (Path(tmpdir) / "fix.py").write_text(fix_code, encoding="utf-8")
        (Path(tmpdir) / "test_fix.py").write_text(test_code, encoding="utf-8")

        start = time.perf_counter()

        cmd = [
            "docker", "run", "--rm",
            "--name", container_name,
            "--network=none",
            "--memory=512m",
            "--cpus=1",
            "-v", f"{tmpdir}:/workspace",
            image,
            "sh", "-c", test_command,
        ]

        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            # subprocess.run only kills the docker CLI process; the container
            # itself can keep running, so stop it explicitly (best effort).
            try:
                subprocess.run(
                    ["docker", "kill", container_name],
                    capture_output=True, timeout=10,
                )
            except (OSError, subprocess.SubprocessError):
                pass
            return ContainerResult(
                exit_code=-1, stdout="", stderr="Timeout",
                tests_passed=0, tests_failed=0, tests_total=0,
                duration_s=time.perf_counter() - start,
            )

        elapsed = time.perf_counter() - start
        stdout = result.stdout
        stderr = result.stderr

        passed, failed, total = _parse_pytest_output(stdout + stderr)

        return ContainerResult(
            exit_code=result.returncode,
            # Keep only the tail of each stream.
            stdout=stdout[-2000:],
            stderr=stderr[-1000:],
            tests_passed=passed,
            tests_failed=failed,
            tests_total=total,
            duration_s=elapsed,
        )
    finally:
        # ignore_errors: cleanup is best effort and must not mask the result.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _parse_pytest_output(output: str) -> tuple[int, int, int]:
    """Parse pytest output to extract pass/fail counts.

    The counts are read from the last pytest summary line
    (e.g. ``==== 5 passed, 2 failed in 1.23s ====``) when one is present,
    so text printed by the tests themselves (such as "10 passed") cannot be
    mistaken for the result. If no such line is found, the whole output is
    searched.

    Args:
        output: Combined text output of a pytest run.

    Returns:
        ``(passed, failed, total)`` where ``failed`` includes errors and
        ``total`` is ``passed + failed``. All zeros if nothing matches.
    """
    import re

    # Summary banner: "=" rule, counts, "in <seconds>s", "=" rule.
    summary_lines = re.findall(
        r"^=+ .*\bin [\d.]+ ?s.*=+\s*$", output, re.MULTILINE
    )
    haystack = summary_lines[-1] if summary_lines else output

    def _count(label: str) -> int:
        found = re.search(rf"(\d+) {label}", haystack)
        return int(found.group(1)) if found else 0

    passed = _count("passed")
    # "error" also matches the plural "errors"; errors count as failures.
    failed = _count("failed") + _count("error")

    return passed, failed, passed + failed
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""SWE test runner -- sends buggy code to models, verifies fixes in Docker."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from arbiter.core.config import resolve_model
|
|
12
|
+
from arbiter.core.providers.factory import create_provider
|
|
13
|
+
from arbiter.core.swe.container import ContainerResult, check_docker, run_in_container
|
|
14
|
+
from arbiter.core.swe.sandbox import run_in_sandbox, check_sandbox
|
|
15
|
+
from arbiter.core.swe.test_packs import BUILT_IN_PACKS, TestPack, TestCase
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class SWETestResult:
    """Outcome of a single model attempting a single SWE test case."""

    # Required identification and verdict fields.
    model: str               # model specifier the case was run against
    test_case: str           # name of the test case
    category: str            # category of the test case
    passed: bool             # overall verdict for this case
    tests_passed: int        # number of individual tests that passed
    tests_total: int         # number of individual tests counted

    # Optional diagnostics.
    model_response: str = ""     # (possibly truncated) code returned by the model
    container_output: str = ""   # (possibly truncated) output of the test run
    duration_s: float = 0.0      # wall-clock seconds spent on this case
    error: Optional[str] = None  # error description, or None if no error was recorded
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class SWESuiteResult:
    """Aggregated outcome of running a full SWE suite against one model."""

    model: str
    results: list[SWETestResult] = field(default_factory=list)
    total_passed: int = 0
    total_tests: int = 0
    total_duration_s: float = 0.0
    categories: dict = field(default_factory=dict)  # category -> {passed, total}

    def compute(self) -> None:
        """Recalculate the totals and per-category tallies from ``results``."""
        outcomes = self.results

        self.total_tests = len(outcomes)
        self.total_passed = sum(1 for outcome in outcomes if outcome.passed)
        self.total_duration_s = sum(outcome.duration_s for outcome in outcomes)

        tallies: dict[str, dict] = {}
        for outcome in outcomes:
            bucket = tallies.setdefault(outcome.category, {"passed": 0, "total": 0})
            bucket["total"] += 1
            if outcome.passed:
                bucket["passed"] += 1
        self.categories = tallies

    @property
    def pass_rate(self) -> float:
        """Fraction of cases that passed, or 0 when there are no cases."""
        if self.total_tests > 0:
            return self.total_passed / self.total_tests
        return 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary with rounded rates and durations."""
        per_case = []
        for outcome in self.results:
            per_case.append(
                {
                    "test_case": outcome.test_case,
                    "category": outcome.category,
                    "passed": outcome.passed,
                    "tests_passed": outcome.tests_passed,
                    "tests_total": outcome.tests_total,
                    "duration_s": round(outcome.duration_s, 2),
                    "error": outcome.error,
                }
            )

        summary = {
            "model": self.model,
            "total_passed": self.total_passed,
            "total_tests": self.total_tests,
            # pass_rate is a fraction; report it as a percentage.
            "pass_rate": round(self.pass_rate * 100, 1),
            "total_duration_s": round(self.total_duration_s, 1),
            "categories": self.categories,
            "results": per_case,
        }
        return summary
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# System prompt sent with every SWE test case: it asks the model to reply
# with nothing but the corrected file.
SWE_SYSTEM_PROMPT = (
    "You are a senior software engineer fixing bugs in a codebase. "
    "You will be given the current code and a bug report. "
    "Return ONLY the complete fixed file. "
    "No explanation, no markdown fencing, "
    "just the corrected Python code."
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def run_swe_test(
    model_spec: str,
    test_case: TestCase,
    use_docker: bool = False,
) -> SWETestResult:
    """Run one SWE test case against one model.

    1. Send the buggy code + issue to the model
    2. Get the model's fix (stripping markdown fencing if present)
    3. Run the fix against the test suite -- in a Docker container when
       ``use_docker`` is True and Docker is available, otherwise in the sandbox
    4. Report pass/fail

    Args:
        model_spec: Model specifier to test.
        test_case: The case providing the issue text, buggy code and tests.
        use_docker: Prefer Docker execution over the sandbox.

    Returns:
        SWETestResult. Failures while streaming from the model or while
        executing the tests are reported through ``error`` with
        ``passed=False`` rather than raised.
    """
    import re

    start = time.perf_counter()

    # Build prompt
    prompt = (
        f"## Bug Report\n{test_case.issue}\n\n"
        f"## Current Code ({test_case.filename})\n"
        f"```python\n{test_case.buggy_code}\n```\n\n"
        "Fix the bug. Return ONLY the complete corrected file."
    )

    # Get model's fix
    config = resolve_model(model_spec)
    provider, model_name = create_provider(config)

    parts: list[str] = []
    try:
        async for chunk in provider.stream_generate(
            model=model_name, prompt=prompt, system=SWE_SYSTEM_PROMPT,
        ):
            # Chunks that only carry metadata may have no text; skipping them
            # keeps "".join() below from failing on a None entry.
            if chunk.text:
                parts.append(chunk.text)
            if chunk.done:
                break
    except Exception as e:
        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category, passed=False,
            tests_passed=0, tests_total=test_case.expected_tests,
            error=f"Model error: {e}",
            duration_s=time.perf_counter() - start,
        )

    model_response = "".join(parts).strip()

    # Clean markdown fencing if present
    if "```" in model_response:
        # Prefer a fence whose opening line holds only an optional language
        # tag (```py, ```python3, ...), so the tag never leaks into the code.
        # Fall back to the looser pattern for fences without a line break.
        match = re.search(
            r"```[\w+#.-]*[ \t]*\r?\n(.*?)```", model_response, re.DOTALL
        ) or re.search(r"```(?:python)?\s*(.*?)```", model_response, re.DOTALL)
        if match:
            model_response = match.group(1).strip()

    # Run tests -- sandbox (default) or Docker
    try:
        if use_docker and check_docker():
            result = run_in_container(
                image=test_case.docker_image,
                fix_code=model_response,
                test_code=test_case.test_code,
                test_command=test_case.test_command,
                timeout=test_case.timeout,
            )
        else:
            result = run_in_sandbox(
                fix_code=model_response,
                test_code=test_case.test_code,
                timeout=test_case.timeout,
            )

        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category,
            passed=result.all_passed,
            tests_passed=result.tests_passed,
            # Fall back to the expected count when nothing could be parsed.
            tests_total=result.tests_total or test_case.expected_tests,
            model_response=model_response[:1000],
            container_output=result.stdout[:500],
            duration_s=time.perf_counter() - start,
        )

    except Exception as e:
        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category, passed=False,
            tests_passed=0, tests_total=test_case.expected_tests,
            error=f"Execution error: {e}",
            model_response=model_response[:500],
            duration_s=time.perf_counter() - start,
        )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
async def run_swe_suite(
    model_spec: str,
    test_pack: Optional[str] = None,
    use_docker: bool = False,
    on_progress=None,
) -> SWESuiteResult:
    """Run every case of the selected test packs against one model.

    Args:
        model_spec: Model specifier to test.
        test_pack: Name of a single built-in pack to run; None runs all
            built-in packs.
        use_docker: Forwarded to ``run_swe_test``.
        on_progress: Optional callback invoked before each case as
            ``on_progress(model_spec, index, total, case_name)`` with a
            1-based index.

    Returns:
        SWESuiteResult with one entry per case and computed totals.
    """
    if test_pack is None:
        selected_packs = BUILT_IN_PACKS
    else:
        selected_packs = [pack for pack in BUILT_IN_PACKS if pack.name == test_pack]

    cases = [case for pack in selected_packs for case in pack.cases]
    case_count = len(cases)

    suite = SWESuiteResult(model=model_spec)

    # Cases run strictly one after another.
    for position, case in enumerate(cases, start=1):
        if on_progress:
            on_progress(model_spec, position, case_count, case.name)

        outcome = await run_swe_test(model_spec, case, use_docker=use_docker)
        suite.results.append(outcome)

    suite.compute()
    return suite
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
async def run_swe_comparison(
    model_specs: list[str],
    test_pack: Optional[str] = None,
    use_docker: bool = False,
) -> list[SWESuiteResult]:
    """Run the SWE suite for each model in turn (sequentially for fairness).

    Returns:
        One SWESuiteResult per entry of ``model_specs``, in the same order.
    """
    # Each suite is awaited to completion before the next model starts.
    return [
        await run_swe_suite(spec, test_pack=test_pack, use_docker=use_docker)
        for spec in model_specs
    ]
|