arbiter-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arbiter/core/runner.py ADDED
@@ -0,0 +1,257 @@
1
+ """Core runner -- executes prompts against multiple models in parallel."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from datetime import datetime, timezone
7
+ from typing import AsyncIterator, Callable, Optional
8
+
9
+ from arbiter.core.config import resolve_model
10
+ from arbiter.core.metrics import ComparisonResult, ModelMetrics
11
+ from arbiter.core.providers.factory import create_provider
12
+
13
+
14
# Signature of the optional progress hook invoked while a model streams.
# It is called positionally as (model_name, token_text, metrics_snapshot)
# and its return value is ignored.
StreamCallback = Callable[[str, str, ModelMetrics], None]
17
+
18
+
19
async def run_single_model(
    model_spec: str,
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
    on_token: Optional[StreamCallback] = None,
) -> ModelMetrics:
    """Stream one model's reply to ``prompt`` and gather its metrics.

    Args:
        model_spec: Model specifier (e.g. "gemma4", "openai:gpt-4o").
        prompt: The prompt to send.
        system: Optional system prompt.
        image_path: Optional image for multimodal models.
        on_token: Optional callback fired for every non-empty text chunk.

    Returns:
        The ModelMetrics for this run. Exceptions raised while streaming
        (including ones raised by ``on_token``) are not propagated; they are
        written to ``metrics.output`` as an "[ERROR] ..." string.

    Raises:
        Whatever ``resolve_model`` / ``create_provider`` raise -- both are
        called before the guarded streaming section.
    """
    resolved = resolve_model(model_spec)
    provider, model_name = create_provider(resolved)

    metrics = ModelMetrics(
        model=model_name,
        provider=resolved.provider,
    )
    metrics.start()

    # Accumulates provider metadata across chunks; later chunks win on key clashes.
    provider_meta: dict = {}

    try:
        stream = provider.stream_generate(
            model=model_name,
            prompt=prompt,
            system=system,
            image_path=image_path,
        )
        async for chunk in stream:
            if chunk.text:
                # The first non-empty chunk marks time-to-first-token.
                if metrics._first_token_time is None:
                    metrics.record_first_token()
                metrics.record_token(chunk.text)

                if on_token:
                    on_token(model_name, chunk.text, metrics)

            if chunk.meta:
                provider_meta.update(chunk.meta)

            if chunk.done:
                break
    except Exception as exc:
        metrics.output = f"[ERROR] {type(exc).__name__}: {exc}"

    # finish() runs on both the success and the error path.
    metrics.finish(provider_meta=provider_meta)
    return metrics
75
+
76
+
77
async def run_comparison(
    model_specs: list[str],
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
    on_token: Optional[StreamCallback] = None,
    sequential: bool = False,
) -> ComparisonResult:
    """Send one prompt to several models and collect all their metrics.

    Args:
        model_specs: List of model specifiers.
        prompt: The prompt to send to all models.
        system: Optional system prompt.
        image_path: Optional image path for multimodal models.
        on_token: Optional callback for streaming updates.
        sequential: If True, models are run one at a time (intended to save
            memory). If False, all models are run concurrently (intended to
            be faster at the cost of more RAM).

    Returns:
        ComparisonResult with metrics for all models.
    """
    # Pick the execution strategy once, then delegate with identical arguments.
    strategy = _run_sequential if sequential else _run_parallel
    return await strategy(model_specs, prompt, system, image_path, on_token)
103
+
104
+
105
async def _run_parallel(
    model_specs: list[str],
    prompt: str,
    system: Optional[str],
    image_path: Optional[str],
    on_token: Optional[StreamCallback],
) -> ComparisonResult:
    """Run all models concurrently and collect one metrics entry per spec.

    Args:
        model_specs: Model specifiers; the result keeps this order.
        prompt: The prompt sent to every model.
        system: Optional system prompt.
        image_path: Optional image path for multimodal models.
        on_token: Optional streaming callback shared by all models.

    Returns:
        ComparisonResult stamped with the current UTC time. A model whose run
        raised is represented by a placeholder ModelMetrics whose ``output``
        is an "[ERROR] ..." string and whose provider is "unknown".
    """
    pending = [
        run_single_model(
            model_spec=spec,
            prompt=prompt,
            system=system,
            image_path=image_path,
            on_token=on_token,
        )
        for spec in model_specs
    ]

    # return_exceptions=True keeps one failing model from aborting the others.
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    model_metrics = []
    for spec, outcome in zip(model_specs, outcomes):
        # gather() can hand back BaseException subclasses too (for example
        # asyncio.CancelledError from a single child); checking only
        # `Exception` would let such an object through as if it were metrics.
        if isinstance(outcome, BaseException):
            outcome = ModelMetrics(
                model=spec,
                provider="unknown",
                output=f"[ERROR] {type(outcome).__name__}: {outcome}",
            )
        model_metrics.append(outcome)

    return ComparisonResult(
        prompt=prompt,
        models=model_metrics,
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
143
+
144
+
145
async def _run_sequential(
    model_specs: list[str],
    prompt: str,
    system: Optional[str],
    image_path: Optional[str],
    on_token: Optional[StreamCallback],
) -> ComparisonResult:
    """Run the models strictly one after another.

    Each model is awaited to completion before the next one is started, so at
    most one model is generating at any time. The original author intends
    this mode for low-RAM machines (8GB) and for contention-free timing.

    A model whose run raises is recorded as a placeholder ModelMetrics with
    provider "unknown" and an "[ERROR] ..." output; the loop then continues.
    """
    collected = []

    for spec in model_specs:
        try:
            entry = await run_single_model(
                model_spec=spec,
                prompt=prompt,
                system=system,
                image_path=image_path,
                on_token=on_token,
            )
        except Exception as exc:
            entry = ModelMetrics(
                model=spec,
                provider="unknown",
                output=f"[ERROR] {type(exc).__name__}: {exc}",
            )
        collected.append(entry)

    stamp = datetime.now(timezone.utc).isoformat()
    return ComparisonResult(prompt=prompt, models=collected, timestamp=stamp)
182
+
183
+
184
async def stream_comparison(
    model_specs: list[str],
    prompt: str,
    system: Optional[str] = None,
    image_path: Optional[str] = None,
) -> AsyncIterator[tuple[str, str, ModelMetrics | None]]:
    """Stream a comparison, yielding (event_type, model_name, data) tuples.

    Event types:
        "start" - model generation started
        "token" - a token was received
        "done"  - model generation complete
        "error" - model encountered an error

    One task per model spec feeds a single shared queue; this generator
    drains that queue until every task has posted its end-of-stream sentinel.
    The ``data`` element is the model's live ModelMetrics object (the same
    instance is re-sent on every event for that model).

    If a model cannot even be set up (``resolve_model`` / ``create_provider``
    raises), an "error" event is emitted with the raw spec as the model name
    and a placeholder ModelMetrics whose provider is "unknown".

    This is used by the WebSocket server for real-time dashboard updates.
    """
    merged_queue: asyncio.Queue = asyncio.Queue()

    async def _run_model(spec: str, queue: asyncio.Queue) -> None:
        """Stream one model into ``queue``; always ends with a None sentinel."""
        try:
            try:
                config = resolve_model(spec)
                provider, model_name = create_provider(config)
                metrics = ModelMetrics(model=model_name, provider=config.provider)
            except Exception as e:
                # Setup failed before a model name is known: report under the
                # spec so the consumer still receives an event for this model.
                failed = ModelMetrics(
                    model=spec,
                    provider="unknown",
                    output=f"[ERROR] {type(e).__name__}: {e}",
                )
                await queue.put(("error", spec, failed))
                return

            await queue.put(("start", model_name, metrics))
            metrics.start()
            last_meta: dict = {}

            try:
                async for chunk in provider.stream_generate(
                    model=model_name, prompt=prompt, system=system, image_path=image_path
                ):
                    if chunk.text:
                        if metrics._first_token_time is None:
                            metrics.record_first_token()
                        metrics.record_token(chunk.text)
                        await queue.put(("token", model_name, metrics))

                    if chunk.meta:
                        last_meta.update(chunk.meta)

                    if chunk.done:
                        break

                metrics.finish(provider_meta=last_meta)
                await queue.put(("done", model_name, metrics))

            except Exception as e:
                metrics.output = f"[ERROR] {type(e).__name__}: {e}"
                metrics.finish()
                await queue.put(("error", model_name, metrics))
        finally:
            # The sentinel must be posted on every exit path, otherwise the
            # consumer below would wait forever. The queue is unbounded, so
            # put_nowait cannot fail with QueueFull and never suspends.
            queue.put_nowait(None)

    # Start all models
    tasks: list[asyncio.Task] = [
        asyncio.create_task(_run_model(spec, merged_queue)) for spec in model_specs
    ]
    remaining = len(tasks)

    try:
        while remaining > 0:
            item = await merged_queue.get()
            if item is None:
                remaining -= 1
                continue
            yield item
    finally:
        # Runs on normal completion and when the consumer stops early or is
        # cancelled: stop any model still streaming and reap the tasks.
        for task in tasks:
            if not task.done():
                task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
@@ -0,0 +1 @@
1
+ """SWE-bench style testing -- real code execution in Docker containers."""
@@ -0,0 +1,158 @@
1
+ """Docker container management for safe code execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import tempfile
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+
13
@dataclass
class ContainerResult:
    """Outcome of one test run executed inside a Docker container."""

    exit_code: int     # return code of the run (-1 is used for a timeout)
    stdout: str        # captured standard output
    stderr: str        # captured standard error
    tests_passed: int  # number of tests counted as passed
    tests_failed: int  # number of tests counted as failed
    tests_total: int   # total number of tests counted
    duration_s: float  # elapsed wall-clock time in seconds

    @property
    def all_passed(self) -> bool:
        """True only when at least one test passed and none failed."""
        if self.tests_passed <= 0:
            return False
        return self.tests_failed == 0
27
+
28
+
29
def check_docker() -> bool:
    """Report whether ``docker info`` succeeds within 10 seconds.

    Returns False when the docker executable cannot be found, when the
    command times out, or when it exits with a non-zero status.
    """
    probe_cmd = ["docker", "info"]
    try:
        probe = subprocess.run(
            probe_cmd,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return False
    return probe.returncode == 0
39
+
40
+
41
def build_test_image(test_pack_dir: Path, tag: str = "arbiter-test") -> bool:
    """Build a Docker image for a test pack.

    If ``test_pack_dir`` has no ``Dockerfile``, a default Python 3.11 one is
    written into that directory first (it installs ``requirements.txt`` when
    present, installs pytest, and copies the pack into ``/workspace``).

    Args:
        test_pack_dir: Directory used as the build context.
        tag: Tag given to the built image.

    Returns:
        True when ``docker build`` exits with status 0. False when the build
        fails, exceeds the 120 second limit, or the docker executable cannot
        be started.
    """
    dockerfile = test_pack_dir / "Dockerfile"
    if not dockerfile.exists():
        # Generate a default Python Dockerfile
        dockerfile.write_text(
            "FROM python:3.11-slim\n"
            "WORKDIR /workspace\n"
            "COPY requirements.txt* ./\n"
            "RUN pip install --no-cache-dir -r requirements.txt 2>/dev/null || true\n"
            "RUN pip install --no-cache-dir pytest\n"
            "COPY . .\n",
            encoding="utf-8",
        )

    try:
        result = subprocess.run(
            ["docker", "build", "-t", tag, "."],
            cwd=str(test_pack_dir),
            capture_output=True, text=True, timeout=120,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # Mirror check_docker(): a missing binary or a hung build is reported
        # as "not built" instead of escaping from a bool-returning function.
        return False
    return result.returncode == 0
64
+
65
+
66
def run_in_container(
    image: str,
    fix_code: str,
    test_code: str,
    test_command: str = "python -m pytest /workspace/test_fix.py -v --tb=short",
    timeout: int = 60,
) -> ContainerResult:
    """Run a model's fix inside a Docker container against a test suite.

    1. Writes the model's fix to a temp dir as fix.py
    2. Writes the test suite as test_fix.py
    3. Mounts that temp dir into the container at /workspace
    4. Runs ``test_command`` through ``sh -c``
    5. Returns results parsed from the combined stdout/stderr

    Network is disabled. Memory limited to 512MB. CPU limited to 1 core.

    Args:
        image: Docker image to run.
        fix_code: Source text written to fix.py.
        test_code: Source text written to test_fix.py.
        test_command: Shell command executed inside the container.
        timeout: Seconds to wait for ``docker run`` before giving up.

    Returns:
        ContainerResult holding the exit code, the last 2000 characters of
        stdout, the last 1000 characters of stderr, the parsed test counts and
        the elapsed time. On timeout the result has exit_code -1, stderr
        "Timeout" and zero counts.

    Raises:
        Errors other than the timeout (for example a missing docker
        executable) propagate to the caller.
    """
    import shutil
    import time

    tmpdir = tempfile.mkdtemp(prefix="arbiter-swe-")
    # The temp dir's unique basename doubles as the container name so the
    # container can be addressed (and killed) if the docker CLI times out.
    container_name = Path(tmpdir).name
    start = time.perf_counter()

    try:
        # Written inside the try so the temp dir is removed even if a write
        # fails. UTF-8 is explicit so model output containing non-ASCII text
        # does not depend on the host's locale encoding.
        (Path(tmpdir) / "fix.py").write_text(fix_code, encoding="utf-8")
        (Path(tmpdir) / "test_fix.py").write_text(test_code, encoding="utf-8")

        start = time.perf_counter()

        # NOTE(review): the bind mount replaces whatever the image has at
        # /workspace -- confirm that is intended for images built from a pack.
        cmd = [
            "docker", "run", "--rm",
            "--name", container_name,
            "--network=none",
            "--memory=512m",
            "--cpus=1",
            "-v", f"{tmpdir}:/workspace",
            image,
            "sh", "-c", test_command,
        ]

        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout,
        )

        elapsed = time.perf_counter() - start
        stdout = result.stdout
        stderr = result.stderr

        passed, failed, total = _parse_pytest_output(stdout + stderr)

        return ContainerResult(
            exit_code=result.returncode,
            stdout=stdout[-2000:],
            stderr=stderr[-1000:],
            tests_passed=passed,
            tests_failed=failed,
            tests_total=total,
            duration_s=elapsed,
        )

    except subprocess.TimeoutExpired:
        # subprocess.run only kills the docker CLI process on timeout; the
        # container itself would keep running, so stop it explicitly.
        try:
            subprocess.run(
                ["docker", "kill", container_name],
                capture_output=True, text=True, timeout=10,
            )
        except (OSError, subprocess.SubprocessError):
            pass  # best effort: the timeout result below is still returned
        return ContainerResult(
            exit_code=-1, stdout="", stderr="Timeout",
            tests_passed=0, tests_failed=0, tests_total=0,
            duration_s=time.perf_counter() - start,
        )
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
134
+
135
+
136
+ def _parse_pytest_output(output: str) -> tuple[int, int, int]:
137
+ """Parse pytest output to extract pass/fail counts."""
138
+ import re
139
+
140
+ # Look for "X passed, Y failed" pattern
141
+ passed = 0
142
+ failed = 0
143
+
144
+ # pytest summary line: "5 passed, 2 failed in 1.23s"
145
+ match = re.search(r"(\d+) passed", output)
146
+ if match:
147
+ passed = int(match.group(1))
148
+
149
+ match = re.search(r"(\d+) failed", output)
150
+ if match:
151
+ failed = int(match.group(1))
152
+
153
+ match = re.search(r"(\d+) error", output)
154
+ if match:
155
+ failed += int(match.group(1))
156
+
157
+ total = passed + failed
158
+ return passed, failed, total
@@ -0,0 +1,220 @@
1
+ """SWE test runner -- sends buggy code to models, verifies fixes in Docker."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from arbiter.core.config import resolve_model
12
+ from arbiter.core.providers.factory import create_provider
13
+ from arbiter.core.swe.container import ContainerResult, check_docker, run_in_container
14
+ from arbiter.core.swe.sandbox import run_in_sandbox, check_sandbox
15
+ from arbiter.core.swe.test_packs import BUILT_IN_PACKS, TestPack, TestCase
16
+
17
+
18
@dataclass
class SWETestResult:
    """Outcome of a single model attempting a single SWE test case."""

    # Identification of the attempt.
    model: str
    test_case: str
    category: str

    # Verdict and the test counts behind it.
    passed: bool
    tests_passed: int
    tests_total: int

    # Optional details; every one of them has an "empty" default.
    model_response: str = ""
    container_output: str = ""
    duration_s: float = 0.0
    error: Optional[str] = None
31
+
32
+
33
@dataclass
class SWESuiteResult:
    """Aggregated results of running a whole SWE suite against one model."""

    model: str
    results: list[SWETestResult] = field(default_factory=list)
    total_passed: int = 0
    total_tests: int = 0
    total_duration_s: float = 0.0
    categories: dict = field(default_factory=dict)  # category -> {passed, total}

    def compute(self) -> None:
        """Recalculate every aggregate field from ``self.results``."""
        outcomes = self.results
        self.total_tests = len(outcomes)
        self.total_passed = len([o for o in outcomes if o.passed])
        self.total_duration_s = sum(o.duration_s for o in outcomes)

        by_category: dict[str, dict] = {}
        for outcome in outcomes:
            bucket = by_category.setdefault(
                outcome.category, {"passed": 0, "total": 0}
            )
            bucket["total"] += 1
            if outcome.passed:
                bucket["passed"] += 1
        self.categories = by_category

    @property
    def pass_rate(self) -> float:
        """Fraction of test cases passed, or 0 when there are no test cases."""
        if self.total_tests > 0:
            return self.total_passed / self.total_tests
        return 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary (pass rate as a percentage, 1 decimal)."""
        rows = []
        for r in self.results:
            rows.append(
                {
                    "test_case": r.test_case,
                    "category": r.category,
                    "passed": r.passed,
                    "tests_passed": r.tests_passed,
                    "tests_total": r.tests_total,
                    "duration_s": round(r.duration_s, 2),
                    "error": r.error,
                }
            )
        return {
            "model": self.model,
            "total_passed": self.total_passed,
            "total_tests": self.total_tests,
            "pass_rate": round(self.pass_rate * 100, 1),
            "total_duration_s": round(self.total_duration_s, 1),
            "categories": self.categories,
            "results": rows,
        }
81
+
82
+
83
# System prompt sent with every SWE test case. It asks the model to reply
# with nothing but the full corrected file.
SWE_SYSTEM_PROMPT = (
    "You are a senior software engineer fixing bugs in a codebase. "
    "You will be given the current code and a bug report. "
    "Return ONLY the complete fixed file. No explanation, no markdown fencing, "
    "just the corrected Python code."
)
89
+
90
+
91
async def run_swe_test(
    model_spec: str,
    test_case: TestCase,
    use_docker: bool = False,
) -> SWETestResult:
    """Run one SWE test case against one model.

    1. Send the buggy code + issue to the model.
    2. Collect the streamed reply; if it contains a Markdown code fence,
       keep only the contents of the first fenced block.
    3. Run the fix against the test suite -- in a Docker container when
       ``use_docker`` is true and Docker is available, otherwise through
       ``run_in_sandbox``.
    4. Report pass/fail.

    Args:
        model_spec: Model specifier passed to ``resolve_model``.
        test_case: The case providing the issue text, buggy code and tests.
        use_docker: Prefer Docker execution when it is available.

    Returns:
        SWETestResult. Failures while streaming from the model or while
        executing the tests are reported through ``error`` with
        ``passed=False`` rather than raised.

    Raises:
        Whatever ``resolve_model`` / ``create_provider`` raise; both run
        outside the guarded sections.
    """
    import re

    start = time.perf_counter()

    # Build prompt
    prompt = (
        f"## Bug Report\n{test_case.issue}\n\n"
        f"## Current Code ({test_case.filename})\n"
        f"```python\n{test_case.buggy_code}\n```\n\n"
        f"Fix the bug. Return ONLY the complete corrected file."
    )

    # Get model's fix
    config = resolve_model(model_spec)
    provider, model_name = create_provider(config)

    parts = []
    try:
        async for chunk in provider.stream_generate(
            model=model_name, prompt=prompt, system=SWE_SYSTEM_PROMPT,
        ):
            # Skip chunks without text (e.g. metadata-only or final chunks):
            # a None entry would make the join below raise outside this try.
            if chunk.text:
                parts.append(chunk.text)
            if chunk.done:
                break
    except Exception as e:
        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category, passed=False,
            tests_passed=0, tests_total=test_case.expected_tests,
            error=f"Model error: {e}",
            duration_s=time.perf_counter() - start,
        )

    model_response = "".join(parts).strip()

    # Clean markdown fencing if present
    if "```" in model_response:
        match = re.search(r"```(?:python)?\s*(.*?)```", model_response, re.DOTALL)
        if match:
            model_response = match.group(1).strip()

    # Run tests -- sandbox (default) or Docker
    # NOTE(review): both runners are called synchronously from this coroutine,
    # so the event loop is presumably blocked while the tests execute.
    try:
        if use_docker and check_docker():
            result = run_in_container(
                image=test_case.docker_image,
                fix_code=model_response,
                test_code=test_case.test_code,
                test_command=test_case.test_command,
                timeout=test_case.timeout,
            )
        else:
            result = run_in_sandbox(
                fix_code=model_response,
                test_code=test_case.test_code,
                timeout=test_case.timeout,
            )

        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category,
            passed=result.all_passed,
            tests_passed=result.tests_passed,
            # Fall back to the expected count when nothing could be parsed.
            tests_total=result.tests_total or test_case.expected_tests,
            model_response=model_response[:1000],
            container_output=result.stdout[:500],
            duration_s=time.perf_counter() - start,
        )

    except Exception as e:
        return SWETestResult(
            model=model_spec, test_case=test_case.name,
            category=test_case.category, passed=False,
            tests_passed=0, tests_total=test_case.expected_tests,
            error=f"Execution error: {e}",
            model_response=model_response[:500],
            duration_s=time.perf_counter() - start,
        )
180
+
181
+
182
async def run_swe_suite(
    model_spec: str,
    test_pack: Optional[str] = None,
    use_docker: bool = False,
    on_progress=None,
) -> SWESuiteResult:
    """Run every case of the selected test pack(s) against one model.

    Args:
        model_spec: Model specifier handed to ``run_swe_test``.
        test_pack: Name of a built-in pack; None selects all built-in packs.
            A name matching no pack yields a result with zero cases.
        use_docker: Forwarded to ``run_swe_test``.
        on_progress: Optional callable invoked before each case as
            ``on_progress(model_spec, index_from_1, total_cases, case_name)``.

    Returns:
        SWESuiteResult with its aggregates already computed.
    """
    if test_pack is None:
        selected = BUILT_IN_PACKS
    else:
        selected = [pack for pack in BUILT_IN_PACKS if pack.name == test_pack]

    # Flatten the chosen packs into a single ordered list of cases.
    cases = [case for pack in selected for case in pack.cases]
    case_count = len(cases)

    suite = SWESuiteResult(model=model_spec)

    for position, case in enumerate(cases, start=1):
        if on_progress:
            on_progress(model_spec, position, case_count, case.name)

        outcome = await run_swe_test(model_spec, case, use_docker=use_docker)
        suite.results.append(outcome)

    suite.compute()
    return suite
208
+
209
+
210
async def run_swe_comparison(
    model_specs: list[str],
    test_pack: Optional[str] = None,
    use_docker: bool = False,
) -> list[SWESuiteResult]:
    """Run the SWE suite for each model in turn and return one result per model.

    The suites are awaited one after another (the original author chose
    sequential execution for fairness); results follow ``model_specs`` order.
    """
    # An awaiting comprehension still evaluates strictly one spec at a time.
    return [
        await run_swe_suite(spec, test_pack=test_pack, use_docker=use_docker)
        for spec in model_specs
    ]