arbiter_cli-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbiter/__init__.py +3 -0
- arbiter/cli/__init__.py +0 -0
- arbiter/cli/app.py +699 -0
- arbiter/cli/display.py +381 -0
- arbiter/core/__init__.py +0 -0
- arbiter/core/benchmarks.py +804 -0
- arbiter/core/config.py +137 -0
- arbiter/core/discover.py +184 -0
- arbiter/core/judge.py +193 -0
- arbiter/core/leaderboard.py +197 -0
- arbiter/core/metrics.py +367 -0
- arbiter/core/providers/__init__.py +19 -0
- arbiter/core/providers/anthropic_provider.py +133 -0
- arbiter/core/providers/base.py +62 -0
- arbiter/core/providers/factory.py +79 -0
- arbiter/core/providers/google_provider.py +126 -0
- arbiter/core/providers/ollama.py +103 -0
- arbiter/core/providers/openai_provider.py +120 -0
- arbiter/core/runner.py +257 -0
- arbiter/core/swe/__init__.py +1 -0
- arbiter/core/swe/container.py +158 -0
- arbiter/core/swe/runner.py +220 -0
- arbiter/core/swe/sandbox.py +111 -0
- arbiter/core/swe/test_packs.py +548 -0
- arbiter/dashboard/__init__.py +0 -0
- arbiter/dashboard/frontend/dist/assets/index-1tkxJouQ.css +1 -0
- arbiter/dashboard/frontend/dist/assets/index-dHa4zmvw.js +298 -0
- arbiter/dashboard/frontend/dist/index.html +16 -0
- arbiter/dashboard/server.py +426 -0
- arbiter_cli-0.1.0.dist-info/METADATA +299 -0
- arbiter_cli-0.1.0.dist-info/RECORD +35 -0
- arbiter_cli-0.1.0.dist-info/WHEEL +5 -0
- arbiter_cli-0.1.0.dist-info/entry_points.txt +2 -0
- arbiter_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- arbiter_cli-0.1.0.dist-info/top_level.txt +1 -0
arbiter/cli/app.py
ADDED
@@ -0,0 +1,699 @@
"""Arbiter CLI - Typer-based command line interface."""

from __future__ import annotations

import asyncio
from typing import Optional

import typer
from rich.live import Live

from arbiter.cli.display import (
    console,
    create_progress,
    get_model_color,
    print_comparing,
    print_discover,
    print_error,
    print_header,
    print_json_output,
    print_leaderboard,
    print_model_output,
    print_results,
    update_progress,
)

app = typer.Typer(
    name="arbiter",
    help="The final word on your local models. Compare LLMs side-by-side.",
    add_completion=False,
    no_args_is_help=True,
)


# ---------------------------------------------------------------------------
# arbiter run -- the main command
# ---------------------------------------------------------------------------

@app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
def run(
    ctx: typer.Context,
    seq: bool = typer.Option(
        False, "--seq", help="Run models one at a time (saves memory for 8GB machines)",
    ),
    dashboard: bool = typer.Option(
        False, "--dashboard", "-d", help="Open animated web dashboard",
    ),
    image: Optional[str] = typer.Option(None, "--image", "-i", help="Image path for multimodal"),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
    show: bool = typer.Option(False, "--show", "-s", help="Show full model outputs"),
    no_judge: bool = typer.Option(False, "--no-judge", help="Skip quality judging"),
    judge: str = typer.Option("auto", "--judge", "-j", help="Judge model"),
    system: Optional[str] = typer.Option(None, "--system", help="System prompt"),
) -> None:
    """Run a comparison. Put models first, prompt last in quotes.

    Examples:
        arbiter run gemma4:e2b qwen3.5:4b "explain recursion"
        arbiter run gemma4:e2b qwen3.5:4b "explain recursion" --seq
        arbiter run gemma4:e2b qwen3.5:4b "write a sort" --seq --dashboard
    """
    args = ctx.args

    # Separate models from prompt
    # If no args at all, interactive mode: pick models + prompt
    # If one arg, it's the prompt -- auto-detect models
    # If 2+, last is prompt, rest are models
    if len(args) == 0:
        # Fully interactive: pick models and prompt
        models, prompt = asyncio.run(_interactive_setup())
    elif len(args) == 1:
        # Just a prompt -- auto-pick models
        prompt = args[0]
        models = asyncio.run(_auto_select_models())
    else:
        prompt = args[-1]
        models = args[:-1]

    if not models:
        print_error("No models found. Is Ollama running? Do you have models installed?")
        console.print("[dim]Run: ollama pull gemma4:e2b[/dim]")
        raise typer.Exit(1)

    if dashboard:
        asyncio.run(_run_with_dashboard(models, prompt, image, judge, no_judge, seq, system))
    else:
        asyncio.run(_run_cli(models, prompt, image, judge, output, show, no_judge, seq, system))


async def _run_cli(
    model_specs: list[str],
    prompt: str,
    image: Optional[str],
    judge_model: str,
    output_format: Optional[str],
    show_output: bool,
    no_judge: bool,
    sequential: bool,
    system: Optional[str],
) -> None:
    """Run comparison in CLI mode."""
    from arbiter.core.judge import judge_comparison
    from arbiter.core.leaderboard import Leaderboard
    from arbiter.core.runner import run_comparison, run_single_model
    from arbiter.core.metrics import ComparisonResult, compute_composite_scores, ScoreWeights
    from datetime import datetime, timezone

    print_header()
    print_comparing(model_specs, prompt)

    if sequential:
        console.print("[dim]Sequential mode (low memory)[/dim]")
        console.print()

        all_metrics = []
        for i, spec in enumerate(model_specs):
            color = get_model_color(i)
            console.print(f"[{color}]Running {spec}...[/{color}]")

            progress, task_ids = create_progress([spec])

            def on_token(model: str, text: str, metrics, _p=progress, _t=task_ids):
                update_progress(_p, _t, model, metrics)

            with Live(progress, console=console, refresh_per_second=10):
                metrics = await run_single_model(
                    model_spec=spec, prompt=prompt, system=system,
                    image_path=image, on_token=on_token,
                )
            all_metrics.append(metrics)

            tps = f"{metrics.tokens_per_sec:.1f} tok/s" if metrics.tokens_per_sec else "--"
            console.print(f" [{color}]Done[/{color}] - {metrics.total_tokens} tokens, {tps}")
            console.print()

        result = ComparisonResult(
            prompt=prompt, models=all_metrics,
            timestamp=datetime.now(timezone.utc).isoformat(),
        )
    else:
        progress, task_ids = create_progress(model_specs)

        def on_token(model: str, text: str, metrics):
            update_progress(progress, task_ids, model, metrics)

        with Live(progress, console=console, refresh_per_second=10):
            result = await run_comparison(
                model_specs=model_specs, prompt=prompt, system=system,
                image_path=image, on_token=on_token, sequential=False,
            )

    # Judge quality (optional)
    if not no_judge and len(model_specs) > 1:
        with console.status("[bold cyan]Judging quality..."):
            result = await judge_comparison(result, judge_model=judge_model)

    # Compute composite scores and determine winner
    has_quality = not no_judge and len(model_specs) > 1
    result.scoring = compute_composite_scores(result, has_quality=has_quality)
    result.winner = result.scoring.winner if result.scoring else None

    # Update leaderboard
    if len(model_specs) > 1:
        lb = Leaderboard()
        lb.update_from_comparison(result)

    # Output
    if output_format == "json":
        print_json_output(result)
    else:
        print_results(result)
        if show_output:
            for i, m in enumerate(result.models):
                print_model_output(m.model, m.output, index=i)


async def _auto_select_models(max_models: int = 2) -> list[str]:
    """Auto-detect installed models and pick the best ones to compare."""
    from arbiter.core.discover import discover_ollama

    models = await discover_ollama()
    if not models:
        return []

    # Sort by size (prefer smaller models that fit in memory)
    models.sort(key=lambda m: m.size or 0)

    # Prefer models that fit in memory, but include all
    safe = [m for m in models if m.fits_in_memory]
    pool = safe if len(safe) >= 2 else models

    # Pick up to max_models, preferring different families
    selected = []
    seen_families = set()
    for m in pool:
        family = m.family or m.name.split(":")[0]
        if family not in seen_families or len(selected) < max_models:
            if m.memory_warning:
                console.print(f"[yellow]Note: {m.name} ({m.size_gb}GB) - {m.memory_warning}[/yellow]")
            selected.append(m.spec)
            seen_families.add(family)
        if len(selected) >= max_models:
            break

    return selected


async def _interactive_setup() -> tuple[list[str], str]:
    """Interactive model selection and prompt input."""
    from arbiter.core.discover import discover_ollama
    from rich.prompt import Prompt

    print_header()

    with console.status("[bold cyan]Finding installed models..."):
        available = await discover_ollama()

    if not available:
        print_error("No models found. Is Ollama running?")
        console.print("[dim]Install: https://ollama.com/download[/dim]")
        console.print("[dim]Then: ollama pull gemma4:e2b[/dim]")
        raise typer.Exit(1)

    # Show available models with numbers
    console.print("[bold]Available models:[/bold]")
    console.print()
    for i, m in enumerate(available, 1):
        size = f"{m.size_gb}GB" if m.size_gb else "?"
        params = m.parameter_size or ""
        console.print(f" [cyan]{i}[/cyan]. {m.name} [dim]{size} {params}[/dim]")
    console.print()

    # Let user pick
    selection = Prompt.ask(
        "Pick models to compare (comma-separated numbers, or 'all')",
        default="1,2" if len(available) >= 2 else "1",
    )

    if selection.strip().lower() == "all":
        selected = [m.spec for m in available]
    else:
        indices = []
        for part in selection.split(","):
            part = part.strip()
            if part.isdigit():
                idx = int(part) - 1
                if 0 <= idx < len(available):
                    indices.append(idx)
        selected = [available[i].spec for i in indices]

    if not selected:
        print_error("No models selected.")
        raise typer.Exit(1)

    models_str = ", ".join(f"[cyan]{s}[/cyan]" for s in selected)
    console.print(f"Selected: {models_str}")
    console.print()

    prompt = Prompt.ask("Enter your prompt")
    return selected, prompt


async def _run_with_dashboard(
    model_specs: list[str],
    prompt: str,
    image: Optional[str],
    judge_model: str,
    no_judge: bool,
    sequential: bool,
    system: Optional[str],
) -> None:
    """Run comparison with the web dashboard."""
    from arbiter.dashboard.server import start_server

    mode = "sequentially" if sequential else "in parallel"
    console.print(f"[bold cyan]Starting Arbiter Dashboard...[/bold cyan] ({mode})")
    console.print("[dim]Opening browser at http://127.0.0.1:7878 ...[/dim]")

    await start_server(
        model_specs=model_specs, prompt=prompt, image_path=image,
        judge_model=judge_model, no_judge=no_judge,
        sequential=sequential, system=system,
    )


# ---------------------------------------------------------------------------
# arbiter dashboard -- just open the dashboard
# ---------------------------------------------------------------------------

@app.command()
def dashboard(
    port: int = typer.Option(7878, "--port", "-p", help="Port to run on"),
) -> None:
    """Open the Arbiter dashboard in your browser.

    Shows your leaderboard, past results, and waits for new comparisons.
    """
    import webbrowser
    from arbiter.dashboard.server import app as fastapi_app
    import uvicorn

    print_header()
    console.print(f"[bold cyan]Dashboard[/bold cyan] running at [underline]http://127.0.0.1:{port}[/underline]")
    console.print("[dim]Press Ctrl+C to stop[/dim]")
    console.print()

    # Open browser after a short delay
    import threading
    threading.Timer(1.0, lambda: webbrowser.open(f"http://127.0.0.1:{port}")).start()

    uvicorn.run(fastapi_app, host="127.0.0.1", port=port, log_level="warning")


# ---------------------------------------------------------------------------
# arbiter benchmark -- the real test suite
# ---------------------------------------------------------------------------

@app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
def benchmark(
    ctx: typer.Context,
    quick: bool = typer.Option(False, "--quick", "-q", help="Run quick subset (5 tests)"),
    seq: bool = typer.Option(False, "--seq", help="Run models one at a time (saves memory)"),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
) -> None:
    """Run the real benchmark suite. Auto-detects your models.

    20 automated tests across 10 categories. Every test has a
    programmatic pass/fail -- no LLM-as-judge.

    Categories: math, coding, reasoning, instruction fidelity,
    constraint adherence, consistency, sycophancy resistance,
    temporal reasoning, context recall, output stability.

    Examples:
        arbiter benchmark              (auto-detect models)
        arbiter benchmark --quick      (5 tests instead of 20)
        arbiter benchmark --seq        (one model at a time)
        arbiter benchmark gemma4:e2b   (specific model)
    """
    from arbiter.core.benchmarks import run_benchmark_comparison, run_benchmark_suite

    print_header()

    # Models are optional -- auto-detect if not provided
    models = ctx.args if ctx.args else None
    if not models:
        console.print("[dim]Auto-detecting installed models...[/dim]")
        detected = asyncio.run(_auto_select_models(max_models=5))
        if not detected:
            print_error("No models found. Is Ollama running?")
            console.print("[dim]Run: ollama pull gemma4:e2b[/dim]")
            raise typer.Exit(1)
        models = detected

    from arbiter.core.benchmarks import ALL_TESTS, QUICK_TESTS
    n_tests = str(len(QUICK_TESTS)) if quick else str(len(ALL_TESTS))
    model_str = " vs ".join(
        f"[{get_model_color(i)}]{m}[/{get_model_color(i)}]"
        for i, m in enumerate(models)
    )
    console.print(f"[bold cyan]Benchmarking[/bold cyan] {model_str}")
    console.print(f"[dim]{n_tests} tests, automated verification, no LLM judge[/dim]")
    console.print()

    if len(models) == 1:
        with console.status("[bold cyan]Running benchmark suite..."):
            result = asyncio.run(run_benchmark_suite(models[0], quick=quick))
        results = [result]
    else:
        if seq:
            # Run one model at a time
            results = []
            for i, model in enumerate(models):
                color = get_model_color(i)
                console.print(f"[{color}]Benchmarking {model}...[/{color}]")
                with console.status(f"[bold cyan]Running tests on {model}..."):
                    result = asyncio.run(run_benchmark_suite(model, quick=quick))
                results.append(result)
                console.print(f" [{color}]Done[/{color}] - {result.overall_score * 100:.0f}%")
                console.print()
        else:
            with console.status("[bold cyan]Running benchmark suite..."):
                results = asyncio.run(run_benchmark_comparison(models, quick=quick))

    if output == "json":
        import json
        console.print_json(json.dumps([r.to_dict() for r in results], indent=2))
    else:
        _print_benchmark_results(results)


def _print_benchmark_results(results: list) -> None:
    """Print benchmark results in a way any user can understand."""
    from rich.table import Table
    from rich.panel import Panel
    from rich.text import Text

    # Category descriptions humans understand
    CAT_EXPLAIN = {
        "instruction_following": "Can it follow your instructions exactly?",
        "code_generation": "Can it write working code?",
        "factual_accuracy": "Does it give real facts or make things up?",
        "reasoning": "Can it think step-by-step and solve problems?",
        "consistency": "Does it give the same answer every time?",
        "pressure_resistance": "Does it hold its ground when you push back?",
        "speed": "How fast does it generate on your hardware?",
        "context_recall": "Can it find specific info in longer text?",
        "timeout": "Tests that took too long to complete",
        "error": "Tests that encountered errors",
    }

    console.print()

    # ── Overall scores (big and clear) ──
    console.print("[bold]Overall Scores[/bold]")
    console.print("[dim]Higher is better. 100 = perfect on every test.[/dim]")
    console.print()

    for i, r in enumerate(results):
        color = get_model_color(i)
        score = r.overall_score * 100
        bar_width = 30
        filled = int((score / 100) * bar_width)
        bar = "[green]" + "█" * filled + "[/green]" + "[dim]░[/dim]" * (bar_width - filled)
        score_color = "green" if score >= 70 else "yellow" if score >= 50 else "red"
        console.print(
            f" [{color}]{r.model:<20}[/{color}] {bar} [{score_color}]{score:.0f}/100[/{score_color}]"
            f" [dim]({r.total_passed}/{r.total_tests} tests passed)[/dim]"
        )
    console.print()

    # ── Category breakdown with explanations ──
    # Group tests by category
    all_cats = []
    seen_cats = set()
    for r in results:
        for br in r.results:
            if br.category not in seen_cats:
                all_cats.append(br.category)
                seen_cats.add(br.category)

    for cat in all_cats:
        explain = CAT_EXPLAIN.get(cat, cat.replace("_", " ").title())
        cat_label = cat.replace("_", " ").title()

        # Get scores for this category per model
        cat_scores = []
        for i, r in enumerate(results):
            score = r.category_scores.get(cat, 0) * 100
            cat_scores.append((r.model, score, get_model_color(i)))

        # Find best in category
        best_score = max(s for _, s, _ in cat_scores) if cat_scores else 0

        console.print(f" [bold]{cat_label}[/bold] [dim]{explain}[/dim]")

        # Per-model scores for this category
        for model_name, score, color in cat_scores:
            is_best = score == best_score and best_score > 0 and len(results) > 1
            badge = " [green]BEST[/green]" if is_best else ""
            score_color = "green" if score >= 70 else "yellow" if score >= 50 else "red"
            console.print(f" [{color}]{model_name:<18}[/{color}] [{score_color}]{score:>3.0f}%[/{score_color}]{badge}")

        # Show individual test results for this category
        for r in results[:1]:  # use first model's test list as reference
            cat_tests = [br for br in r.results if br.category == cat]
            for test in cat_tests:
                # Collect all model results for this test
                test_results = []
                for ri, rr in enumerate(results):
                    br = next((b for b in rr.results if b.name == test.name), None)
                    if br:
                        if br.passed:
                            test_results.append("[green]PASS[/green]")
                        else:
                            test_results.append("[red]FAIL[/red]")
                    else:
                        test_results.append("[dim]--[/dim]")

                result_str = " ".join(test_results)
                # Show what the test actually checked
                desc = test.description if test.description else test.name
                console.print(f" [dim]{desc}:[/dim] {result_str}")

        console.print()

    # ── Winner explanation ──
    if len(results) > 1:
        best = max(results, key=lambda r: r.overall_score)
        best_color = get_model_color(results.index(best))

        # Build explanation
        strengths = []
        weaknesses = []
        for cat in all_cats:
            if cat in ("timeout", "error"):
                continue
            scores = [(r.model, r.category_scores.get(cat, 0)) for r in results]
            scores.sort(key=lambda x: x[1], reverse=True)
            if scores[0][0] == best.model and scores[0][1] > 0:
                strengths.append(cat.replace("_", " "))
            elif len(scores) > 1 and scores[0][0] != best.model and scores[0][1] > scores[-1][1]:
                weaknesses.append((cat.replace("_", " "), scores[0][0]))

        explanation_parts = [
            f"[bold {best_color}]{best.model}[/bold {best_color}] scored [bold]{best.overall_score * 100:.0f}/100[/bold]",
            f"passing {best.total_passed} of {best.total_tests} tests.",
        ]
        if strengths:
            explanation_parts.append(f"\nStrong at: {', '.join(strengths)}.")
        if weaknesses:
            weak_strs = [f"{cat} (beaten by {model})" for cat, model in weaknesses[:3]]
            explanation_parts.append(f"Weaker at: {', '.join(weak_strs)}.")

        console.print(Panel(
            " ".join(explanation_parts),
            title="[bold]Winner[/bold]",
            border_style="green",
            padding=(1, 2),
        ))
    elif len(results) == 1:
        r = results[0]
        console.print(Panel(
            f"[bold]{r.model}[/bold] scored [bold]{r.overall_score * 100:.0f}/100[/bold], "
            f"passing {r.total_passed} of {r.total_tests} tests.",
            title="[bold]Result[/bold]",
            border_style="cyan",
            padding=(1, 2),
        ))
    console.print()


# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# arbiter swe -- SWE-bench style testing in Docker
# ---------------------------------------------------------------------------

@app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
def swe(
    ctx: typer.Context,
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
) -> None:
    """Run SWE-bench style tests in Docker containers.

    Real buggy code + real test suites. Models write patches,
    Docker runs pytest to verify. The gold standard for code testing.

    Requires Docker Desktop to be running.

    Examples:
        arbiter swe                       (auto-detect models)
        arbiter swe gemma4:e2b qwen3.5:4b (specific models)
    """
    from arbiter.core.swe.container import check_docker
    from arbiter.core.swe.runner import run_swe_comparison
    from arbiter.core.swe.test_packs import TOTAL_SWE_TESTS
    from rich.table import Table
    from rich.panel import Panel

    print_header()

    # Check Docker
    if not check_docker():
        print_error("Docker is not running.")
        console.print("[dim]Install Docker Desktop: https://docker.com/products/docker-desktop[/dim]")
        console.print("[dim]Then start it and try again.[/dim]")
        raise typer.Exit(1)

    # Models
    models = ctx.args if ctx.args else None
    if not models:
        console.print("[dim]Auto-detecting models...[/dim]")
        detected = asyncio.run(_auto_select_models(max_models=2))
        if not detected:
            print_error("No models found.")
            raise typer.Exit(1)
        models = detected

    model_str = " vs ".join(
        f"[{get_model_color(i)}]{m}[/{get_model_color(i)}]"
        for i, m in enumerate(models)
    )
    console.print(f"[bold magenta]SWE Testing[/bold magenta] {model_str}")
    console.print(f"[dim]{TOTAL_SWE_TESTS} real code tests in Docker containers[/dim]")
    console.print()

    with console.status("[bold magenta]Running SWE tests (this takes a while)..."):
        results = asyncio.run(run_swe_comparison(models))

    if output == "json":
        import json
        console.print_json(json.dumps([r.to_dict() for r in results], indent=2))
        return

    # Display results
    table = Table(title="SWE Test Results", border_style="dim")
    table.add_column("Model", style="bold")
    table.add_column("Passed", justify="right")
    table.add_column("Total", justify="right")
    table.add_column("Pass Rate", justify="right", style="bold")

    for i, r in enumerate(results):
        color = get_model_color(i)
        rate = f"{r.pass_rate * 100:.0f}%"
        rate_color = "green" if r.pass_rate >= 0.7 else "yellow" if r.pass_rate >= 0.5 else "red"
        table.add_row(
            f"[{color}]{r.model}[/{color}]",
            str(r.total_passed), str(r.total_tests),
            f"[{rate_color}]{rate}[/{rate_color}]",
        )
    console.print(table)
    console.print()

    # Per-test breakdown
    detail = Table(title="Test Details", border_style="dim")
    detail.add_column("Test", style="bold")
    detail.add_column("Category", style="dim")
    for i, r in enumerate(results):
        detail.add_column(f"[{get_model_color(i)}]{r.model}[/{get_model_color(i)}]", justify="center")

    if results:
        for ti, test in enumerate(results[0].results):
            row = [test.test_case, test.category]
            for r in results:
                t = r.results[ti] if ti < len(r.results) else None
                if t and t.passed:
                    row.append(f"[green]{t.tests_passed}/{t.tests_total} PASS[/green]")
                elif t:
                    row.append(f"[red]{t.tests_passed}/{t.tests_total} FAIL[/red]")
                else:
                    row.append("[dim]--[/dim]")
            detail.add_row(*row)

    console.print(detail)
    console.print()

    if len(results) > 1:
        best = max(results, key=lambda r: r.pass_rate)
        console.print(Panel(
            f"[bold green]{best.model}[/bold green] passed {best.total_passed}/{best.total_tests} tests ({best.pass_rate*100:.0f}%)",
            title="[bold]SWE Winner[/bold]",
            border_style="green",
        ))
        console.print()


# ---------------------------------------------------------------------------
# arbiter discover / leaderboard / config
# ---------------------------------------------------------------------------

@app.command()
def discover() -> None:
    """List all available models across providers."""
    from arbiter.core.discover import discover_all

    print_header()
    with console.status("[bold cyan]Discovering models..."):
        models = asyncio.run(discover_all())
    print_discover(models)


@app.command()
def leaderboard() -> None:
    """View the persistent ELO leaderboard."""
    from arbiter.core.leaderboard import Leaderboard

    print_header()
    lb = Leaderboard()
    print_leaderboard(lb)


@app.command()
def config(
    ollama_host: Optional[str] = typer.Option(None, "--ollama-host", help="Set Ollama API host"),
    show_config: bool = typer.Option(False, "--show", help="Show current config"),
) -> None:
    """View or modify Arbiter configuration."""
    from arbiter.core.config import load_config, save_config, get_ollama_host

    if show_config:
        cfg = load_config()
        console.print(f"Ollama host: {get_ollama_host()}")
        for k, v in cfg.items():
            console.print(f"{k}: {v}")
        return

    if ollama_host:
        cfg = load_config()
        cfg["ollama_host"] = ollama_host
        save_config(cfg)
        console.print(f"[green]Ollama host set to: {ollama_host}[/green]")


def entry() -> None:
    """Entry point for the CLI."""
    app()


if __name__ == "__main__":
    entry()