arbiter-cli 0.1.0 (arbiter_cli-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arbiter/cli/app.py ADDED
@@ -0,0 +1,699 @@
+ """Arbiter CLI - Typer-based command line interface."""
+
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Optional
+
+ import typer
+ from rich.live import Live
+
+ from arbiter.cli.display import (
+     console,
+     create_progress,
+     get_model_color,
+     print_comparing,
+     print_discover,
+     print_error,
+     print_header,
+     print_json_output,
+     print_leaderboard,
+     print_model_output,
+     print_results,
+     update_progress,
+ )
+
+ app = typer.Typer(
+     name="arbiter",
+     help="The final word on your local models. Compare LLMs side-by-side.",
+     add_completion=False,
+     no_args_is_help=True,
+ )
+
+
+ # ---------------------------------------------------------------------------
+ # arbiter run -- the main command
+ # ---------------------------------------------------------------------------
+
+ @app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def run(
+     ctx: typer.Context,
+     seq: bool = typer.Option(
+         False, "--seq", help="Run models one at a time (saves memory for 8GB machines)",
+     ),
+     dashboard: bool = typer.Option(
+         False, "--dashboard", "-d", help="Open animated web dashboard",
+     ),
+     image: Optional[str] = typer.Option(None, "--image", "-i", help="Image path for multimodal"),
+     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
+     show: bool = typer.Option(False, "--show", "-s", help="Show full model outputs"),
+     no_judge: bool = typer.Option(False, "--no-judge", help="Skip quality judging"),
+     judge: str = typer.Option("auto", "--judge", "-j", help="Judge model"),
+     system: Optional[str] = typer.Option(None, "--system", help="System prompt"),
+ ) -> None:
+     """Run a comparison. Put models first, prompt last in quotes.
+
+     Examples:
+         arbiter run gemma4:e2b qwen3.5:4b "explain recursion"
+         arbiter run gemma4:e2b qwen3.5:4b "explain recursion" --seq
+         arbiter run gemma4:e2b qwen3.5:4b "write a sort" --seq --dashboard
+     """
+     args = ctx.args
+
+     # Separate models from prompt
+     # If no args at all, interactive mode: pick models + prompt
+     # If one arg, it's the prompt -- auto-detect models
+     # If 2+, last is prompt, rest are models
+     if len(args) == 0:
+         # Fully interactive: pick models and prompt
+         models, prompt = asyncio.run(_interactive_setup())
+     elif len(args) == 1:
+         # Just a prompt -- auto-pick models
+         prompt = args[0]
+         models = asyncio.run(_auto_select_models())
+     else:
+         prompt = args[-1]
+         models = args[:-1]
+
+     if not models:
+         print_error("No models found. Is Ollama running? Do you have models installed?")
+         console.print("[dim]Run: ollama pull gemma4:e2b[/dim]")
+         raise typer.Exit(1)
+
+     if dashboard:
+         asyncio.run(_run_with_dashboard(models, prompt, image, judge, no_judge, seq, system))
+     else:
+         asyncio.run(_run_cli(models, prompt, image, judge, output, show, no_judge, seq, system))
+
+
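The split above is purely positional: with two or more extra arguments, the last one becomes the prompt and everything before it is treated as a model spec. A minimal illustration of the same rule with placeholder values (plain Python, not part of the package):

    args = ["gemma4:e2b", "qwen3.5:4b", "explain recursion"]
    prompt = args[-1]      # "explain recursion"
    models = args[:-1]     # ["gemma4:e2b", "qwen3.5:4b"]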
+ async def _run_cli(
+     model_specs: list[str],
+     prompt: str,
+     image: Optional[str],
+     judge_model: str,
+     output_format: Optional[str],
+     show_output: bool,
+     no_judge: bool,
+     sequential: bool,
+     system: Optional[str],
+ ) -> None:
+     """Run comparison in CLI mode."""
+     from arbiter.core.judge import judge_comparison
+     from arbiter.core.leaderboard import Leaderboard
+     from arbiter.core.runner import run_comparison, run_single_model
+     from arbiter.core.metrics import ComparisonResult, compute_composite_scores, ScoreWeights
+     from datetime import datetime, timezone
+
+     print_header()
+     print_comparing(model_specs, prompt)
+
+     if sequential:
+         console.print("[dim]Sequential mode (low memory)[/dim]")
+         console.print()
+
+         all_metrics = []
+         for i, spec in enumerate(model_specs):
+             color = get_model_color(i)
+             console.print(f"[{color}]Running {spec}...[/{color}]")
+
+             progress, task_ids = create_progress([spec])
+
+             def on_token(model: str, text: str, metrics, _p=progress, _t=task_ids):
+                 update_progress(_p, _t, model, metrics)
+
+             with Live(progress, console=console, refresh_per_second=10):
+                 metrics = await run_single_model(
+                     model_spec=spec, prompt=prompt, system=system,
+                     image_path=image, on_token=on_token,
+                 )
+             all_metrics.append(metrics)
+
+             tps = f"{metrics.tokens_per_sec:.1f} tok/s" if metrics.tokens_per_sec else "--"
+             console.print(f" [{color}]Done[/{color}] - {metrics.total_tokens} tokens, {tps}")
+             console.print()
+
+         result = ComparisonResult(
+             prompt=prompt, models=all_metrics,
+             timestamp=datetime.now(timezone.utc).isoformat(),
+         )
+     else:
+         progress, task_ids = create_progress(model_specs)
+
+         def on_token(model: str, text: str, metrics):
+             update_progress(progress, task_ids, model, metrics)
+
+         with Live(progress, console=console, refresh_per_second=10):
+             result = await run_comparison(
+                 model_specs=model_specs, prompt=prompt, system=system,
+                 image_path=image, on_token=on_token, sequential=False,
+             )
+
+     # Judge quality (optional)
+     if not no_judge and len(model_specs) > 1:
+         with console.status("[bold cyan]Judging quality..."):
+             result = await judge_comparison(result, judge_model=judge_model)
+
+     # Compute composite scores and determine winner
+     has_quality = not no_judge and len(model_specs) > 1
+     result.scoring = compute_composite_scores(result, has_quality=has_quality)
+     result.winner = result.scoring.winner if result.scoring else None
+
+     # Update leaderboard
+     if len(model_specs) > 1:
+         lb = Leaderboard()
+         lb.update_from_comparison(result)
+
+     # Output
+     if output_format == "json":
+         print_json_output(result)
+     else:
+         print_results(result)
+         if show_output:
+             for i, m in enumerate(result.models):
+                 print_model_output(m.model, m.output, index=i)
+
+
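Both branches drive the runner through an on_token callback that receives the model name, a text payload, and a live metrics object. A minimal sketch of calling run_single_model directly and collecting the stream, assuming the signature used above (the model spec is a placeholder, and whether text is the newest chunk or the cumulative output depends on the runner):

    import asyncio
    from arbiter.core.runner import run_single_model

    chunks: list[str] = []

    def collect(model: str, text: str, metrics) -> None:
        # Record each callback payload; metrics carries the running counters.
        chunks.append(text)

    metrics = asyncio.run(run_single_model(
        model_spec="gemma4:e2b", prompt="explain recursion",
        system=None, image_path=None, on_token=collect,
    ))
    print(metrics.total_tokens, metrics.tokens_per_sec)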
+ async def _auto_select_models(max_models: int = 2) -> list[str]:
+     """Auto-detect installed models and pick the best ones to compare."""
+     from arbiter.core.discover import discover_ollama
+
+     models = await discover_ollama()
+     if not models:
+         return []
+
+     # Sort by size (prefer smaller models that fit in memory)
+     models.sort(key=lambda m: m.size or 0)
+
+     # Prefer models that fit in memory; fall back to all if fewer than two fit
+     safe = [m for m in models if m.fits_in_memory]
+     pool = safe if len(safe) >= 2 else models
+
+     # Pick up to max_models, preferring different families
+     selected = []
+     seen_families = set()
+     for m in pool:
+         family = m.family or m.name.split(":")[0]
+         if family not in seen_families or len(selected) < max_models:
+             if m.memory_warning:
+                 console.print(f"[yellow]Note: {m.name} ({m.size_gb}GB) - {m.memory_warning}[/yellow]")
+             selected.append(m.spec)
+             seen_families.add(family)
+         if len(selected) >= max_models:
+             break
+
+     return selected
+
+
+ async def _interactive_setup() -> tuple[list[str], str]:
+     """Interactive model selection and prompt input."""
+     from arbiter.core.discover import discover_ollama
+     from rich.prompt import Prompt
+
+     print_header()
+
+     with console.status("[bold cyan]Finding installed models..."):
+         available = await discover_ollama()
+
+     if not available:
+         print_error("No models found. Is Ollama running?")
+         console.print("[dim]Install: https://ollama.com/download[/dim]")
+         console.print("[dim]Then: ollama pull gemma4:e2b[/dim]")
+         raise typer.Exit(1)
+
+     # Show available models with numbers
+     console.print("[bold]Available models:[/bold]")
+     console.print()
+     for i, m in enumerate(available, 1):
+         size = f"{m.size_gb}GB" if m.size_gb else "?"
+         params = m.parameter_size or ""
+         console.print(f" [cyan]{i}[/cyan]. {m.name} [dim]{size} {params}[/dim]")
+     console.print()
+
+     # Let user pick
+     selection = Prompt.ask(
+         "Pick models to compare (comma-separated numbers, or 'all')",
+         default="1,2" if len(available) >= 2 else "1",
+     )
+
+     if selection.strip().lower() == "all":
+         selected = [m.spec for m in available]
+     else:
+         indices = []
+         for part in selection.split(","):
+             part = part.strip()
+             if part.isdigit():
+                 idx = int(part) - 1
+                 if 0 <= idx < len(available):
+                     indices.append(idx)
+         selected = [available[i].spec for i in indices]
+
+     if not selected:
+         print_error("No models selected.")
+         raise typer.Exit(1)
+
+     models_str = ", ".join(f"[cyan]{s}[/cyan]" for s in selected)
+     console.print(f"Selected: {models_str}")
+     console.print()
+
+     prompt = Prompt.ask("Enter your prompt")
+     return selected, prompt
+
+
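The selection string accepted above is either the literal 'all' or a comma-separated list of 1-based indexes; non-numeric and out-of-range entries are silently dropped. The same parsing rule in isolation, with made-up model names:

    available = ["gemma4:e2b", "qwen3.5:4b", "llava:7b"]
    selection = "1, 3, 9, x"

    indices = []
    for part in selection.split(","):
        part = part.strip()
        if part.isdigit():
            idx = int(part) - 1
            if 0 <= idx < len(available):
                indices.append(idx)

    print([available[i] for i in indices])  # ['gemma4:e2b', 'llava:7b']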
+ async def _run_with_dashboard(
+     model_specs: list[str],
+     prompt: str,
+     image: Optional[str],
+     judge_model: str,
+     no_judge: bool,
+     sequential: bool,
+     system: Optional[str],
+ ) -> None:
+     """Run comparison with the web dashboard."""
+     from arbiter.dashboard.server import start_server
+
+     mode = "sequentially" if sequential else "in parallel"
+     console.print(f"[bold cyan]Starting Arbiter Dashboard...[/bold cyan] ({mode})")
+     console.print("[dim]Opening browser at http://127.0.0.1:7878 ...[/dim]")
+
+     await start_server(
+         model_specs=model_specs, prompt=prompt, image_path=image,
+         judge_model=judge_model, no_judge=no_judge,
+         sequential=sequential, system=system,
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # arbiter dashboard -- just open the dashboard
+ # ---------------------------------------------------------------------------
+
+ @app.command()
+ def dashboard(
+     port: int = typer.Option(7878, "--port", "-p", help="Port to run on"),
+ ) -> None:
+     """Open the Arbiter dashboard in your browser.
+
+     Shows your leaderboard, past results, and waits for new comparisons.
+     """
+     import webbrowser
+     from arbiter.dashboard.server import app as fastapi_app
+     import uvicorn
+
+     print_header()
+     console.print(f"[bold cyan]Dashboard[/bold cyan] running at [underline]http://127.0.0.1:{port}[/underline]")
+     console.print("[dim]Press Ctrl+C to stop[/dim]")
+     console.print()
+
+     # Open browser after a short delay
+     import threading
+     threading.Timer(1.0, lambda: webbrowser.open(f"http://127.0.0.1:{port}")).start()
+
+     uvicorn.run(fastapi_app, host="127.0.0.1", port=port, log_level="warning")
+
+
+ # ---------------------------------------------------------------------------
+ # arbiter benchmark -- the real test suite
+ # ---------------------------------------------------------------------------
+
+ @app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def benchmark(
+     ctx: typer.Context,
+     quick: bool = typer.Option(False, "--quick", "-q", help="Run quick subset (5 tests)"),
+     seq: bool = typer.Option(False, "--seq", help="Run models one at a time (saves memory)"),
+     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
+ ) -> None:
+     """Run the real benchmark suite. Auto-detects your models.
+
+     20 automated tests across 10 categories. Every test has a
+     programmatic pass/fail -- no LLM-as-judge.
+
+     Categories: math, coding, reasoning, instruction fidelity,
+     constraint adherence, consistency, sycophancy resistance,
+     temporal reasoning, context recall, output stability.
+
+     Examples:
+         arbiter benchmark (auto-detect models)
+         arbiter benchmark --quick (5 tests instead of 20)
+         arbiter benchmark --seq (one model at a time)
+         arbiter benchmark gemma4:e2b (specific model)
+     """
+     from arbiter.core.benchmarks import run_benchmark_comparison, run_benchmark_suite
+
+     print_header()
+
+     # Models are optional -- auto-detect if not provided
+     models = ctx.args if ctx.args else None
+     if not models:
+         console.print("[dim]Auto-detecting installed models...[/dim]")
+         detected = asyncio.run(_auto_select_models(max_models=5))
+         if not detected:
+             print_error("No models found. Is Ollama running?")
+             console.print("[dim]Run: ollama pull gemma4:e2b[/dim]")
+             raise typer.Exit(1)
+         models = detected
+
+     from arbiter.core.benchmarks import ALL_TESTS, QUICK_TESTS
+     n_tests = str(len(QUICK_TESTS)) if quick else str(len(ALL_TESTS))
+     model_str = " vs ".join(
+         f"[{get_model_color(i)}]{m}[/{get_model_color(i)}]"
+         for i, m in enumerate(models)
+     )
+     console.print(f"[bold cyan]Benchmarking[/bold cyan] {model_str}")
+     console.print(f"[dim]{n_tests} tests, automated verification, no LLM judge[/dim]")
+     console.print()
+
+     if len(models) == 1:
+         with console.status("[bold cyan]Running benchmark suite..."):
+             result = asyncio.run(run_benchmark_suite(models[0], quick=quick))
+         results = [result]
+     else:
+         if seq:
+             # Run one model at a time
+             results = []
+             for i, model in enumerate(models):
+                 color = get_model_color(i)
+                 console.print(f"[{color}]Benchmarking {model}...[/{color}]")
+                 with console.status(f"[bold cyan]Running tests on {model}..."):
+                     result = asyncio.run(run_benchmark_suite(model, quick=quick))
+                 results.append(result)
+                 console.print(f" [{color}]Done[/{color}] - {result.overall_score * 100:.0f}%")
+                 console.print()
+         else:
+             with console.status("[bold cyan]Running benchmark suite..."):
+                 results = asyncio.run(run_benchmark_comparison(models, quick=quick))
+
+     if output == "json":
+         import json
+         console.print_json(json.dumps([r.to_dict() for r in results], indent=2))
+     else:
+         _print_benchmark_results(results)
+
+
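With --output json the command emits a JSON array with one object per model, built from to_dict(). A sketch of consuming it after redirecting the output to a file; the key names are an assumption based on the attributes the CLI prints (model, overall_score, total_passed, total_tests), so check them against your actual output:

    import json

    with open("results.json") as f:      # e.g. arbiter benchmark -o json > results.json
        results = json.load(f)

    for r in results:
        pct = round(r["overall_score"] * 100)
        print(f'{r["model"]}: {pct}/100 ({r["total_passed"]}/{r["total_tests"]} passed)')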
+ def _print_benchmark_results(results: list) -> None:
+     """Print benchmark results in a way any user can understand."""
+     from rich.table import Table
+     from rich.panel import Panel
+     from rich.text import Text
+
+     # Category descriptions humans understand
+     CAT_EXPLAIN = {
+         "instruction_following": "Can it follow your instructions exactly?",
+         "code_generation": "Can it write working code?",
+         "factual_accuracy": "Does it give real facts or make things up?",
+         "reasoning": "Can it think step-by-step and solve problems?",
+         "consistency": "Does it give the same answer every time?",
+         "pressure_resistance": "Does it hold its ground when you push back?",
+         "speed": "How fast does it generate on your hardware?",
+         "context_recall": "Can it find specific info in longer text?",
+         "timeout": "Tests that took too long to complete",
+         "error": "Tests that encountered errors",
+     }
+
+     console.print()
+
+     # ── Overall scores (big and clear) ──
+     console.print("[bold]Overall Scores[/bold]")
+     console.print("[dim]Higher is better. 100 = perfect on every test.[/dim]")
+     console.print()
+
+     for i, r in enumerate(results):
+         color = get_model_color(i)
+         score = r.overall_score * 100
+         bar_width = 30
+         filled = int((score / 100) * bar_width)
+         bar = "[green]" + "█" * filled + "[/green]" + "[dim]░[/dim]" * (bar_width - filled)
+         score_color = "green" if score >= 70 else "yellow" if score >= 50 else "red"
+         console.print(
+             f" [{color}]{r.model:<20}[/{color}] {bar} [{score_color}]{score:.0f}/100[/{score_color}]"
+             f" [dim]({r.total_passed}/{r.total_tests} tests passed)[/dim]"
+         )
+     console.print()
+
+     # ── Category breakdown with explanations ──
+     # Group tests by category
+     all_cats = []
+     seen_cats = set()
+     for r in results:
+         for br in r.results:
+             if br.category not in seen_cats:
+                 all_cats.append(br.category)
+                 seen_cats.add(br.category)
+
+     for cat in all_cats:
+         explain = CAT_EXPLAIN.get(cat, cat.replace("_", " ").title())
+         cat_label = cat.replace("_", " ").title()
+
+         # Get scores for this category per model
+         cat_scores = []
+         for i, r in enumerate(results):
+             score = r.category_scores.get(cat, 0) * 100
+             cat_scores.append((r.model, score, get_model_color(i)))
+
+         # Find best in category
+         best_score = max(s for _, s, _ in cat_scores) if cat_scores else 0
+
+         console.print(f" [bold]{cat_label}[/bold] [dim]{explain}[/dim]")
+
+         # Per-model scores for this category
+         for model_name, score, color in cat_scores:
+             is_best = score == best_score and best_score > 0 and len(results) > 1
+             badge = " [green]BEST[/green]" if is_best else ""
+             score_color = "green" if score >= 70 else "yellow" if score >= 50 else "red"
+             console.print(f" [{color}]{model_name:<18}[/{color}] [{score_color}]{score:>3.0f}%[/{score_color}]{badge}")
+
+         # Show individual test results for this category
+         for r in results[:1]:  # use first model's test list as reference
+             cat_tests = [br for br in r.results if br.category == cat]
+             for test in cat_tests:
+                 # Collect all model results for this test
+                 test_results = []
+                 for ri, rr in enumerate(results):
+                     br = next((b for b in rr.results if b.name == test.name), None)
+                     if br:
+                         if br.passed:
+                             test_results.append(f"[green]PASS[/green]")
+                         else:
+                             test_results.append(f"[red]FAIL[/red]")
+                     else:
+                         test_results.append("[dim]--[/dim]")
+
+                 result_str = " ".join(test_results)
+                 # Show what the test actually checked
+                 desc = test.description if test.description else test.name
+                 console.print(f" [dim]{desc}:[/dim] {result_str}")
+
+         console.print()
+
+     # ── Winner explanation ──
+     if len(results) > 1:
+         best = max(results, key=lambda r: r.overall_score)
+         best_color = get_model_color(results.index(best))
+
+         # Build explanation
+         strengths = []
+         weaknesses = []
+         for cat in all_cats:
+             if cat in ("timeout", "error"):
+                 continue
+             scores = [(r.model, r.category_scores.get(cat, 0)) for r in results]
+             scores.sort(key=lambda x: x[1], reverse=True)
+             if scores[0][0] == best.model and scores[0][1] > 0:
+                 strengths.append(cat.replace("_", " "))
+             elif len(scores) > 1 and scores[0][0] != best.model and scores[0][1] > scores[-1][1]:
+                 weaknesses.append((cat.replace("_", " "), scores[0][0]))
+
+         explanation_parts = [
+             f"[bold {best_color}]{best.model}[/bold {best_color}] scored [bold]{best.overall_score * 100:.0f}/100[/bold]",
+             f"passing {best.total_passed} of {best.total_tests} tests.",
+         ]
+         if strengths:
+             explanation_parts.append(f"\nStrong at: {', '.join(strengths)}.")
+         if weaknesses:
+             weak_strs = [f"{cat} (beaten by {model})" for cat, model in weaknesses[:3]]
+             explanation_parts.append(f"Weaker at: {', '.join(weak_strs)}.")
+
+         console.print(Panel(
+             " ".join(explanation_parts),
+             title="[bold]Winner[/bold]",
+             border_style="green",
+             padding=(1, 2),
+         ))
+     elif len(results) == 1:
+         r = results[0]
+         console.print(Panel(
+             f"[bold]{r.model}[/bold] scored [bold]{r.overall_score * 100:.0f}/100[/bold], "
+             f"passing {r.total_passed} of {r.total_tests} tests.",
+             title="[bold]Result[/bold]",
+             border_style="cyan",
+             padding=(1, 2),
+         ))
+     console.print()
+
+
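The score bars above are ordinary Rich markup: a 0-100 score is mapped onto a 30-cell track and split into filled and empty cells with int() truncation. The same arithmetic in isolation, with a made-up score:

    score = 73.0                                 # percent, 0-100
    bar_width = 30
    filled = int((score / 100) * bar_width)      # 21 filled cells
    bar = "█" * filled + "░" * (bar_width - filled)
    print(bar)                                   # 21 filled + 9 empty cells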
+ # ---------------------------------------------------------------------------
+ # arbiter swe -- SWE-bench style testing in Docker
+ # ---------------------------------------------------------------------------
+
+ @app.command(context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def swe(
+     ctx: typer.Context,
+     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output format: json"),
+ ) -> None:
+     """Run SWE-bench style tests in Docker containers.
+
+     Real buggy code + real test suites. Models write patches,
+     Docker runs pytest to verify. The gold standard for code testing.
+
+     Requires Docker Desktop to be running.
+
+     Examples:
+         arbiter swe (auto-detect models)
+         arbiter swe gemma4:e2b qwen3.5:4b (specific models)
+     """
+     from arbiter.core.swe.container import check_docker
+     from arbiter.core.swe.runner import run_swe_comparison
+     from arbiter.core.swe.test_packs import TOTAL_SWE_TESTS
+     from rich.table import Table
+     from rich.panel import Panel
+
+     print_header()
+
+     # Check Docker
+     if not check_docker():
+         print_error("Docker is not running.")
+         console.print("[dim]Install Docker Desktop: https://docker.com/products/docker-desktop[/dim]")
+         console.print("[dim]Then start it and try again.[/dim]")
+         raise typer.Exit(1)
+
+     # Models
+     models = ctx.args if ctx.args else None
+     if not models:
+         console.print("[dim]Auto-detecting models...[/dim]")
+         detected = asyncio.run(_auto_select_models(max_models=2))
+         if not detected:
+             print_error("No models found.")
+             raise typer.Exit(1)
+         models = detected
+
+     model_str = " vs ".join(
+         f"[{get_model_color(i)}]{m}[/{get_model_color(i)}]"
+         for i, m in enumerate(models)
+     )
+     console.print(f"[bold magenta]SWE Testing[/bold magenta] {model_str}")
+     console.print(f"[dim]{TOTAL_SWE_TESTS} real code tests in Docker containers[/dim]")
+     console.print()
+
+     with console.status("[bold magenta]Running SWE tests (this takes a while)..."):
+         results = asyncio.run(run_swe_comparison(models))
+
+     if output == "json":
+         import json
+         console.print_json(json.dumps([r.to_dict() for r in results], indent=2))
+         return
+
+     # Display results
+     table = Table(title="SWE Test Results", border_style="dim")
+     table.add_column("Model", style="bold")
+     table.add_column("Passed", justify="right")
+     table.add_column("Total", justify="right")
+     table.add_column("Pass Rate", justify="right", style="bold")
+
+     for i, r in enumerate(results):
+         color = get_model_color(i)
+         rate = f"{r.pass_rate * 100:.0f}%"
+         rate_color = "green" if r.pass_rate >= 0.7 else "yellow" if r.pass_rate >= 0.5 else "red"
+         table.add_row(
+             f"[{color}]{r.model}[/{color}]",
+             str(r.total_passed), str(r.total_tests),
+             f"[{rate_color}]{rate}[/{rate_color}]",
+         )
+     console.print(table)
+     console.print()
+
+     # Per-test breakdown
+     detail = Table(title="Test Details", border_style="dim")
+     detail.add_column("Test", style="bold")
+     detail.add_column("Category", style="dim")
+     for i, r in enumerate(results):
+         detail.add_column(f"[{get_model_color(i)}]{r.model}[/{get_model_color(i)}]", justify="center")
+
+     if results:
+         for ti, test in enumerate(results[0].results):
+             row = [test.test_case, test.category]
+             for r in results:
+                 t = r.results[ti] if ti < len(r.results) else None
+                 if t and t.passed:
+                     row.append(f"[green]{t.tests_passed}/{t.tests_total} PASS[/green]")
+                 elif t:
+                     row.append(f"[red]{t.tests_passed}/{t.tests_total} FAIL[/red]")
+                 else:
+                     row.append("[dim]--[/dim]")
+             detail.add_row(*row)
+
+     console.print(detail)
+     console.print()
+
+     if len(results) > 1:
+         best = max(results, key=lambda r: r.pass_rate)
+         console.print(Panel(
+             f"[bold green]{best.model}[/bold green] passed {best.total_passed}/{best.total_tests} tests ({best.pass_rate*100:.0f}%)",
+             title="[bold]SWE Winner[/bold]",
+             border_style="green",
+         ))
+         console.print()
+
+
+ # ---------------------------------------------------------------------------
+ # arbiter discover / leaderboard / config
+ # ---------------------------------------------------------------------------
+
+ @app.command()
+ def discover() -> None:
+     """List all available models across providers."""
+     from arbiter.core.discover import discover_all
+
+     print_header()
+     with console.status("[bold cyan]Discovering models..."):
+         models = asyncio.run(discover_all())
+     print_discover(models)
+
+
+ @app.command()
+ def leaderboard() -> None:
+     """View the persistent ELO leaderboard."""
+     from arbiter.core.leaderboard import Leaderboard
+
+     print_header()
+     lb = Leaderboard()
+     print_leaderboard(lb)
+
+
+ @app.command()
+ def config(
+     ollama_host: Optional[str] = typer.Option(None, "--ollama-host", help="Set Ollama API host"),
+     show_config: bool = typer.Option(False, "--show", help="Show current config"),
+ ) -> None:
+     """View or modify Arbiter configuration."""
+     from arbiter.core.config import load_config, save_config, get_ollama_host
+
+     if show_config:
+         cfg = load_config()
+         console.print(f"Ollama host: {get_ollama_host()}")
+         for k, v in cfg.items():
+             console.print(f"{k}: {v}")
+         return
+
+     if ollama_host:
+         cfg = load_config()
+         cfg["ollama_host"] = ollama_host
+         save_config(cfg)
+         console.print(f"[green]Ollama host set to: {ollama_host}[/green]")
+
+
+ def entry() -> None:
+     """Entry point for the CLI."""
+     app()
+
+
+ if __name__ == "__main__":
+     entry()
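Because app is a plain Typer application and entry() simply invokes it, the commands can be smoke-tested in-process with Typer's test runner instead of a subprocess. A minimal sketch (assumes the package is installed; --help never touches Ollama):

    from typer.testing import CliRunner
    from arbiter.cli.app import app

    runner = CliRunner()
    result = runner.invoke(app, ["--help"])
    assert result.exit_code == 0
    assert "Compare LLMs side-by-side" in result.output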