arbiter-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en" class="dark">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Arbiter - The Final Word on Your Local Models</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
9
+ <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet" />
10
+ <script type="module" crossorigin src="/assets/index-dHa4zmvw.js"></script>
11
+ <link rel="stylesheet" crossorigin href="/assets/index-1tkxJouQ.css">
12
+ </head>
13
+ <body class="bg-arbiter-bg text-white font-sans antialiased">
14
+ <div id="root"></div>
15
+ </body>
16
+ </html>
@@ -0,0 +1,426 @@
1
+ """FastAPI dashboard backend with WebSocket streaming."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import os
8
+ import signal
9
+ import webbrowser
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ import uvicorn
14
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.responses import FileResponse, HTMLResponse
17
+ from fastapi.staticfiles import StaticFiles
18
+
19
+ from arbiter.core.judge import judge_comparison
20
+ from arbiter.core.leaderboard import Leaderboard
21
+ from arbiter.core.metrics import ComparisonResult, ModelMetrics
22
+ from arbiter.core.runner import stream_comparison, run_single_model
23
+ from arbiter.core.config import resolve_model
24
+
25
# Expected location of the built frontend bundle, next to this module.
FRONTEND_DIR = Path(__file__).parent / "frontend" / "dist"

app = FastAPI(title="Arbiter Dashboard")

# The server binds to 127.0.0.1 and exposes POST endpoints that start model
# runs. A wildcard origin would let any web page open in the user's browser
# call those endpoints and read the responses, so cross-origin access is
# limited to pages served from the loopback host (any port, which keeps a
# local frontend dev server working).
app.add_middleware(
    CORSMiddleware,
    allow_origin_regex=r"https?://(localhost|127\.0\.0\.1|\[::1\])(:\d+)?",
    allow_methods=["*"],
    allow_headers=["*"],
)

# State for the current comparison. It is written by start_server() and
# POST /api/run, and read by the HTTP and WebSocket handlers.
_comparison_state: dict = {
    "model_specs": [],
    "prompt": "",
    "image_path": None,
    "judge_model": "auto",
    "no_judge": False,
    "system": None,
    "result": None,
    "leaderboard": None,
    "sequential": False,
}
48
+
49
+
50
@app.get("/api/health")
async def health():
    """Liveness probe: always answers with an ``ok`` status."""
    payload = {"status": "ok"}
    return payload
53
+
54
+
55
@app.get("/api/leaderboard")
async def get_leaderboard():
    """Return a freshly constructed leaderboard serialized as a dictionary."""
    return Leaderboard().to_dict()
59
+
60
+
61
@app.get("/api/config")
async def get_config():
    """Expose the model specs and prompt of the configured comparison."""
    exposed_keys = ("model_specs", "prompt")
    return {key: _comparison_state[key] for key in exposed_keys}
67
+
68
+
69
@app.get("/api/result")
async def get_result():
    """Return the stored comparison result, or a pending marker if none."""
    stored = _comparison_state["result"]
    if not stored:
        return {"status": "pending"}
    return stored.to_dict()
74
+
75
+
76
@app.get("/api/models")
async def get_models():
    """List all available models with memory fitness info.

    Each provider discovery function is awaited in turn. Discovery is
    best-effort: a provider that raises is skipped so the remaining
    providers are still listed, and the failure is logged at debug level
    instead of being dropped silently.

    Returns:
        A dict with a ``"system"`` entry (memory figures taken from
        ``get_system_memory()``) and a ``"models"`` list with one entry per
        discovered model.
    """
    import logging

    from arbiter.core.discover import (
        discover_ollama, discover_openai, discover_anthropic,
        discover_google, get_system_memory,
    )

    log = logging.getLogger(__name__)

    all_models = []
    for discover_fn in (discover_ollama, discover_openai, discover_anthropic, discover_google):
        try:
            found = await discover_fn()
            all_models.extend(found)
        except Exception:
            # Keep the listing usable when one provider fails, but leave a
            # trace so the failure can be diagnosed.
            log.debug(
                "Model discovery via %s failed",
                getattr(discover_fn, "__name__", discover_fn),
                exc_info=True,
            )

    mem = get_system_memory()

    return {
        "system": {
            "total_ram_gb": mem["total_gb"],
            "available_ram_gb": mem["available_gb"],
            "ram_percent_used": mem["percent"],
        },
        "models": [
            {
                "name": m.name,
                "provider": m.provider,
                "size_gb": m.size_gb,
                "parameter_size": m.parameter_size,
                "family": m.family,
                "multimodal": m.multimodal,
                "spec": m.spec,
                "fits_in_memory": m.fits_in_memory,
                "memory_warning": m.memory_warning,
            }
            for m in all_models
        ],
    }
115
+
116
+
117
@app.post("/api/run")
async def start_run(body: dict):
    """Configure a comparison from the dashboard.

    Body: {"models": ["gemma4:e2b", "qwen3.5:4b"], "prompt": "...", "sequential": true}

    Optional keys: "no_judge" (default False), "judge_model" (default
    "auto") and "system" (default None).

    This handler only records the request in the shared comparison state
    and clears any previous result; the WebSocket endpoint reads that state
    when a client connects.

    Returns:
        ``{"status": "started", "models": [...], "prompt": "..."}`` on
        success, or ``{"error": "..."}`` when the models or prompt are
        missing or malformed.
    """
    models = body.get("models", [])
    prompt = body.get("prompt", "")
    sequential = body.get("sequential", True)

    # A bare string would be stored as-is and later iterated character by
    # character by the WebSocket handler, so treat it as a one-model list.
    if isinstance(models, str):
        models = [models]

    models_ok = (
        isinstance(models, list)
        and bool(models)
        and all(isinstance(spec, str) and spec for spec in models)
    )
    prompt_ok = isinstance(prompt, str) and bool(prompt)
    if not models_ok or not prompt_ok:
        return {"error": "Need at least one model and a prompt"}

    # Update state so the WebSocket picks it up
    _comparison_state.update({
        "model_specs": models,
        "prompt": prompt,
        "sequential": sequential,
        "no_judge": body.get("no_judge", False),
        "judge_model": body.get("judge_model", "auto"),
        "system": body.get("system"),
        "image_path": None,
        "result": None,
    })

    return {"status": "started", "models": models, "prompt": prompt}
143
+
144
+
145
@app.post("/api/benchmark")
async def start_benchmark(body: dict):
    """Run a benchmark from the dashboard and return its results.

    Body: {"models": ["gemma4:e2b"], "quick": false}

    The benchmark is awaited inside the request, so the response is only
    sent once ``run_benchmark_comparison`` has finished.

    Returns:
        ``{"results": [...]}`` with each result serialized via
        ``to_dict()``, or ``{"error": "..."}`` when no model is given.
    """
    from arbiter.core.benchmarks import run_benchmark_comparison

    models = body.get("models", [])
    quick = body.get("quick", False)

    if not models:
        return {"error": "Need at least one model"}

    results = await run_benchmark_comparison(models, quick=quick)
    return {"results": [r.to_dict() for r in results]}
161
+
162
+
163
@app.get("/api/benchmark/categories")
async def get_benchmark_categories():
    """Report the benchmark categories and test counts for the UI."""
    from arbiter.core.benchmarks import ALL_TESTS, CATEGORIES, QUICK_TESTS

    total_count = len(ALL_TESTS)
    quick_count = len(QUICK_TESTS)
    return {
        "categories": CATEGORIES,
        "total_tests": total_count,
        "quick_tests": quick_count,
    }
172
+
173
+
174
def _token_event(model_name, metrics):
    """Build the ``token`` WebSocket event from a model's in-progress metrics."""
    return {
        "type": "token",
        "model": model_name,
        # Only the last 50 characters of the output so far are sent.
        "text": metrics.output[-50:],
        "total_text_length": len(metrics.output),
        "metrics": {
            "tokens": metrics._token_count,
            "tokens_per_sec": round(metrics.tokens_per_sec, 1) if metrics.tokens_per_sec else None,
            "ttft_ms": round(metrics.ttft_ms, 1) if metrics.ttft_ms else None,
        },
    }


@app.websocket("/ws")
async def websocket_endpoint(ws: WebSocket):
    """WebSocket endpoint for real-time comparison streaming.

    Sends events:
    {"type": "config", "models": [...], "prompt": "..."}
    {"type": "leaderboard", "data": {...}}
    {"type": "idle", "message": "..."}        (no models configured)
    {"type": "ping"}                           (keep-alive while idle)
    {"type": "start", "model": "gemma4"}
    {"type": "token", "model": "gemma4", "text": "...", "metrics": {...}}
    {"type": "done", "model": "gemma4", "metrics": {...}}
    {"type": "error", "model": "gemma4", "metrics": {...}}   (one model failed)
    {"type": "judging"}
    {"type": "result", "data": {...}}
    {"type": "error", "message": "..."}       (the run as a whole failed)

    The configuration is read from the shared comparison state. If a result
    is already stored it is replayed and the handler returns; otherwise the
    comparison is run (sequentially or in parallel), optionally judged,
    scored, recorded on the leaderboard, and stored as the current result.
    """
    await ws.accept()

    async def _drain_queue(q, done_event):
        # Forward queued token events in order until the producer has
        # finished and the queue is empty.
        while not done_event.is_set() or not q.empty():
            try:
                msg = q.get_nowait()
            except asyncio.QueueEmpty:
                await asyncio.sleep(0.05)
            else:
                await ws.send_json(msg)

    try:
        # Send config
        await ws.send_json(
            {
                "type": "config",
                "models": _comparison_state["model_specs"],
                "prompt": _comparison_state["prompt"],
            }
        )

        # Always send leaderboard data
        lb = Leaderboard()
        await ws.send_json({"type": "leaderboard", "data": lb.to_dict()})

        # If no models configured, just keep connection alive (dashboard-only mode)
        if not _comparison_state["model_specs"]:
            await ws.send_json({"type": "idle", "message": "No comparison running."})
            while True:
                try:
                    await asyncio.wait_for(ws.receive_text(), timeout=30)
                except asyncio.TimeoutError:
                    try:
                        await ws.send_json({"type": "ping"})
                    except Exception:
                        break
                except WebSocketDisconnect:
                    break
            return

        # If we already have a result, send it immediately
        if _comparison_state["result"]:
            await ws.send_json(
                {
                    "type": "result",
                    "data": _comparison_state["result"].to_dict(),
                }
            )
            lb = Leaderboard()
            await ws.send_json({"type": "leaderboard", "data": lb.to_dict()})
            return

        # Stream the comparison
        all_metrics = []
        is_seq = _comparison_state.get("sequential", False)

        if is_seq:
            # Sequential: run one model at a time, queue tokens for ordered sends
            for spec in _comparison_state["model_specs"]:
                cfg = resolve_model(spec)
                model_name = cfg.extra["model"]
                await ws.send_json({"type": "start", "model": model_name})

                token_queue = asyncio.Queue()

                # The callback is synchronous, so it only enqueues; the
                # drain task performs the actual (async) sends.
                def _on_token_sync(name, text, m, _q=token_queue):
                    _q.put_nowait(_token_event(name, m))

                try:
                    done_event = asyncio.Event()
                    drain_task = asyncio.create_task(_drain_queue(token_queue, done_event))

                    try:
                        metrics = await run_single_model(
                            model_spec=spec,
                            prompt=_comparison_state["prompt"],
                            system=_comparison_state.get("system"),
                            image_path=_comparison_state.get("image_path"),
                            on_token=_on_token_sync,
                        )
                    finally:
                        # Always stop and await the drainer, on failure too,
                        # so queued tokens are flushed before the done/error
                        # event and no task keeps sending in the background.
                        done_event.set()
                        await drain_task

                    all_metrics.append(metrics)
                    await ws.send_json({"type": "done", "model": model_name, "metrics": metrics.to_dict()})
                except Exception as e:
                    m = ModelMetrics(model=model_name, provider=cfg.provider, output=f"[ERROR] {e}")
                    all_metrics.append(m)
                    await ws.send_json({"type": "error", "model": model_name, "metrics": m.to_dict()})
        else:
            # Parallel: stream all at once
            async for event_type, model_name, metrics in stream_comparison(
                model_specs=_comparison_state["model_specs"],
                prompt=_comparison_state["prompt"],
                system=_comparison_state.get("system"),
                image_path=_comparison_state.get("image_path"),
            ):
                if event_type == "start":
                    await ws.send_json({"type": "start", "model": model_name})
                elif event_type == "token":
                    await ws.send_json(_token_event(model_name, metrics))
                elif event_type in ("done", "error"):
                    all_metrics.append(metrics)
                    await ws.send_json({
                        "type": "done" if event_type == "done" else "error",
                        "model": model_name,
                        "metrics": metrics.to_dict(),
                    })

        # Build comparison result
        result = ComparisonResult(
            prompt=_comparison_state["prompt"],
            models=all_metrics,
        )

        # Judge (only meaningful with more than one model)
        has_quality = False
        if not _comparison_state["no_judge"] and len(all_metrics) > 1:
            await ws.send_json({"type": "judging"})
            result = await judge_comparison(
                result, judge_model=_comparison_state["judge_model"]
            )
            has_quality = True

        # Compute composite scores and winner
        from arbiter.core.metrics import compute_composite_scores
        result.scoring = compute_composite_scores(result, has_quality=has_quality)
        result.winner = result.scoring.winner if result.scoring else None

        # Update leaderboard
        if len(all_metrics) > 1:
            lb = Leaderboard()
            lb.update_from_comparison(result)
            await ws.send_json({"type": "leaderboard", "data": lb.to_dict()})

        _comparison_state["result"] = result
        await ws.send_json({"type": "result", "data": result.to_dict()})

    except WebSocketDisconnect:
        pass
    except Exception as e:
        # Best-effort report to the client; the socket may already be gone.
        try:
            await ws.send_json({"type": "error", "message": str(e)})
        except Exception:
            pass
352
+
353
+
354
+ def _find_frontend_dist() -> Optional[Path]:
355
+ """Find the frontend dist directory, checking multiple locations."""
356
+ candidates = [
357
+ Path(__file__).parent / "frontend" / "dist",
358
+ Path(__file__).resolve().parent / "frontend" / "dist",
359
+ ]
360
+ for c in candidates:
361
+ if c.exists() and (c / "index.html").exists():
362
+ return c
363
+ return None
364
+
365
+
366
# Resolved once at import time; the static mount is registered only when
# the bundle ships an assets directory.
_DIST = _find_frontend_dist()

if _DIST is not None and (_DIST / "assets").exists():
    app.mount(
        "/assets",
        StaticFiles(directory=str(_DIST / "assets")),
        name="assets",
    )
370
+
371
+
372
@app.get("/")
async def serve_index():
    """Serve the built dashboard page, or build instructions if it is missing."""
    # Looked up on every request rather than reusing the import-time value.
    dist = _find_frontend_dist()
    if dist is None:
        return HTMLResponse(
            "<h1>Arbiter Dashboard</h1>"
            "<p>Frontend not built yet. Run:</p>"
            "<pre>cd arbiter/dashboard/frontend && npm install && npm run build</pre>"
            "<p>WebSocket API is available at /ws</p>"
        )
    return FileResponse(str(dist / "index.html"))
384
+
385
+
386
async def start_server(
    model_specs: list[str],
    prompt: str,
    image_path: Optional[str] = None,
    judge_model: str = "auto",
    no_judge: bool = False,
    system: Optional[str] = None,
    sequential: bool = False,
    port: int = 7878,
) -> None:
    """Start the dashboard server and open the browser.

    The arguments are stored in the shared comparison state (clearing any
    previous result) so the WebSocket handler can pick them up. The server
    listens on 127.0.0.1 at ``port``. Returns when the server stops.

    Opening the browser is best-effort: it runs off the event loop, and a
    failure to launch one is logged instead of stopping the server.
    """
    _comparison_state.update(
        {
            "model_specs": model_specs,
            "prompt": prompt,
            "image_path": image_path,
            "judge_model": judge_model,
            "no_judge": no_judge,
            "system": system,
            "sequential": sequential,
            "result": None,
        }
    )

    config = uvicorn.Config(
        app,
        host="127.0.0.1",
        port=port,
        log_level="warning",
    )
    server = uvicorn.Server(config)

    # Open browser after a short delay
    async def _open_browser():
        import logging

        await asyncio.sleep(1)
        url = f"http://127.0.0.1:{port}"
        loop = asyncio.get_running_loop()
        try:
            # webbrowser.open may block, so keep it off the event loop
            # that is serving requests.
            await loop.run_in_executor(None, webbrowser.open, url)
        except (webbrowser.Error, OSError):
            # Without this, the exception would propagate through gather()
            # and end start_server() while the dashboard is still usable.
            logging.getLogger(__name__).warning(
                "Could not open a browser; visit %s manually", url
            )

    await asyncio.gather(
        server.serve(),
        _open_browser(),
    )