arbiter-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,197 @@
1
+ """Persistent ELO leaderboard tracking model rankings over time."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from datetime import datetime, timezone
8
+ from typing import Optional
9
+
10
+ from arbiter.core.config import LEADERBOARD_FILE, ensure_arbiter_dir
11
+ from arbiter.core.metrics import ComparisonResult
12
+
13
+
14
# ELO constants
K_FACTOR = 32  # base K-factor; every ELO adjustment is a fraction of this value
DEFAULT_ELO = 1500  # rating given to a model that has no stored rating yet


@dataclass
class ModelRating:
    """Persistent rating for a single model.

    Attributes:
        name: Model identifier; also the key used by the leaderboard.
        elo: Current ELO rating.
        wins / losses / draws: Outcome counters.
        avg_tokens_sec: Running average of generation speed, or None if no
            speed sample has been recorded.
        avg_quality: Running average of the quality score, or None if no
            score has been recorded.
        total_comparisons: Number of comparisons the model took part in.
        elo_history: Chronological list of ELO values.
        last_seen: ISO-8601 timestamp string of the latest comparison.
    """

    name: str
    elo: float = DEFAULT_ELO
    wins: int = 0
    losses: int = 0
    draws: int = 0
    avg_tokens_sec: Optional[float] = None
    avg_quality: Optional[float] = None
    total_comparisons: int = 0
    elo_history: list[float] = field(default_factory=list)
    last_seen: Optional[str] = None

    @property
    def win_rate(self) -> float:
        """Return wins as a fraction (0.0-1.0) of wins + losses + draws.

        Returns 0.0 when no outcome has been recorded yet.
        """
        total = self.wins + self.losses + self.draws
        if total == 0:
            return 0.0
        return self.wins / total

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict.

        Floats are rounded to one decimal, ``elo_history`` is truncated to
        its 20 most recent entries, and ``win_rate`` is emitted as a
        percentage (0-100).
        """
        return {
            "name": self.name,
            "elo": round(self.elo, 1),
            "wins": self.wins,
            "losses": self.losses,
            "draws": self.draws,
            # Compare against None explicitly: a legitimate 0.0 average must
            # survive serialization instead of collapsing to None.
            "avg_tokens_sec": (
                None if self.avg_tokens_sec is None
                else round(self.avg_tokens_sec, 1)
            ),
            "avg_quality": (
                None if self.avg_quality is None
                else round(self.avg_quality, 1)
            ),
            "total_comparisons": self.total_comparisons,
            "elo_history": [round(e, 1) for e in self.elo_history[-20:]],
            "last_seen": self.last_seen,
            "win_rate": round(self.win_rate * 100, 1),
        }

    @classmethod
    def from_dict(cls, d: dict) -> ModelRating:
        """Build a rating from a dict such as the one produced by ``to_dict``.

        Only ``"name"`` is required; every other key falls back to the field
        default. Extra keys (e.g. ``"win_rate"``) are ignored.

        Raises:
            KeyError: If ``"name"`` is missing.
        """
        return cls(
            name=d["name"],
            elo=d.get("elo", DEFAULT_ELO),
            wins=d.get("wins", 0),
            losses=d.get("losses", 0),
            draws=d.get("draws", 0),
            avg_tokens_sec=d.get("avg_tokens_sec"),
            avg_quality=d.get("avg_quality"),
            total_comparisons=d.get("total_comparisons", 0),
            # Copy so later appends never mutate the caller's list, and
            # tolerate a stored null history.
            elo_history=list(d.get("elo_history") or []),
            last_seen=d.get("last_seen"),
        )
74
+
75
+
76
class Leaderboard:
    """Persistent ELO leaderboard stored in ~/.arbiter/leaderboard.json.

    Ratings are loaded from ``LEADERBOARD_FILE`` on construction and written
    back after every ``update_from_comparison`` call.
    """

    def __init__(self):
        self.ratings: dict[str, ModelRating] = {}
        self._load()

    def _load(self) -> None:
        """Load leaderboard from disk.

        A missing file leaves the leaderboard empty. A file that cannot be
        parsed, or that parses but does not have the expected
        ``{"models": [{...}, ...]}`` shape, is treated as corrupt and also
        results in an empty leaderboard rather than an exception.
        """
        if not LEADERBOARD_FILE.exists():
            return
        try:
            # _save only emits ASCII (json.dump escapes non-ASCII by default),
            # so reading as UTF-8 stays compatible with existing files.
            with open(LEADERBOARD_FILE, encoding="utf-8") as f:
                data = json.load(f)
            loaded: dict[str, ModelRating] = {}
            for entry in data.get("models", []):
                rating = ModelRating.from_dict(entry)
                loaded[rating.name] = rating
        except (ValueError, KeyError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError and undecodable bytes;
            # TypeError/AttributeError cover valid JSON of the wrong shape
            # (top-level list, "models": null, non-dict entries).
            loaded = {}
        self.ratings = loaded

    def _save(self) -> None:
        """Save leaderboard to disk, sorted by ELO, with an update timestamp."""
        ensure_arbiter_dir()
        data = {
            "models": [r.to_dict() for r in self.sorted_ratings()],
            "updated_at": datetime.now(timezone.utc).isoformat(),
        }
        with open(LEADERBOARD_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def _get_or_create(self, model_name: str) -> ModelRating:
        """Get existing rating or create (and register) a new default one."""
        if model_name not in self.ratings:
            self.ratings[model_name] = ModelRating(name=model_name)
        return self.ratings[model_name]

    def _expected_score(self, elo_a: float, elo_b: float) -> float:
        """Calculate expected score (0-1) for player A vs player B."""
        return 1.0 / (1.0 + 10 ** ((elo_b - elo_a) / 400))

    def _update_elo(self, winner: ModelRating, loser: ModelRating, k_scale: float = 1.0) -> None:
        """Update ELO ratings for a win/loss pair.

        Args:
            winner: Rating of the winning model (mutated in place).
            loser: Rating of the losing model (mutated in place).
            k_scale: Multiplier applied to ``K_FACTOR`` for this update.

        Both expected scores are computed from the pre-update ratings. The
        post-update rating is appended to each model's ``elo_history``, so
        the winner of an N-model comparison gains N-1 history entries.
        """
        k = K_FACTOR * k_scale
        expected_w = self._expected_score(winner.elo, loser.elo)
        expected_l = self._expected_score(loser.elo, winner.elo)

        winner.elo += k * (1 - expected_w)
        loser.elo += k * (0 - expected_l)

        winner.elo_history.append(winner.elo)
        loser.elo_history.append(loser.elo)

    def update_from_comparison(self, result: ComparisonResult) -> None:
        """Update the leaderboard from a comparison result.

        Updates ELO, win/loss counts, and running averages, then persists
        the leaderboard to disk.
        """
        now = datetime.now(timezone.utc).isoformat()
        models = result.models
        winner_name = result.winner

        # Update per-model stats
        for m in models:
            rating = self._get_or_create(m.model)
            rating.total_comparisons += 1
            rating.last_seen = now

            # Update running average for tokens/sec (missing or zero speed
            # samples are skipped)
            if m.tokens_per_sec:
                if rating.avg_tokens_sec is None:
                    rating.avg_tokens_sec = m.tokens_per_sec
                else:
                    # Exponential moving average
                    rating.avg_tokens_sec = (
                        0.7 * rating.avg_tokens_sec + 0.3 * m.tokens_per_sec
                    )

            # Update running average for quality (same 0.7/0.3 weighting)
            if m.overall_score is not None:
                if rating.avg_quality is None:
                    rating.avg_quality = m.overall_score
                else:
                    rating.avg_quality = (
                        0.7 * rating.avg_quality + 0.3 * m.overall_score
                    )

        # Update ELO based on winner
        # Scale K_FACTOR by 1/n_opponents for multi-model comparisons
        if winner_name and len(models) >= 2:
            n_opponents = len(models) - 1
            winner_rating = self._get_or_create(winner_name)
            winner_rating.wins += 1

            for m in models:
                if m.model != winner_name:
                    loser_rating = self._get_or_create(m.model)
                    loser_rating.losses += 1
                    self._update_elo(winner_rating, loser_rating, k_scale=1.0 / n_opponents)
        else:
            # No clear winner -- count as draws; ELO is left unchanged but
            # still recorded in the history
            for m in models:
                rating = self._get_or_create(m.model)
                rating.draws += 1
                rating.elo_history.append(rating.elo)

        self._save()

    def sorted_ratings(self) -> list[ModelRating]:
        """Return ratings sorted by ELO (highest first)."""
        return sorted(self.ratings.values(), key=lambda r: r.elo, reverse=True)

    def to_dict(self) -> dict:
        """Serialize the full leaderboard."""
        return {
            "models": [r.to_dict() for r in self.sorted_ratings()],
        }

    def get_model_rank(self, model_name: str) -> Optional[int]:
        """Get 1-indexed rank for a model, or None if not found."""
        for i, r in enumerate(self.sorted_ratings(), 1):
            if r.name == model_name:
                return i
        return None
@@ -0,0 +1,367 @@
1
+ """Performance metrics collection for model comparisons."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional
8
+
9
+ import psutil
10
+
11
+
12
@dataclass
class ModelMetrics:
    """Timing, token, memory and quality measurements for one model run.

    Typical lifecycle: ``start()`` -> ``record_first_token()`` /
    ``record_token()`` while streaming -> ``finish()``.
    """

    model: str
    provider: str

    # Timing
    ttft_ms: Optional[float] = None  # time to first token, in milliseconds
    total_time_s: Optional[float] = None  # whole generation, in seconds
    tokens_per_sec: Optional[float] = None  # generation speed

    # Token counts
    total_tokens: int = 0
    prompt_tokens: Optional[int] = None

    # Memory (process RSS samples taken in start()/finish())
    memory_before_mb: Optional[float] = None
    memory_after_mb: Optional[float] = None
    peak_memory_delta_mb: Optional[float] = None  # after minus before

    # Quality (filled by judge)
    quality_scores: dict = field(default_factory=dict)
    overall_score: Optional[float] = None

    # The generated text
    output: str = ""

    # Streaming bookkeeping; excluded from repr and from to_dict()
    _start_time: Optional[float] = field(default=None, repr=False)
    _first_token_time: Optional[float] = field(default=None, repr=False)
    _token_count: int = field(default=0, repr=False)

    def start(self) -> None:
        """Record the generation start time and the starting memory sample."""
        self._start_time = time.perf_counter()
        self.memory_before_mb = _get_system_memory_mb()

    def record_first_token(self) -> None:
        """Stamp the first-token time and derive ``ttft_ms``.

        Does nothing if the first token was already recorded or ``start()``
        has not been called.
        """
        if self._first_token_time is not None or self._start_time is None:
            return
        self._first_token_time = time.perf_counter()
        elapsed_s = self._first_token_time - self._start_time
        self.ttft_ms = elapsed_s * 1000

    def record_token(self, text: str) -> None:
        """Count one streamed chunk and append its text to ``output``."""
        self._token_count += 1
        self.output += text

    def finish(self, provider_meta: dict | None = None) -> None:
        """Finalize timing, memory and token statistics.

        Args:
            provider_meta: Optional metadata dict from the provider. Values
                found in it take precedence over locally measured ones.
        """
        finished_at = time.perf_counter()
        self.memory_after_mb = _get_system_memory_mb()

        if self._start_time is not None:
            self.total_time_s = finished_at - self._start_time

        have_both_samples = (
            self.memory_before_mb is not None and self.memory_after_mb is not None
        )
        if have_both_samples:
            self.peak_memory_delta_mb = self.memory_after_mb - self.memory_before_mb

        # Provider-reported numbers win over our own measurements.
        if provider_meta:
            reported_count = provider_meta.get("eval_count")
            reported_duration_ns = provider_meta.get("eval_duration")

            if reported_count and reported_duration_ns:
                # Ollama reports eval_duration in nanoseconds
                self.total_tokens = reported_count
                self.tokens_per_sec = reported_count / (reported_duration_ns / 1e9)
            else:
                # Cloud providers: only a token count, speed is derived below
                reported_output = provider_meta.get("output_tokens")
                if reported_output:
                    self.total_tokens = reported_output

            # First truthy value among the known prompt-token keys
            reported_prompt = None
            for key in ("prompt_eval_count", "prompt_tokens", "input_tokens"):
                reported_prompt = provider_meta.get(key)
                if reported_prompt:
                    break
            if reported_prompt:
                self.prompt_tokens = reported_prompt

            # Nested usage dict fills whatever is still unset
            usage = provider_meta.get("usage", {})
            if usage:
                if not self.total_tokens:
                    self.total_tokens = usage.get("completion_tokens", 0)
                if not self.prompt_tokens:
                    self.prompt_tokens = usage.get("prompt_tokens")

        # Last resort: the number of chunks we counted while streaming
        if not self.total_tokens:
            self.total_tokens = self._token_count

        speed_missing = not self.tokens_per_sec
        if speed_missing and self.total_time_s and self.total_time_s > 0:
            # Exclude the wait for the first token from generation time
            generation_s = self.total_time_s
            if self.ttft_ms:
                generation_s -= self.ttft_ms / 1000
            if generation_s > 0 and self.total_tokens > 0:
                self.tokens_per_sec = self.total_tokens / generation_s

    def to_dict(self) -> dict:
        """Return a plain dict of the public metrics (for JSON/WebSocket).

        Timing and speed values that are unset or zero are emitted as None.
        """
        ttft = round(self.ttft_ms, 1) if self.ttft_ms else None
        total_time = round(self.total_time_s, 2) if self.total_time_s else None
        speed = round(self.tokens_per_sec, 1) if self.tokens_per_sec else None
        if self.peak_memory_delta_mb is None:
            memory_delta = None
        else:
            memory_delta = round(self.peak_memory_delta_mb, 1)

        return {
            "model": self.model,
            "provider": self.provider,
            "ttft_ms": ttft,
            "total_time_s": total_time,
            "tokens_per_sec": speed,
            "total_tokens": self.total_tokens,
            "prompt_tokens": self.prompt_tokens,
            "peak_memory_delta_mb": memory_delta,
            "quality_scores": self.quality_scores,
            "overall_score": self.overall_score,
            "output": self.output,
        }
134
+
135
+
136
@dataclass
class ScoreWeights:
    """Weights for composite scoring. Must sum to 1.0.

    The sum is a convention for callers; it is not validated here.
    """

    speed: float = 0.30
    quality: float = 0.50
    responsiveness: float = 0.20

    def redistribute_without_quality(self) -> "ScoreWeights":
        """Return new weights with quality removed (used for --no-judge).

        The quality weight is set to 0 and the speed and responsiveness
        weights are rescaled proportionally so they sum to 1.0. If both of
        them are zero (e.g. a quality-only configuration) there is no
        proportion to preserve, so the weight is split evenly instead of
        dividing by zero.
        """
        remaining = self.speed + self.responsiveness
        if remaining <= 0:
            return ScoreWeights(speed=0.5, quality=0.0, responsiveness=0.5)
        return ScoreWeights(
            speed=self.speed / remaining,
            quality=0.0,
            responsiveness=self.responsiveness / remaining,
        )
152
+
153
+
154
@dataclass
class ComponentScore:
    """One scoring dimension of one model.

    Attributes:
        metric_name: "Speed", "Quality" or "Responsiveness".
        raw_value: The measured value, e.g. 45.2.
        raw_unit: Unit label for ``raw_value``, e.g. "tok/s".
        normalized: Score in 0.0 - 1.0, relative to the best in the group.
        weight: The weight applied to this dimension.
        weighted: ``normalized * weight``.
        rank: 1-indexed rank within this dimension.
    """

    metric_name: str
    raw_value: float
    raw_unit: str
    normalized: float
    weight: float
    weighted: float
    rank: int
165
+
166
+
167
@dataclass
class ModelCompositeScore:
    """Every scoring component of one model plus its overall result.

    Attributes:
        model: Model identifier.
        components: Per-dimension scores for this model.
        composite: Sum of all weighted component scores.
        rank: Overall rank.
    """

    model: str
    components: list[ComponentScore]
    composite: float
    rank: int
175
+
176
+
177
@dataclass
class ScoringBreakdown:
    """Full explanation of how a comparison was scored.

    Attributes:
        weights: The weights that were applied.
        model_scores: Per-model composite scores.
        winner: Name of the winning model, if any.
        winner_reason: Human-readable summary, e.g.
            "Won Speed (45.2 vs 32.1 tok/s)...".
        formula: E.g. "Speed(30%) + Quality(50%) + Responsiveness(20%)".
    """

    weights: ScoreWeights
    model_scores: list[ModelCompositeScore]
    winner: Optional[str]
    winner_reason: str
    formula: str

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Numeric scores are rounded to three decimals, raw values are
        rendered as "<value> <unit>" strings and weights as percentages.
        ``weights`` itself is not included in the output.
        """
        serialized_models = []
        for entry in self.model_scores:
            parts = []
            for part in entry.components:
                parts.append(
                    {
                        "metric": part.metric_name,
                        "raw": f"{part.raw_value:.1f} {part.raw_unit}",
                        "normalized": round(part.normalized, 3),
                        "weight": f"{part.weight:.0%}",
                        "weighted": round(part.weighted, 3),
                        "rank": part.rank,
                    }
                )
            serialized_models.append(
                {
                    "model": entry.model,
                    "composite": round(entry.composite, 3),
                    "rank": entry.rank,
                    "components": parts,
                }
            )

        return {
            "formula": self.formula,
            "winner": self.winner,
            "winner_reason": self.winner_reason,
            "models": serialized_models,
        }
212
+
213
+
214
@dataclass
class ComparisonResult:
    """Outcome of comparing several models on one prompt.

    Attributes:
        prompt: The prompt given to the models.
        models: Metrics for each model that took part.
        winner: Name of the winning model, if one was determined.
        judge_model: Name of the judge model, if one was used.
        timestamp: Timestamp string for the comparison, if set.
        scoring: Scoring breakdown, if one was computed.
    """

    prompt: str
    models: list[ModelMetrics]
    winner: Optional[str] = None
    judge_model: Optional[str] = None
    timestamp: Optional[str] = None
    scoring: Optional[ScoringBreakdown] = None

    def to_dict(self) -> dict:
        """Return a plain-dict form; "scoring" is present only when set."""
        serialized_models = []
        for metrics in self.models:
            serialized_models.append(metrics.to_dict())

        payload = {
            "prompt": self.prompt,
            "models": serialized_models,
            "winner": self.winner,
            "judge_model": self.judge_model,
            "timestamp": self.timestamp,
        }
        if not self.scoring:
            return payload
        payload["scoring"] = self.scoring.to_dict()
        return payload
237
+
238
+
239
def compute_composite_scores(
    result: "ComparisonResult",
    weights: Optional[ScoreWeights] = None,
    has_quality: bool = True,
) -> Optional[ScoringBreakdown]:
    """Score every successful model run and explain who won.

    Each metric is normalized relative to the best performer:
      - Speed: model_tps / max_tps (higher is better)
      - Responsiveness: min_ttft / model_ttft (lower TTFT is better)
      - Quality: model.overall_score / 10 (only when its weight is > 0)

    Args:
        result: The comparison whose ``models`` are scored. Runs whose
            output starts with "[ERROR]" are excluded.
        weights: Scoring weights; defaults to ``ScoreWeights()``.
        has_quality: When False, the quality weight is redistributed to
            the other dimensions.

    Returns:
        A ScoringBreakdown, or None when no run is eligible for scoring.
    """
    contenders = []
    for run in result.models:
        if run.output.startswith("[ERROR]"):
            continue
        contenders.append(run)
    if len(contenders) == 0:
        return None

    active = weights if weights else ScoreWeights()
    if not has_quality:
        active = active.redistribute_without_quality()

    # Formula string lists only the dimensions that carry weight
    labelled_weights = (
        ("Speed", active.speed),
        ("Quality", active.quality),
        ("Responsiveness", active.responsiveness),
    )
    formula = " + ".join(
        f"{label}({share:.0%})" for label, share in labelled_weights if share > 0
    )

    # Raw values per model; missing measurements fall back to worst-case
    # defaults (0 speed, 0 quality, a very large TTFT)
    speed_by_model = {}
    ttft_by_model = {}
    quality_by_model = {}
    for run in contenders:
        speed_by_model[run.model] = run.tokens_per_sec or 0
        ttft_by_model[run.model] = run.ttft_ms or 999999
        quality_by_model[run.model] = run.overall_score or 0

    top_speed = max(speed_by_model.values()) or 1
    positive_ttfts = [value for value in ttft_by_model.values() if value > 0]
    fastest_ttft = min(positive_ttfts) if positive_ttfts else 1

    def _positions(values: dict, higher_better: bool) -> dict:
        # Map each model name to its 1-indexed place in this dimension
        ordered = sorted(
            values.items(), key=lambda pair: pair[1], reverse=higher_better
        )
        positions = {}
        for place, (label, _) in enumerate(ordered, start=1):
            positions[label] = place
        return positions

    speed_place = _positions(speed_by_model, higher_better=True)
    ttft_place = _positions(ttft_by_model, higher_better=False)
    quality_place = _positions(quality_by_model, higher_better=True)

    # One composite entry per eligible model
    scored = []
    for run in contenders:
        label = run.model

        speed_share = speed_by_model[label] / top_speed if top_speed > 0 else 0
        speed_part = ComponentScore(
            metric_name="Speed",
            raw_value=speed_by_model[label],
            raw_unit="tok/s",
            normalized=speed_share,
            weight=active.speed,
            weighted=speed_share * active.speed,
            rank=speed_place[label],
        )

        # Inverted ratio: the fastest first token scores 1.0
        ttft = ttft_by_model[label]
        resp_share = fastest_ttft / ttft if ttft > 0 else 0
        resp_part = ComponentScore(
            metric_name="Responsiveness",
            raw_value=ttft,
            raw_unit="ms TTFT",
            normalized=resp_share,
            weight=active.responsiveness,
            weighted=resp_share * active.responsiveness,
            rank=ttft_place[label],
        )

        parts = [speed_part, resp_part]

        # Quality is scored only when it carries weight
        if active.quality > 0:
            quality_share = quality_by_model[label] / 10
            parts.append(
                ComponentScore(
                    metric_name="Quality",
                    raw_value=quality_by_model[label],
                    raw_unit="/10",
                    normalized=quality_share,
                    weight=active.quality,
                    weighted=quality_share * active.quality,
                    rank=quality_place[label],
                )
            )

        total = sum(part.weighted for part in parts)
        scored.append(
            ModelCompositeScore(model=label, components=parts, composite=total, rank=0)
        )

    # Highest composite first; assign overall ranks in that order
    scored.sort(key=lambda entry: entry.composite, reverse=True)
    for place, entry in enumerate(scored, start=1):
        entry.rank = place

    winner = scored[0].model if scored else None

    # Explain the result: headline score, then each dimension of the winner
    # against the first matching component among the other models
    reasons = []
    if len(scored) >= 2:
        leader, runner_up = scored[0], scored[1]
        reasons.append(
            f"{leader.model} scored {leader.composite:.2f} vs {runner_up.model} at {runner_up.composite:.2f}"
        )
        for mine in leader.components:
            theirs = next(
                (
                    candidate
                    for other in scored[1:]
                    for candidate in other.components
                    if candidate.metric_name == mine.metric_name
                ),
                None,
            )
            if not theirs:
                continue
            if mine.rank == 1:
                reasons.append(
                    f"Led {mine.metric_name}: {mine.raw_value:.1f} {mine.raw_unit} vs {theirs.raw_value:.1f} {theirs.raw_unit}"
                )
            elif mine.rank > 1:
                reasons.append(
                    f"Trailed {mine.metric_name}: {mine.raw_value:.1f} {mine.raw_unit} vs {theirs.raw_value:.1f} {theirs.raw_unit}"
                )
    elif len(scored) == 1:
        reasons.append(f"{scored[0].model} was the only model that completed")

    return ScoringBreakdown(
        weights=active,
        model_scores=scored,
        winner=winner,
        winner_reason=". ".join(reasons),
        formula=formula,
    )
362
+
363
+
364
def _get_system_memory_mb() -> float:
    """Return the resident set size (RSS) of the current process, in MB."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_mb
@@ -0,0 +1,19 @@
1
+ """LLM provider abstraction layer."""
2
+
3
+ from arbiter.core.providers.base import LLMProvider, StreamChunk, GenerationResult
4
+ from arbiter.core.providers.ollama import OllamaProvider
5
+ from arbiter.core.providers.openai_provider import OpenAIProvider
6
+ from arbiter.core.providers.anthropic_provider import AnthropicProvider
7
+ from arbiter.core.providers.google_provider import GoogleProvider
8
+ from arbiter.core.providers.factory import create_provider
9
+
10
+ __all__ = [
11
+ "LLMProvider",
12
+ "StreamChunk",
13
+ "GenerationResult",
14
+ "OllamaProvider",
15
+ "OpenAIProvider",
16
+ "AnthropicProvider",
17
+ "GoogleProvider",
18
+ "create_provider",
19
+ ]