genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/bt_elo.py ADDED
@@ -0,0 +1,507 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """Bradley-Terry (BT) Elo rating utilities.
+
+ This module intentionally has **no** dependencies on code outside this package,
+ so `genarena_arena_evaluation` can be split out as an independent package.
+
+ The implementation follows the common BT-to-Elo conversion used by
+ VideoAutoArena-style scoring:
+ - Build a pairwise "win matrix" where a win counts as 2 points and a tie counts
+   as 1 point for each model.
+ - Fit BT parameters via a simple MM (minorization-maximization) iterative update
+   (no sklearn dependency).
+ - Convert BT parameters to Elo scale: Elo = SCALE * beta + INIT_RATING, where
+   beta is on a log(BASE) scale.
+ """
+
+ from __future__ import annotations
+
+ import math
+ import random
+ from collections import Counter
+ from collections.abc import Iterable, Sequence
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ # BT-to-Elo conversion constants (VideoAutoArena-style defaults)
+ SCALE: float = 400.0
+ BASE: float = 10.0
+ INIT_RATING: float = 1000.0
+
+ # Public type: (model_a, model_b, winner) where winner is "model_a"/"model_b"/"tie"
+ BattleTuple = tuple[str, str, str]
+
+
+ def compute_bt_elo_ratings(
+     battles: Sequence[BattleTuple],
+     *,
+     models: Optional[Iterable[str]] = None,
+     scale: float = SCALE,
+     base: float = BASE,
+     init_rating: float = INIT_RATING,
+     fixed_ratings: Optional[dict[str, float]] = None,
+     max_iters: int = 100,
+     tol: float = 1e-6,
+ ) -> dict[str, float]:
+     """Compute BT-derived Elo ratings from battle outcomes.
+
+     Args:
+         battles: List of (model_a, model_b, winner) tuples.
+             winner must be one of: "model_a", "model_b", "tie".
+         models: Optional iterable of model names to include even if they have no battles.
+         scale: Elo scale factor (default 400).
+         base: Log base used for BT parameterization (default 10).
+         init_rating: Elo offset / initial rating (default 1000).
+         fixed_ratings: Optional mapping of model name -> Elo rating for fixed anchors.
+             When provided and non-empty, those models' Elo values are kept **exactly**
+             unchanged, and only the remaining models are fit via BT MM updates.
+         max_iters: Maximum MM iterations.
+         tol: Convergence tolerance on the probability vector.
+
+     Returns:
+         Dict mapping model name to Elo rating.
+     """
+     model_set = set(models or [])
+     for a, b, _ in battles:
+         if a:
+             model_set.add(a)
+         if b:
+             model_set.add(b)
+
+     model_list = sorted(model_set)
+     if not model_list:
+         return {}
+     if len(model_list) == 1:
+         return {model_list[0]: init_rating}
+
+     fixed_ratings = fixed_ratings or {}
+     fixed_set = set(fixed_ratings.keys())
+     if fixed_set:
+         # Filter anchors to only those in the requested model universe
+         fixed_set = fixed_set & set(model_list)
+         fixed_ratings = {m: float(fixed_ratings[m]) for m in fixed_set}
+
+     if fixed_set:
+         # === Anchored BT fit in strength space ===
+         # We represent BT strengths as positive numbers s_m such that:
+         #     P(i beats j) = s_i / (s_i + s_j)
+         # and convert to Elo via:
+         #     Elo = init_rating + scale * log_base(s)
+         #
+         # This is equivalent to the existing implementation (which uses probabilities
+         # normalized to sum=1) but avoids global renormalization so fixed anchors
+         # can remain unchanged even when new models are introduced.
+         W: dict[str, dict[str, float]] = {m: {} for m in model_list}
+         for i in model_list:
+             for j in model_list:
+                 if i != j:
+                     W[i][j] = 0.0
+
+         # Adjacency for connectivity checks (n_ij > 0 indicates at least one battle)
+         adj: dict[str, set[str]] = {m: set() for m in model_list}
+
+         for model_a, model_b, winner in battles:
+             if not model_a or not model_b or model_a == model_b:
+                 continue
+             if model_a not in W or model_b not in W:
+                 continue
+             if winner == "model_a":
+                 W[model_a][model_b] += 2.0
+             elif winner == "model_b":
+                 W[model_b][model_a] += 2.0
+             else:
+                 W[model_a][model_b] += 1.0
+                 W[model_b][model_a] += 1.0
+             adj[model_a].add(model_b)
+             adj[model_b].add(model_a)
+
+         log_base = math.log(base)
+         # Initialize strengths
+         s: dict[str, float] = {}
+         for m in model_list:
+             if m in fixed_set:
+                 beta = (fixed_ratings[m] - init_rating) / scale
+                 s[m] = base ** beta
+             else:
+                 s[m] = 1.0
+
+         free = [m for m in model_list if m not in fixed_set]
+
+         for _ in range(max_iters):
+             max_diff = 0.0
+             for i in free:
+                 num = 0.0
+                 denom = 0.0
+                 for j in model_list:
+                     if i == j:
+                         continue
+                     w_ij = W[i][j]
+                     w_ji = W[j][i]
+                     n_ij = w_ij + w_ji
+                     if n_ij <= 0:
+                         continue
+                     num += w_ij
+                     denom += n_ij / (s[i] + s[j])
+
+                 s_new = (num / denom) if denom > 0 else s[i]
+                 max_diff = max(max_diff, abs(s_new - s[i]))
+                 s[i] = s_new
+
+             if max_diff <= tol:
+                 break
+
+         # If any free-model connected component has no path to fixed anchors,
+         # its absolute scale is unidentifiable. Normalize such components to mean(s)=1
+         # to keep their average Elo at init_rating, without affecting likelihood.
+         visited: set[str] = set()
+         fixed_neighbors = fixed_set
+
+         for m in free:
+             if m in visited:
+                 continue
+             stack = [m]
+             comp: list[str] = []
+             visited.add(m)
+             connected_to_fixed = False
+             while stack:
+                 x = stack.pop()
+                 comp.append(x)
+                 for y in adj.get(x, set()):
+                     if y in fixed_neighbors:
+                         connected_to_fixed = True
+                         continue
+                     if y in fixed_set:
+                         connected_to_fixed = True
+                         continue
+                     if y in free and y not in visited:
+                         visited.add(y)
+                         stack.append(y)
+             if not connected_to_fixed and comp:
+                 mean_s = sum(s[x] for x in comp) / len(comp)
+                 if mean_s > 0:
+                     for x in comp:
+                         s[x] = s[x] / mean_s
+
+         ratings: dict[str, float] = {}
+         for m in model_list:
+             if m in fixed_set:
+                 ratings[m] = float(fixed_ratings[m])
+             else:
+                 if s[m] > 0:
+                     ratings[m] = scale * (math.log(s[m]) / log_base) + init_rating
+                 else:
+                     ratings[m] = init_rating
+
+         return ratings
+
+     # Win matrix W where W[i][j] is "points" of i over j:
+     #   - win: +2 to winner over loser
+     #   - tie: +1 to each direction
+     W: dict[str, dict[str, float]] = {m: {} for m in model_list}
+     for i in model_list:
+         for j in model_list:
+             if i != j:
+                 W[i][j] = 0.0
+
+     for model_a, model_b, winner in battles:
+         if not model_a or not model_b or model_a == model_b:
+             continue
+         if winner == "model_a":
+             W[model_a][model_b] += 2.0
+         elif winner == "model_b":
+             W[model_b][model_a] += 2.0
+         else:
+             # Tie
+             W[model_a][model_b] += 1.0
+             W[model_b][model_a] += 1.0
+
+     # MM algorithm on probabilities p_i (sum to 1)
+     n = len(model_list)
+     p: dict[str, float] = {m: 1.0 / n for m in model_list}
+
+     for _ in range(max_iters):
+         p_new: dict[str, float] = {}
+         for i in model_list:
+             num = 0.0
+             denom = 0.0
+             for j in model_list:
+                 if i == j:
+                     continue
+                 w_ij = W[i][j]
+                 w_ji = W[j][i]
+                 n_ij = w_ij + w_ji
+                 if n_ij <= 0:
+                     continue
+                 num += w_ij
+                 denom += n_ij / (p[i] + p[j])
+
+             p_new[i] = (num / denom) if denom > 0 else p[i]
+
+         total = sum(p_new.values())
+         if total > 0:
+             for k in p_new:
+                 p_new[k] /= total
+
+         max_diff = max(abs(p_new[m] - p[m]) for m in model_list)
+         p = p_new
+         if max_diff <= tol:
+             break
+
+     mean_p = sum(p.values()) / n if n > 0 else 0.0
+     if mean_p <= 0:
+         return {m: init_rating for m in model_list}
+
+     log_base = math.log(base)
+     ratings: dict[str, float] = {}
+     for m in model_list:
+         if p[m] > 0:
+             beta = math.log(p[m] / mean_p) / log_base
+             ratings[m] = scale * beta + init_rating
+         else:
+             ratings[m] = init_rating
+
+     return ratings
+
+
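Usage sketch (illustrative only, not part of the packaged file): scoring a small battle log with the function above, using the win-2/tie-1 convention and the default 400/10/1000 Elo scale. Model names are placeholders.

    from genarena.bt_elo import compute_bt_elo_ratings

    battles = [
        ("model-x", "model-y", "model_a"),  # win: 2 points to model-x over model-y
        ("model-x", "model-y", "tie"),      # tie: 1 point in each direction
        ("model-y", "model-z", "model_b"),  # win: 2 points to model-z over model-y
    ]

    ratings = compute_bt_elo_ratings(battles)
    # Models whose fitted BT probability exceeds the mean land above 1000,
    # the rest below.

    # Anchored fit: model-x stays pinned at exactly 1200 and only the
    # remaining models are re-fit around it.
    anchored = compute_bt_elo_ratings(battles, fixed_ratings={"model-x": 1200.0})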
+ @dataclass
+ class BootstrapResult:
+     """Result of bootstrap ELO computation with confidence intervals.
+
+     Attributes:
+         ratings: Point estimates of ELO ratings (median of bootstrap samples).
+         ci_lower: Lower bound of 95% CI (2.5th percentile).
+         ci_upper: Upper bound of 95% CI (97.5th percentile).
+         ci_width: Width of 95% CI (ci_upper - ci_lower) for each model.
+         std: Standard deviation of bootstrap samples.
+         num_battles: Number of battles used for computation.
+         num_bootstrap: Number of bootstrap iterations performed.
+     """
+     ratings: dict[str, float]
+     ci_lower: dict[str, float]
+     ci_upper: dict[str, float]
+     ci_width: dict[str, float]
+     std: dict[str, float]
+     num_battles: int
+     num_bootstrap: int
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "ratings": self.ratings,
+             "ci_lower": self.ci_lower,
+             "ci_upper": self.ci_upper,
+             "ci_width": self.ci_width,
+             "std": self.std,
+             "num_battles": self.num_battles,
+             "num_bootstrap": self.num_bootstrap,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "BootstrapResult":
+         """Create from dictionary."""
+         return cls(
+             ratings=data.get("ratings", {}),
+             ci_lower=data.get("ci_lower", {}),
+             ci_upper=data.get("ci_upper", {}),
+             ci_width=data.get("ci_width", {}),
+             std=data.get("std", {}),
+             num_battles=data.get("num_battles", 0),
+             num_bootstrap=data.get("num_bootstrap", 0),
+         )
+
+     def get_model_ci_width(self, model: str) -> float:
+         """Get CI width for a specific model."""
+         return self.ci_width.get(model, float("inf"))
+
+     def get_max_ci_width(self) -> float:
+         """Get the maximum CI width across all models."""
+         if not self.ci_width:
+             return float("inf")
+         return max(self.ci_width.values())
+
+     def get_mean_ci_width(self) -> float:
+         """Get the mean CI width across all models."""
+         if not self.ci_width:
+             return float("inf")
+         return sum(self.ci_width.values()) / len(self.ci_width)
+
+
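A quick sketch (not part of the file itself) of how this result object round-trips through plain dicts; every number below is a made-up placeholder.

    from genarena.bt_elo import BootstrapResult

    snapshot = {
        "ratings": {"model-x": 1103.2, "model-y": 968.5},
        "ci_lower": {"model-x": 1079.0, "model-y": 941.7},
        "ci_upper": {"model-x": 1127.9, "model-y": 994.0},
        "ci_width": {"model-x": 48.9, "model-y": 52.3},
        "std": {"model-x": 12.4, "model-y": 13.1},
        "num_battles": 350,
        "num_bootstrap": 100,
    }
    restored = BootstrapResult.from_dict(snapshot)
    restored.get_max_ci_width()             # 52.3
    restored.get_mean_ci_width()            # 50.6
    restored.get_model_ci_width("model-q")  # inf for models without a CI entry
    assert BootstrapResult.from_dict(restored.to_dict()) == restored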
+ def compute_bootstrap_bt_elo(
+     battles: Sequence[BattleTuple],
+     *,
+     models: Optional[Iterable[str]] = None,
+     num_bootstrap: int = 100,
+     scale: float = SCALE,
+     base: float = BASE,
+     init_rating: float = INIT_RATING,
+     fixed_ratings: Optional[dict[str, float]] = None,
+     seed: Optional[int] = None,
+ ) -> BootstrapResult:
+     """Compute BT Elo ratings with 95% confidence intervals via bootstrap.
+
+     Uses multinomial resampling on unique (model_a, model_b, outcome) counts
+     for efficiency, following the FastChat approach.
+
+     Args:
+         battles: List of (model_a, model_b, winner) tuples.
+         models: Optional iterable of model names to include.
+         num_bootstrap: Number of bootstrap iterations (default 100).
+         scale: Elo scale factor (default 400).
+         base: Log base for BT parameterization (default 10).
+         init_rating: Elo offset / initial rating (default 1000).
+         fixed_ratings: Optional mapping of model name -> Elo for fixed anchors.
+         seed: Random seed for reproducibility.
+
+     Returns:
+         BootstrapResult with ratings and confidence intervals.
+     """
+     if seed is not None:
+         random.seed(seed)
+
+     battles_list = list(battles)
+     n_battles = len(battles_list)
+
+     if n_battles == 0:
+         return BootstrapResult(
+             ratings={},
+             ci_lower={},
+             ci_upper={},
+             ci_width={},
+             std={},
+             num_battles=0,
+             num_bootstrap=num_bootstrap,
+         )
+
+     # Get point estimate
+     point_ratings = compute_bt_elo_ratings(
+         battles_list,
+         models=models,
+         scale=scale,
+         base=base,
+         init_rating=init_rating,
+         fixed_ratings=fixed_ratings,
+     )
+
+     model_list = sorted(point_ratings.keys())
+     if not model_list:
+         return BootstrapResult(
+             ratings={},
+             ci_lower={},
+             ci_upper={},
+             ci_width={},
+             std={},
+             num_battles=n_battles,
+             num_bootstrap=num_bootstrap,
+         )
+
+     # Count unique battle outcomes for efficient multinomial resampling
+     battle_counts = Counter(battles_list)
+     unique_battles = list(battle_counts.keys())
+     counts = [battle_counts[b] for b in unique_battles]
+     total_count = sum(counts)
+
+     # Bootstrap iterations
+     bootstrap_ratings: list[dict[str, float]] = []
+
+     for _ in range(num_bootstrap):
+         # Multinomial resampling of counts
+         sampled_counts = _multinomial_sample(total_count, counts)
+
+         # Reconstruct battles from sampled counts
+         sampled_battles: list[BattleTuple] = []
+         for battle, count in zip(unique_battles, sampled_counts):
+             sampled_battles.extend([battle] * count)
+
+         # Compute ratings for this bootstrap sample
+         sample_ratings = compute_bt_elo_ratings(
+             sampled_battles,
+             models=models,
+             scale=scale,
+             base=base,
+             init_rating=init_rating,
+             fixed_ratings=fixed_ratings,
+         )
+         bootstrap_ratings.append(sample_ratings)
+
+     # Compute statistics
+     ratings_matrix: dict[str, list[float]] = {m: [] for m in model_list}
+     for br in bootstrap_ratings:
+         for m in model_list:
+             ratings_matrix[m].append(br.get(m, init_rating))
+
+     ci_lower: dict[str, float] = {}
+     ci_upper: dict[str, float] = {}
+     ci_width: dict[str, float] = {}
+     std: dict[str, float] = {}
+     median_ratings: dict[str, float] = {}
+
+     for m in model_list:
+         values = sorted(ratings_matrix[m])
+         n = len(values)
+
+         # Percentiles for 95% CI
+         lower_idx = int(n * 0.025)
+         upper_idx = int(n * 0.975)
+         median_idx = n // 2
+
+         ci_lower[m] = values[lower_idx] if n > 0 else init_rating
+         ci_upper[m] = values[min(upper_idx, n - 1)] if n > 0 else init_rating
+         ci_width[m] = ci_upper[m] - ci_lower[m]
+         median_ratings[m] = values[median_idx] if n > 0 else init_rating
+
+         # Standard deviation
+         if n > 1:
+             mean_val = sum(values) / n
+             variance = sum((v - mean_val) ** 2 for v in values) / (n - 1)
+             std[m] = math.sqrt(variance)
+         else:
+             std[m] = 0.0
+
+     return BootstrapResult(
+         ratings=median_ratings,
+         ci_lower=ci_lower,
+         ci_upper=ci_upper,
+         ci_width=ci_width,
+         std=std,
+         num_battles=n_battles,
+         num_bootstrap=num_bootstrap,
+     )
+
+
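Illustrative call into the bootstrap entry point above (again, not in the file itself). The battle log, threshold, and seed are placeholders; the seed goes through the module-level random state.

    from genarena.bt_elo import compute_bootstrap_bt_elo

    battles = (
        [("model-x", "model-y", "model_a")] * 30
        + [("model-x", "model-y", "model_b")] * 10
        + [("model-x", "model-y", "tie")] * 5
    )

    boot = compute_bootstrap_bt_elo(battles, num_bootstrap=200, seed=42)
    boot.ratings                  # median Elo per model across the 200 resamples
    boot.ci_lower, boot.ci_upper  # 2.5th / 97.5th percentile bounds
    if boot.get_max_ci_width() > 100.0:
        print("ratings still noisy; collect more battles")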
+ def _multinomial_sample(n: int, weights: list[int]) -> list[int]:
+     """Sample from multinomial distribution.
+
+     Args:
+         n: Total number of samples to draw.
+         weights: Unnormalized weights (counts) for each category.
+
+     Returns:
+         List of counts for each category summing to n.
+     """
+     total_weight = sum(weights)
+     if total_weight == 0:
+         return [0] * len(weights)
+
+     # Normalize to probabilities
+     probs = [w / total_weight for w in weights]
+
+     # Sample n items according to probabilities
+     result = [0] * len(weights)
+     for _ in range(n):
+         r = random.random()
+         cumsum = 0.0
+         for i, p in enumerate(probs):
+             cumsum += p
+             if r < cumsum:
+                 result[i] += 1
+                 break
+         else:
+             # Edge case: assign to last category
+             result[-1] += 1
+
+     return result
+
+
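The helper above walks the cumulative distribution once per draw, roughly O(n * k) work for k categories. For comparison only, the same multinomial draw can be sketched with the standard library's random.choices; this is not part of the package, just an illustration of the same contract.

    import random
    from collections import Counter

    def multinomial_sample_via_choices(n: int, weights: list[int]) -> list[int]:
        """Same contract as _multinomial_sample: counts per category summing to n."""
        if sum(weights) == 0:
            return [0] * len(weights)
        # Draw n category indices in proportion to the integer weights,
        # then tally how many times each index was drawn.
        draws = random.choices(range(len(weights)), weights=weights, k=n)
        counts = Counter(draws)
        return [counts.get(i, 0) for i in range(len(weights))]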