genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/state.py ADDED
@@ -0,0 +1,656 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """Arena state management module (Bradley-Terry Elo scoring)."""
+
+ import json
+ import os
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+ from genarena.utils import ensure_dir, iso_timestamp
+
+
+ # BT-to-Elo conversion constants (VideoAutoArena-style defaults)
+ # NOTE: This package intentionally uses *batch* Bradley-Terry scoring rather than
+ # online ELO with a K-factor, so scores are order-independent and reproducible.
+ SCALE = 400.0
+ BASE = 10.0
+ INIT_RATING = 1000.0
+
+ # Backward-compatible alias for existing state.json fields/defaults
+ DEFAULT_ELO = INIT_RATING
+
+ # Default number of bootstrap iterations for CI computation
+ DEFAULT_NUM_BOOTSTRAP = 100
+
+
+ @dataclass
+ class ModelStats:
+     """Statistics for a single model."""
+
+     elo: float = DEFAULT_ELO
+     wins: int = 0
+     losses: int = 0
+     ties: int = 0
+     ci_lower: Optional[float] = None
+     ci_upper: Optional[float] = None
+
+     @property
+     def total_battles(self) -> int:
+         """Total number of battles."""
+         return self.wins + self.losses + self.ties
+
+     @property
+     def win_rate(self) -> float:
+         """Win rate (wins / total, ties count as 0.5)."""
+         if self.total_battles == 0:
+             return 0.0
+         return (self.wins + 0.5 * self.ties) / self.total_battles
+
+     @property
+     def ci_width(self) -> Optional[float]:
+         """95% CI width (upper - lower), or None if CI not computed."""
+         if self.ci_lower is None or self.ci_upper is None:
+             return None
+         return self.ci_upper - self.ci_lower
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary."""
+         result = {
+             "elo": self.elo,
+             "wins": self.wins,
+             "losses": self.losses,
+             "ties": self.ties
+         }
+         if self.ci_lower is not None:
+             result["ci_lower"] = self.ci_lower
+         if self.ci_upper is not None:
+             result["ci_upper"] = self.ci_upper
+         return result
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "ModelStats":
+         """Create from dictionary."""
+         return cls(
+             elo=data.get("elo", DEFAULT_ELO),
+             wins=data.get("wins", 0),
+             losses=data.get("losses", 0),
+             ties=data.get("ties", 0),
+             ci_lower=data.get("ci_lower"),
+             ci_upper=data.get("ci_upper"),
+         )
+
+
+ @dataclass
+ class ArenaState:
+     """
+     Arena state containing ELO ratings and battle statistics.
+
+     Manages model ratings and provides methods for ELO updates.
+     """
+
+     # Model name -> ModelStats
+     models: dict[str, ModelStats] = field(default_factory=dict)
+
+     # Total battles processed
+     total_battles: int = 0
+
+     # Last update timestamp
+     last_updated: str = ""
+
+     def get_model_stats(self, model: str) -> ModelStats:
+         """
+         Get stats for a model, creating if necessary.
+
+         Args:
+             model: Model name
+
+         Returns:
+             ModelStats for the model
+         """
+         if model not in self.models:
+             self.models[model] = ModelStats()
+         return self.models[model]
+
+     def get_elo(self, model: str) -> float:
+         """Get ELO rating for a model."""
+         return self.get_model_stats(model).elo
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "models": {
+                 name: stats.to_dict()
+                 for name, stats in self.models.items()
+             },
+             "total_battles": self.total_battles,
+             "last_updated": self.last_updated
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "ArenaState":
+         """Create from dictionary."""
+         state = cls()
+
+         models_data = data.get("models", {})
+         for name, stats_data in models_data.items():
+             state.models[name] = ModelStats.from_dict(stats_data)
+
+         state.total_battles = data.get("total_battles", 0)
+         state.last_updated = data.get("last_updated", "")
+
+         return state
+
+
+ def update_stats(
+     state: ArenaState,
+     model_a: str,
+     model_b: str,
+     winner: str
+ ) -> ArenaState:
+     """
+     Update win/loss/tie statistics based on a battle result.
+
+     This does NOT update Elo ratings directly. Elo ratings are computed via
+     Bradley-Terry model fitting from accumulated battle records (see
+     `rebuild_state_from_logs`).
+
+     Args:
+         state: Current arena state
+         model_a: First model name
+         model_b: Second model name
+         winner: "model_a", "model_b", or "tie" (or the actual model name)
+
+     Returns:
+         Updated arena state
+     """
+     stats_a = state.get_model_stats(model_a)
+     stats_b = state.get_model_stats(model_b)
+
+     # Determine actual scores based on winner
+     winner_lower = winner.lower()
+
+     if winner_lower == model_a.lower() or winner_lower == "model_a":
+         # Model A wins
+         stats_a.wins += 1
+         stats_b.losses += 1
+     elif winner_lower == model_b.lower() or winner_lower == "model_b":
+         # Model B wins
+         stats_a.losses += 1
+         stats_b.wins += 1
+     else:
+         # Tie
+         stats_a.ties += 1
+         stats_b.ties += 1
+
+     # Update state metadata
+     state.total_battles += 1
+     state.last_updated = iso_timestamp()
+
+     return state
+
+
195
+ def load_state(path: str) -> ArenaState:
196
+ """
197
+ Load arena state from a JSON file.
198
+
199
+ Args:
200
+ path: Path to state.json file
201
+
202
+ Returns:
203
+ Loaded ArenaState, or empty state if file doesn't exist
204
+ """
205
+ if not os.path.isfile(path):
206
+ return ArenaState()
207
+
208
+ try:
209
+ with open(path, "r", encoding="utf-8") as f:
210
+ data = json.load(f)
211
+ return ArenaState.from_dict(data)
212
+ except (json.JSONDecodeError, IOError):
213
+ return ArenaState()
214
+
215
+
216
+ def save_state(state: ArenaState, path: str) -> None:
217
+ """
218
+ Save arena state to a JSON file.
219
+
220
+ Args:
221
+ state: ArenaState to save
222
+ path: Path to save to
223
+ """
224
+ # Ensure directory exists
225
+ ensure_dir(os.path.dirname(path))
226
+
227
+ # Update timestamp
228
+ state.last_updated = iso_timestamp()
229
+
230
+ with open(path, "w", encoding="utf-8") as f:
231
+ json.dump(state.to_dict(), f, indent=2, ensure_ascii=False)
232
+
233
+
+ def rebuild_state_from_logs(
+     pk_logs_dir: str,
+     models: Optional[list[str]] = None
+ ) -> ArenaState:
+     """
+     Rebuild arena state from battle log files.
+
+     Recomputes:
+     - W/L/T statistics
+     - Bradley-Terry Elo ratings (VideoAutoArena-style), order-independent
+
+     Args:
+         pk_logs_dir: Path to pk_logs directory
+         models: Optional list of models to include (includes all if None)
+
+     Returns:
+         Rebuilt ArenaState
+     """
+     from genarena.logs import load_battle_records
+     from genarena.bt_elo import compute_bt_elo_ratings, compute_bootstrap_bt_elo
+     from genarena.experiments import is_milestone_exp, parse_exp_date_suffix
+
+     state = ArenaState()
+
+     if not os.path.isdir(pk_logs_dir):
+         return state
+
+     # Discover experiment directories (enforce `_yyyymmdd` suffix)
+     exp_keys: list[tuple[tuple, str]] = []
+     # key is (date, name) for deterministic ordering
+     for name in os.listdir(pk_logs_dir):
+         if name.startswith("."):
+             continue
+         exp_dir = os.path.join(pk_logs_dir, name)
+         if not os.path.isdir(exp_dir):
+             continue
+         d = parse_exp_date_suffix(name)
+         if d is None:
+             raise ValueError(
+                 f"Invalid experiment directory under pk_logs: '{name}'. "
+                 f"Expected exp_name ending with `_yyyymmdd`."
+             )
+         exp_keys.append(((d, name), name))
+     exp_keys.sort(key=lambda x: x[0])
+
+     milestones = [name for (key, name) in exp_keys if is_milestone_exp(name)]
+
+     def _winner_side(model_a: str, model_b: str, winner: str) -> str:
+         """Normalize winner to 'model_a'/'model_b'/'tie' relative to (model_a, model_b)."""
+         w = str(winner).lower()
+         if w == "tie":
+             return "tie"
+         if w == str(model_a).lower():
+             return "model_a"
+         if w == str(model_b).lower():
+             return "model_b"
+         return "tie"
+
+     def _load_elo_snapshot(path: str) -> Optional[dict[str, float]]:
+         if not os.path.isfile(path):
+             return None
+         try:
+             with open(path, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+         except Exception:
+             return None
+
+         if not isinstance(data, dict):
+             return None
+
+         # Accept either: {"elo": {...}} or a direct {model: elo} mapping.
+         raw = data.get("elo") if isinstance(data.get("elo"), dict) else data
+         if not isinstance(raw, dict):
+             return None
+
+         out: dict[str, float] = {}
+         for k, v in raw.items():
+             try:
+                 out[str(k)] = float(v)
+             except Exception:
+                 continue
+         return out or None
+
+     def _save_elo_snapshot(
+         path: str,
+         *,
+         exp_name: str,
+         elo: dict[str, float],
+         model_count: int,
+         battle_count: int,
+         ci_lower: Optional[dict[str, float]] = None,
+         ci_upper: Optional[dict[str, float]] = None,
+         ci_width: Optional[dict[str, float]] = None,
+         std: Optional[dict[str, float]] = None,
+         num_bootstrap: Optional[int] = None,
+     ) -> None:
+         ensure_dir(os.path.dirname(path))
+         payload: dict[str, Any] = {
+             "exp_name": exp_name,
+             "generated_at": iso_timestamp(),
+             "params": {"scale": SCALE, "base": BASE, "init_rating": INIT_RATING},
+             "model_count": int(model_count),
+             "battle_count": int(battle_count),
+             "elo": {k: float(v) for k, v in sorted(elo.items(), key=lambda x: x[0])},
+         }
+
+         # Include CI information if available
+         if ci_lower is not None:
+             payload["ci_lower"] = {k: float(v) for k, v in sorted(ci_lower.items(), key=lambda x: x[0])}
+         if ci_upper is not None:
+             payload["ci_upper"] = {k: float(v) for k, v in sorted(ci_upper.items(), key=lambda x: x[0])}
+         if ci_width is not None:
+             payload["ci_width"] = {k: float(v) for k, v in sorted(ci_width.items(), key=lambda x: x[0])}
+         if std is not None:
+             payload["std"] = {k: float(v) for k, v in sorted(std.items(), key=lambda x: x[0])}
+         if num_bootstrap is not None:
+             payload["num_bootstrap"] = int(num_bootstrap)
+
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump(payload, f, indent=2, ensure_ascii=False)
+
+     def _save_exp_readme(
+         exp_dir: str,
+         exp_name: str,
+         elo: dict[str, float],
+         model_count: int,
+         battle_count: int,
+         ci_lower: Optional[dict[str, float]] = None,
+         ci_upper: Optional[dict[str, float]] = None,
+     ) -> None:
+         """Save README.md with cumulative leaderboard for an experiment directory."""
+         from genarena.leaderboard import generate_experiment_readme
+         readme_path = os.path.join(exp_dir, "README.md")
+         content = generate_experiment_readme(
+             exp_name=exp_name,
+             elo=elo,
+             model_count=model_count,
+             battle_count=battle_count,
+             ci_lower=ci_lower,
+             ci_upper=ci_upper,
+         )
+         with open(readme_path, "w", encoding="utf-8") as f:
+             f.write(content)
+
+     # If no milestones exist, fall back to the legacy full-fit behavior.
+     # Also generate README.md for experiments missing them.
+     if not milestones:
+         # Track battles per experiment for README generation
+         battles_cumulative: list[tuple[str, str, str]] = []
+         models_seen_cumulative: set[str] = set()
+
+         for (key, name) in exp_keys:
+             exp_records = load_battle_records(pk_logs_dir, exp_name=name)
+
+             for record in exp_records:
+                 model_a = record.get("model_a", "")
+                 model_b = record.get("model_b", "")
+                 winner = record.get("final_winner", "tie")
+
+                 if models:
+                     if model_a not in models or model_b not in models:
+                         continue
+
+                 if model_a and model_b:
+                     update_stats(state, model_a, model_b, winner)
+                     battles_cumulative.append((model_a, model_b, _winner_side(model_a, model_b, winner)))
+                     models_seen_cumulative.add(model_a)
+                     models_seen_cumulative.add(model_b)
+
+             # Check if elo_snapshot.json or README.md is missing for this experiment
+             exp_dir = os.path.join(pk_logs_dir, name)
+             snapshot_path = os.path.join(exp_dir, "elo_snapshot.json")
+             readme_path = os.path.join(exp_dir, "README.md")
+             expected_models = sorted(models_seen_cumulative)
+
+             if expected_models:
+                 existing_snapshot = _load_elo_snapshot(snapshot_path)
+                 need_snapshot = existing_snapshot is None or any(m not in existing_snapshot for m in expected_models)
+                 need_readme = not os.path.isfile(readme_path)
+
+                 if need_snapshot or need_readme:
+                     bootstrap_result = compute_bootstrap_bt_elo(
+                         battles_cumulative,
+                         models=expected_models,
+                         num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+                         scale=SCALE,
+                         base=BASE,
+                         init_rating=INIT_RATING,
+                     )
+                     if need_snapshot:
+                         _save_elo_snapshot(
+                             snapshot_path,
+                             exp_name=name,
+                             elo=bootstrap_result.ratings,
+                             model_count=len(expected_models),
+                             battle_count=len(battles_cumulative),
+                             ci_lower=bootstrap_result.ci_lower,
+                             ci_upper=bootstrap_result.ci_upper,
+                             ci_width=bootstrap_result.ci_width,
+                             std=bootstrap_result.std,
+                             num_bootstrap=bootstrap_result.num_bootstrap,
+                         )
+                     if need_readme:
+                         _save_exp_readme(
+                             exp_dir=exp_dir,
+                             exp_name=name,
+                             elo=bootstrap_result.ratings,
+                             model_count=len(expected_models),
+                             battle_count=len(battles_cumulative),
+                             ci_lower=bootstrap_result.ci_lower,
+                             ci_upper=bootstrap_result.ci_upper,
+                         )
+
+         include_models = models if models is not None else list(state.models.keys())
+
+         # Compute bootstrap CI for final ratings
+         bootstrap_result = compute_bootstrap_bt_elo(
+             battles_cumulative,
+             models=include_models,
+             num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+             scale=SCALE,
+             base=BASE,
+             init_rating=INIT_RATING,
+         )
+
+         for m in bootstrap_result.ratings:
+             stats = state.get_model_stats(m)
+             stats.elo = float(bootstrap_result.ratings[m])
+             stats.ci_lower = bootstrap_result.ci_lower.get(m)
+             stats.ci_upper = bootstrap_result.ci_upper.get(m)
+
+         state.last_updated = iso_timestamp()
+         return state
+
+     # === Milestone mode ===
+     # Ensure every milestone has an elo_snapshot.json (auto-generate if missing/incomplete),
+     # then use the latest milestone snapshot as fixed anchors to insert newer models.
+     milestone_set = set(milestones)
+     latest_milestone_name = milestones[-1]
+     latest_milestone_key = next(k for (k, name) in exp_keys if name == latest_milestone_name)
+
+     models_filter = set(models) if models else None
+
+     battles_all: list[tuple[str, str, str]] = []
+     battles_after_latest: list[tuple[str, str, str]] = []
+     models_seen_upto: set[str] = set()
+     models_seen_all: set[str] = set()
+
+     # Iterate experiments in order, accumulate battles, and generate snapshots at milestones.
+     for (key, name) in exp_keys:
+         exp_records = load_battle_records(pk_logs_dir, exp_name=name)
+
+         for record in exp_records:
+             model_a = record.get("model_a", "")
+             model_b = record.get("model_b", "")
+             winner = record.get("final_winner", "tie")
+
+             if models_filter is not None:
+                 if model_a not in models_filter or model_b not in models_filter:
+                     continue
+
+             if not model_a or not model_b:
+                 continue
+
+             update_stats(state, model_a, model_b, winner)
+
+             side = _winner_side(model_a, model_b, winner)
+             battles_all.append((model_a, model_b, side))
+             models_seen_all.add(model_a)
+             models_seen_all.add(model_b)
+
+             models_seen_upto.add(model_a)
+             models_seen_upto.add(model_b)
+
+             if key > latest_milestone_key:
+                 battles_after_latest.append((model_a, model_b, side))
+
+         if name in milestone_set:
+             snapshot_path = os.path.join(pk_logs_dir, name, "elo_snapshot.json")
+             expected_models = sorted(models_seen_upto)
+
+             # If there are no models yet, don't generate an empty snapshot.
+             if not expected_models:
+                 continue
+
+             existing = _load_elo_snapshot(snapshot_path)
+             if existing is None or any(m not in existing for m in expected_models):
+                 # Use bootstrap to compute ELO with CI for milestone snapshots
+                 bootstrap_result = compute_bootstrap_bt_elo(
+                     battles_all,
+                     models=expected_models,
+                     num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+                     scale=SCALE,
+                     base=BASE,
+                     init_rating=INIT_RATING,
+                 )
+                 _save_elo_snapshot(
+                     snapshot_path,
+                     exp_name=name,
+                     elo=bootstrap_result.ratings,
+                     model_count=len(expected_models),
+                     battle_count=len(battles_all),
+                     ci_lower=bootstrap_result.ci_lower,
+                     ci_upper=bootstrap_result.ci_upper,
+                     ci_width=bootstrap_result.ci_width,
+                     std=bootstrap_result.std,
+                     num_bootstrap=bootstrap_result.num_bootstrap,
+                 )
+                 # Also generate README.md for the milestone
+                 _save_exp_readme(
+                     exp_dir=os.path.join(pk_logs_dir, name),
+                     exp_name=name,
+                     elo=bootstrap_result.ratings,
+                     model_count=len(expected_models),
+                     battle_count=len(battles_all),
+                     ci_lower=bootstrap_result.ci_lower,
+                     ci_upper=bootstrap_result.ci_upper,
+                 )
+             else:
+                 # Snapshot exists, but check if README.md is missing
+                 readme_path = os.path.join(pk_logs_dir, name, "README.md")
+                 if not os.path.isfile(readme_path):
+                     # Load CI info from snapshot if available
+                     snapshot_ci_lower: Optional[dict[str, float]] = None
+                     snapshot_ci_upper: Optional[dict[str, float]] = None
+                     try:
+                         with open(snapshot_path, "r", encoding="utf-8") as f:
+                             snapshot_data = json.load(f)
+                         snapshot_ci_lower = snapshot_data.get("ci_lower")
+                         snapshot_ci_upper = snapshot_data.get("ci_upper")
+                     except Exception:
+                         pass
+                     _save_exp_readme(
+                         exp_dir=os.path.join(pk_logs_dir, name),
+                         exp_name=name,
+                         elo=existing,
+                         model_count=len(expected_models),
+                         battle_count=len(battles_all),
+                         ci_lower=snapshot_ci_lower,
+                         ci_upper=snapshot_ci_upper,
+                     )
+         else:
+             # Non-milestone experiment: check if elo_snapshot.json or README.md is missing
+             exp_dir = os.path.join(pk_logs_dir, name)
+             snapshot_path = os.path.join(exp_dir, "elo_snapshot.json")
+             readme_path = os.path.join(exp_dir, "README.md")
+             expected_models = sorted(models_seen_upto)
+
+             if expected_models:
+                 existing_snapshot = _load_elo_snapshot(snapshot_path)
+                 need_snapshot = existing_snapshot is None or any(m not in existing_snapshot for m in expected_models)
+                 need_readme = not os.path.isfile(readme_path)
+
+                 if need_snapshot or need_readme:
+                     bootstrap_result = compute_bootstrap_bt_elo(
+                         battles_all,
+                         models=expected_models,
+                         num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+                         scale=SCALE,
+                         base=BASE,
+                         init_rating=INIT_RATING,
+                     )
+                     if need_snapshot:
+                         _save_elo_snapshot(
+                             snapshot_path,
+                             exp_name=name,
+                             elo=bootstrap_result.ratings,
+                             model_count=len(expected_models),
+                             battle_count=len(battles_all),
+                             ci_lower=bootstrap_result.ci_lower,
+                             ci_upper=bootstrap_result.ci_upper,
+                             ci_width=bootstrap_result.ci_width,
+                             std=bootstrap_result.std,
+                             num_bootstrap=bootstrap_result.num_bootstrap,
+                         )
+                     if need_readme:
+                         _save_exp_readme(
+                             exp_dir=exp_dir,
+                             exp_name=name,
+                             elo=bootstrap_result.ratings,
+                             model_count=len(expected_models),
+                             battle_count=len(battles_all),
+                             ci_lower=bootstrap_result.ci_lower,
+                             ci_upper=bootstrap_result.ci_upper,
+                         )
+
+     # Load anchors from the latest milestone snapshot (it should exist now, if milestone had any models).
+     latest_snapshot_path = os.path.join(pk_logs_dir, latest_milestone_name, "elo_snapshot.json")
+     anchor_elo = _load_elo_snapshot(latest_snapshot_path) or {}
+
+     include_models = list(models) if models is not None else sorted(models_seen_all)
+     anchor_elo = {m: float(v) for m, v in anchor_elo.items() if m in set(include_models)}
+
+     # Final ratings: anchored insertion from latest milestone snapshot.
+     # Compute bootstrap CI for final ratings
+     if anchor_elo:
+         bootstrap_result = compute_bootstrap_bt_elo(
+             battles_after_latest,
+             models=include_models,
+             fixed_ratings=anchor_elo,
+             num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+             scale=SCALE,
+             base=BASE,
+             init_rating=INIT_RATING,
+         )
+     else:
+         bootstrap_result = compute_bootstrap_bt_elo(
+             battles_all,
+             models=include_models,
+             num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
+             scale=SCALE,
+             base=BASE,
+             init_rating=INIT_RATING,
+         )
+
+     for m in bootstrap_result.ratings:
+         stats = state.get_model_stats(m)
+         stats.elo = float(bootstrap_result.ratings[m])
+         stats.ci_lower = bootstrap_result.ci_lower.get(m)
+         stats.ci_upper = bootstrap_result.ci_upper.get(m)
+
+     state.last_updated = iso_timestamp()
+     return state
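
For orientation, here is a minimal usage sketch of the public helpers this added module exposes (`ArenaState`, `update_stats`, `save_state`, `load_state`, as shown in the diff above); the model names and the state-file path are hypothetical, and this is not part of the package itself:

```python
from genarena.state import ArenaState, load_state, save_state, update_stats

# Hypothetical battle outcomes: (model_a, model_b, winner)
battles = [
    ("model-x", "model-y", "model_a"),  # model-x wins
    ("model-x", "model-z", "tie"),
    ("model-y", "model-z", "model_b"),  # model-z wins
]

state = ArenaState()
for model_a, model_b, winner in battles:
    # Only the W/L/T counters change here; Elo-like ratings are fitted
    # separately from the accumulated logs (see rebuild_state_from_logs).
    update_stats(state, model_a, model_b, winner)

print(state.get_model_stats("model-x").win_rate)  # 0.75 = (1 win + 0.5 tie) / 2
save_state(state, "arena/state.json")             # hypothetical path
restored = load_state("arena/state.json")
```

As the module's NOTE comment states, per-battle updates only touch the win/loss/tie counters; the ratings themselves come from a batch Bradley-Terry fit over the full battle log (with bootstrap confidence intervals), so they do not depend on the order in which battles were played.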