genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +25 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2335 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.0.dist-info/METADATA +178 -0
- genarena-0.1.0.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
- genarena-0.1.0.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/state.py
ADDED
@@ -0,0 +1,656 @@

# Copyright 2026 Ruihang Li.
# Licensed under the Apache License, Version 2.0.
# See LICENSE file in the project root for details.

"""Arena state management module (Bradley-Terry Elo scoring)."""

import json
import os
from dataclasses import dataclass, field
from typing import Any, Optional

from genarena.utils import ensure_dir, iso_timestamp


# BT-to-Elo conversion constants (VideoAutoArena-style defaults)
# NOTE: This package intentionally uses *batch* Bradley-Terry scoring rather than
# online ELO with a K-factor, so scores are order-independent and reproducible.
SCALE = 400.0
BASE = 10.0
INIT_RATING = 1000.0

# Backward-compatible alias for existing state.json fields/defaults
DEFAULT_ELO = INIT_RATING

# Default number of bootstrap iterations for CI computation
DEFAULT_NUM_BOOTSTRAP = 100
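These defaults pin the fitted Bradley-Terry strengths to the familiar Elo scale: with `SCALE = 400.0` and `BASE = 10.0`, a 400-point rating gap corresponds to 10:1 expected odds. A minimal sketch of the win-probability curve these constants imply (the actual fitting lives in `genarena/bt_elo.py`, added elsewhere in this release; `win_probability` is a hypothetical name, not part of the package):

```python
def win_probability(r_a: float, r_b: float,
                    scale: float = 400.0, base: float = 10.0) -> float:
    """Expected P(model rated r_a beats model rated r_b) on this Elo scale."""
    return 1.0 / (1.0 + base ** ((r_b - r_a) / scale))

assert abs(win_probability(1000.0, 1000.0) - 0.5) < 1e-12          # even odds
assert abs(win_probability(1400.0, 1000.0) - 10.0 / 11.0) < 1e-12  # 10:1 odds
```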
@dataclass
class ModelStats:
    """Statistics for a single model."""

    elo: float = DEFAULT_ELO
    wins: int = 0
    losses: int = 0
    ties: int = 0
    ci_lower: Optional[float] = None
    ci_upper: Optional[float] = None

    @property
    def total_battles(self) -> int:
        """Total number of battles."""
        return self.wins + self.losses + self.ties

    @property
    def win_rate(self) -> float:
        """Win rate (wins / total, ties count as 0.5)."""
        if self.total_battles == 0:
            return 0.0
        return (self.wins + 0.5 * self.ties) / self.total_battles

    @property
    def ci_width(self) -> Optional[float]:
        """95% CI width (upper - lower), or None if CI not computed."""
        if self.ci_lower is None or self.ci_upper is None:
            return None
        return self.ci_upper - self.ci_lower

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        result = {
            "elo": self.elo,
            "wins": self.wins,
            "losses": self.losses,
            "ties": self.ties
        }
        if self.ci_lower is not None:
            result["ci_lower"] = self.ci_lower
        if self.ci_upper is not None:
            result["ci_upper"] = self.ci_upper
        return result

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ModelStats":
        """Create from dictionary."""
        return cls(
            elo=data.get("elo", DEFAULT_ELO),
            wins=data.get("wins", 0),
            losses=data.get("losses", 0),
            ties=data.get("ties", 0),
            ci_lower=data.get("ci_lower"),
            ci_upper=data.get("ci_upper"),
        )
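A round-trip through the serializers above, with illustrative numbers (`ModelStats` is a plain dataclass, so `==` compares field-wise):

```python
stats = ModelStats(elo=1032.5, wins=12, losses=8, ties=4)
assert stats.total_battles == 24
assert stats.win_rate == (12 + 0.5 * 4) / 24  # ties count as half a win
assert stats.ci_width is None                 # no bootstrap CI recorded yet
assert ModelStats.from_dict(stats.to_dict()) == stats
```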
@dataclass
class ArenaState:
    """
    Arena state containing ELO ratings and battle statistics.

    Manages model ratings and provides methods for ELO updates.
    """

    # Model name -> ModelStats
    models: dict[str, ModelStats] = field(default_factory=dict)

    # Total battles processed
    total_battles: int = 0

    # Last update timestamp
    last_updated: str = ""

    def get_model_stats(self, model: str) -> ModelStats:
        """
        Get stats for a model, creating if necessary.

        Args:
            model: Model name

        Returns:
            ModelStats for the model
        """
        if model not in self.models:
            self.models[model] = ModelStats()
        return self.models[model]

    def get_elo(self, model: str) -> float:
        """Get ELO rating for a model."""
        return self.get_model_stats(model).elo

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "models": {
                name: stats.to_dict()
                for name, stats in self.models.items()
            },
            "total_battles": self.total_battles,
            "last_updated": self.last_updated
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ArenaState":
        """Create from dictionary."""
        state = cls()

        models_data = data.get("models", {})
        for name, stats_data in models_data.items():
            state.models[name] = ModelStats.from_dict(stats_data)

        state.total_battles = data.get("total_battles", 0)
        state.last_updated = data.get("last_updated", "")

        return state
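The `state.json` layout follows directly from the two `to_dict` methods. A hypothetical, truncated document and its deserialization (opponent entries omitted for brevity):

```python
example = {
    "models": {
        "model-x": {"elo": 1032.5, "wins": 12, "losses": 8, "ties": 4},
    },
    "total_battles": 24,
    "last_updated": "2026-01-01T00:00:00+00:00",
}
state = ArenaState.from_dict(example)
assert state.get_elo("model-x") == 1032.5
assert state.get_elo("unseen-model") == DEFAULT_ELO  # created on first access
```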
def update_stats(
    state: ArenaState,
    model_a: str,
    model_b: str,
    winner: str
) -> ArenaState:
    """
    Update win/loss/tie statistics based on a battle result.

    This does NOT update Elo ratings directly. Elo ratings are computed via
    Bradley-Terry model fitting from accumulated battle records (see
    `rebuild_state_from_logs`).

    Args:
        state: Current arena state
        model_a: First model name
        model_b: Second model name
        winner: "model_a", "model_b", or "tie" (or the actual model name)

    Returns:
        Updated arena state
    """
    stats_a = state.get_model_stats(model_a)
    stats_b = state.get_model_stats(model_b)

    # Determine actual scores based on winner
    winner_lower = winner.lower()

    if winner_lower == model_a.lower() or winner_lower == "model_a":
        # Model A wins
        stats_a.wins += 1
        stats_b.losses += 1
    elif winner_lower == model_b.lower() or winner_lower == "model_b":
        # Model B wins
        stats_a.losses += 1
        stats_b.wins += 1
    else:
        # Tie
        stats_a.ties += 1
        stats_b.ties += 1

    # Update state metadata
    state.total_battles += 1
    state.last_updated = iso_timestamp()

    return state
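`update_stats` accepts either a side label or a literal model name for `winner`, and anything unrecognized falls through to a tie. It only touches the W/L/T counters; ratings stay at their current values until the next Bradley-Terry refit. For example:

```python
state = ArenaState()
update_stats(state, "model-x", "model-y", "model_a")  # side label: x wins
update_stats(state, "model-x", "model-y", "model-y")  # model name: y wins
update_stats(state, "model-x", "model-y", "tie")

assert state.models["model-x"].wins == state.models["model-y"].wins == 1
assert state.models["model-x"].ties == state.models["model-y"].ties == 1
assert state.total_battles == 3
assert state.get_elo("model-x") == DEFAULT_ELO  # rating untouched
```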
def load_state(path: str) -> ArenaState:
    """
    Load arena state from a JSON file.

    Args:
        path: Path to state.json file

    Returns:
        Loaded ArenaState, or empty state if file doesn't exist
    """
    if not os.path.isfile(path):
        return ArenaState()

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return ArenaState.from_dict(data)
    except (json.JSONDecodeError, IOError):
        return ArenaState()


def save_state(state: ArenaState, path: str) -> None:
    """
    Save arena state to a JSON file.

    Args:
        state: ArenaState to save
        path: Path to save to
    """
    # Ensure directory exists
    ensure_dir(os.path.dirname(path))

    # Update timestamp
    state.last_updated = iso_timestamp()

    with open(path, "w", encoding="utf-8") as f:
        json.dump(state.to_dict(), f, indent=2, ensure_ascii=False)
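Both helpers are deliberately forgiving: `load_state` treats a missing or unreadable file as an empty arena instead of raising. A round-trip sketch with a hypothetical path:

```python
save_state(state, "arena/state.json")     # stamps last_updated, writes pretty JSON
reloaded = load_state("arena/state.json")
assert reloaded.total_battles == state.total_battles

fresh = load_state("no/such/state.json")  # missing file -> fresh empty state
assert fresh.models == {} and fresh.total_battles == 0
```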
def rebuild_state_from_logs(
    pk_logs_dir: str,
    models: Optional[list[str]] = None
) -> ArenaState:
    """
    Rebuild arena state from battle log files.

    Recomputes:
    - W/L/T statistics
    - Bradley-Terry Elo ratings (VideoAutoArena-style), order-independent

    Args:
        pk_logs_dir: Path to pk_logs directory
        models: Optional list of models to include (includes all if None)

    Returns:
        Rebuilt ArenaState
    """
    from genarena.logs import load_battle_records
    from genarena.bt_elo import compute_bt_elo_ratings, compute_bootstrap_bt_elo
    from genarena.experiments import is_milestone_exp, parse_exp_date_suffix

    state = ArenaState()

    if not os.path.isdir(pk_logs_dir):
        return state

    # Discover experiment directories (enforce `_yyyymmdd` suffix)
    exp_keys: list[tuple[tuple, str]] = []
    # key is (date, name) for deterministic ordering
    for name in os.listdir(pk_logs_dir):
        if name.startswith("."):
            continue
        exp_dir = os.path.join(pk_logs_dir, name)
        if not os.path.isdir(exp_dir):
            continue
        d = parse_exp_date_suffix(name)
        if d is None:
            raise ValueError(
                f"Invalid experiment directory under pk_logs: '{name}'. "
                f"Expected exp_name ending with `_yyyymmdd`."
            )
        exp_keys.append(((d, name), name))
    exp_keys.sort(key=lambda x: x[0])

    milestones = [name for (key, name) in exp_keys if is_milestone_exp(name)]

    def _winner_side(model_a: str, model_b: str, winner: str) -> str:
        """Normalize winner to 'model_a'/'model_b'/'tie' relative to (model_a, model_b)."""
        w = str(winner).lower()
        if w == "tie":
            return "tie"
        if w == str(model_a).lower():
            return "model_a"
        if w == str(model_b).lower():
            return "model_b"
        return "tie"

    def _load_elo_snapshot(path: str) -> Optional[dict[str, float]]:
        if not os.path.isfile(path):
            return None
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            return None

        if not isinstance(data, dict):
            return None

        # Accept either: {"elo": {...}} or a direct {model: elo} mapping.
        raw = data.get("elo") if isinstance(data.get("elo"), dict) else data
        if not isinstance(raw, dict):
            return None

        out: dict[str, float] = {}
        for k, v in raw.items():
            try:
                out[str(k)] = float(v)
            except Exception:
                continue
        return out or None

    def _save_elo_snapshot(
        path: str,
        *,
        exp_name: str,
        elo: dict[str, float],
        model_count: int,
        battle_count: int,
        ci_lower: Optional[dict[str, float]] = None,
        ci_upper: Optional[dict[str, float]] = None,
        ci_width: Optional[dict[str, float]] = None,
        std: Optional[dict[str, float]] = None,
        num_bootstrap: Optional[int] = None,
    ) -> None:
        ensure_dir(os.path.dirname(path))
        payload: dict[str, Any] = {
            "exp_name": exp_name,
            "generated_at": iso_timestamp(),
            "params": {"scale": SCALE, "base": BASE, "init_rating": INIT_RATING},
            "model_count": int(model_count),
            "battle_count": int(battle_count),
            "elo": {k: float(v) for k, v in sorted(elo.items(), key=lambda x: x[0])},
        }

        # Include CI information if available
        if ci_lower is not None:
            payload["ci_lower"] = {k: float(v) for k, v in sorted(ci_lower.items(), key=lambda x: x[0])}
        if ci_upper is not None:
            payload["ci_upper"] = {k: float(v) for k, v in sorted(ci_upper.items(), key=lambda x: x[0])}
        if ci_width is not None:
            payload["ci_width"] = {k: float(v) for k, v in sorted(ci_width.items(), key=lambda x: x[0])}
        if std is not None:
            payload["std"] = {k: float(v) for k, v in sorted(std.items(), key=lambda x: x[0])}
        if num_bootstrap is not None:
            payload["num_bootstrap"] = int(num_bootstrap)

        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    def _save_exp_readme(
        exp_dir: str,
        exp_name: str,
        elo: dict[str, float],
        model_count: int,
        battle_count: int,
        ci_lower: Optional[dict[str, float]] = None,
        ci_upper: Optional[dict[str, float]] = None,
    ) -> None:
        """Save README.md with cumulative leaderboard for an experiment directory."""
        from genarena.leaderboard import generate_experiment_readme
        readme_path = os.path.join(exp_dir, "README.md")
        content = generate_experiment_readme(
            exp_name=exp_name,
            elo=elo,
            model_count=model_count,
            battle_count=battle_count,
            ci_lower=ci_lower,
            ci_upper=ci_upper,
        )
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(content)
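For reference, an `elo_snapshot.json` as `_save_elo_snapshot` writes it, with illustrative values; the CI/std blocks appear only when bootstrap results are supplied, and `_load_elo_snapshot` also tolerates a bare `{model: elo}` mapping. The body of `rebuild_state_from_logs` continues below.

```python
snapshot = {
    "exp_name": "exp_demo_20260101",  # hypothetical experiment name
    "generated_at": "2026-01-01T00:00:00+00:00",
    "params": {"scale": 400.0, "base": 10.0, "init_rating": 1000.0},
    "model_count": 2,
    "battle_count": 120,
    "elo": {"model-x": 1012.3, "model-y": 987.7},
    "ci_lower": {"model-x": 995.1, "model-y": 970.4},
    "ci_upper": {"model-x": 1029.8, "model-y": 1004.9},
    "num_bootstrap": 100,
}
```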
    # If no milestones exist, fall back to the legacy full-fit behavior.
    # Also generate README.md for experiments missing them.
    if not milestones:
        # Track battles per experiment for README generation
        battles_cumulative: list[tuple[str, str, str]] = []
        models_seen_cumulative: set[str] = set()

        for (key, name) in exp_keys:
            exp_records = load_battle_records(pk_logs_dir, exp_name=name)

            for record in exp_records:
                model_a = record.get("model_a", "")
                model_b = record.get("model_b", "")
                winner = record.get("final_winner", "tie")

                if models:
                    if model_a not in models or model_b not in models:
                        continue

                if model_a and model_b:
                    update_stats(state, model_a, model_b, winner)
                    battles_cumulative.append((model_a, model_b, _winner_side(model_a, model_b, winner)))
                    models_seen_cumulative.add(model_a)
                    models_seen_cumulative.add(model_b)

            # Check if elo_snapshot.json or README.md is missing for this experiment
            exp_dir = os.path.join(pk_logs_dir, name)
            snapshot_path = os.path.join(exp_dir, "elo_snapshot.json")
            readme_path = os.path.join(exp_dir, "README.md")
            expected_models = sorted(models_seen_cumulative)

            if expected_models:
                existing_snapshot = _load_elo_snapshot(snapshot_path)
                need_snapshot = existing_snapshot is None or any(m not in existing_snapshot for m in expected_models)
                need_readme = not os.path.isfile(readme_path)

                if need_snapshot or need_readme:
                    bootstrap_result = compute_bootstrap_bt_elo(
                        battles_cumulative,
                        models=expected_models,
                        num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
                        scale=SCALE,
                        base=BASE,
                        init_rating=INIT_RATING,
                    )
                    if need_snapshot:
                        _save_elo_snapshot(
                            snapshot_path,
                            exp_name=name,
                            elo=bootstrap_result.ratings,
                            model_count=len(expected_models),
                            battle_count=len(battles_cumulative),
                            ci_lower=bootstrap_result.ci_lower,
                            ci_upper=bootstrap_result.ci_upper,
                            ci_width=bootstrap_result.ci_width,
                            std=bootstrap_result.std,
                            num_bootstrap=bootstrap_result.num_bootstrap,
                        )
                    if need_readme:
                        _save_exp_readme(
                            exp_dir=exp_dir,
                            exp_name=name,
                            elo=bootstrap_result.ratings,
                            model_count=len(expected_models),
                            battle_count=len(battles_cumulative),
                            ci_lower=bootstrap_result.ci_lower,
                            ci_upper=bootstrap_result.ci_upper,
                        )

        include_models = models if models is not None else list(state.models.keys())

        # Compute bootstrap CI for final ratings
        bootstrap_result = compute_bootstrap_bt_elo(
            battles_cumulative,
            models=include_models,
            num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
            scale=SCALE,
            base=BASE,
            init_rating=INIT_RATING,
        )

        for m in bootstrap_result.ratings:
            stats = state.get_model_stats(m)
            stats.elo = float(bootstrap_result.ratings[m])
            stats.ci_lower = bootstrap_result.ci_lower.get(m)
            stats.ci_upper = bootstrap_result.ci_upper.get(m)

        state.last_updated = iso_timestamp()
        return state
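`compute_bootstrap_bt_elo` is defined in `genarena/bt_elo.py` (added elsewhere in this release); conceptually it is a percentile bootstrap over the battle list. A minimal sketch of that idea, assuming a hypothetical `fit_fn` that maps battles to `{model: rating}`; the milestone branch of the function follows.

```python
import random

def bootstrap_ci(battles, fit_fn, num_bootstrap=100, alpha=0.05):
    """Resample battles with replacement, refit, take percentile bounds."""
    samples = [fit_fn(random.choices(battles, k=len(battles)))
               for _ in range(num_bootstrap)]
    ci_lower, ci_upper = {}, {}
    for m in samples[0]:  # assumes fit_fn rates every model in each sample
        vals = sorted(s[m] for s in samples)
        ci_lower[m] = vals[int(num_bootstrap * alpha / 2)]
        ci_upper[m] = vals[int(num_bootstrap * (1 - alpha / 2)) - 1]
    return ci_lower, ci_upper
```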
    # === Milestone mode ===
    # Ensure every milestone has an elo_snapshot.json (auto-generate if missing/incomplete),
    # then use the latest milestone snapshot as fixed anchors to insert newer models.
    milestone_set = set(milestones)
    latest_milestone_name = milestones[-1]
    latest_milestone_key = next(k for (k, name) in exp_keys if name == latest_milestone_name)

    models_filter = set(models) if models else None

    battles_all: list[tuple[str, str, str]] = []
    battles_after_latest: list[tuple[str, str, str]] = []
    models_seen_upto: set[str] = set()
    models_seen_all: set[str] = set()

    # Iterate experiments in order, accumulate battles, and generate snapshots at milestones.
    for (key, name) in exp_keys:
        exp_records = load_battle_records(pk_logs_dir, exp_name=name)

        for record in exp_records:
            model_a = record.get("model_a", "")
            model_b = record.get("model_b", "")
            winner = record.get("final_winner", "tie")

            if models_filter is not None:
                if model_a not in models_filter or model_b not in models_filter:
                    continue

            if not model_a or not model_b:
                continue

            update_stats(state, model_a, model_b, winner)

            side = _winner_side(model_a, model_b, winner)
            battles_all.append((model_a, model_b, side))
            models_seen_all.add(model_a)
            models_seen_all.add(model_b)

            models_seen_upto.add(model_a)
            models_seen_upto.add(model_b)

            if key > latest_milestone_key:
                battles_after_latest.append((model_a, model_b, side))

        if name in milestone_set:
            snapshot_path = os.path.join(pk_logs_dir, name, "elo_snapshot.json")
            expected_models = sorted(models_seen_upto)

            # If there are no models yet, don't generate an empty snapshot.
            if not expected_models:
                continue

            existing = _load_elo_snapshot(snapshot_path)
            if existing is None or any(m not in existing for m in expected_models):
                # Use bootstrap to compute ELO with CI for milestone snapshots
                bootstrap_result = compute_bootstrap_bt_elo(
                    battles_all,
                    models=expected_models,
                    num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
                    scale=SCALE,
                    base=BASE,
                    init_rating=INIT_RATING,
                )
                _save_elo_snapshot(
                    snapshot_path,
                    exp_name=name,
                    elo=bootstrap_result.ratings,
                    model_count=len(expected_models),
                    battle_count=len(battles_all),
                    ci_lower=bootstrap_result.ci_lower,
                    ci_upper=bootstrap_result.ci_upper,
                    ci_width=bootstrap_result.ci_width,
                    std=bootstrap_result.std,
                    num_bootstrap=bootstrap_result.num_bootstrap,
                )
                # Also generate README.md for the milestone
                _save_exp_readme(
                    exp_dir=os.path.join(pk_logs_dir, name),
                    exp_name=name,
                    elo=bootstrap_result.ratings,
                    model_count=len(expected_models),
                    battle_count=len(battles_all),
                    ci_lower=bootstrap_result.ci_lower,
                    ci_upper=bootstrap_result.ci_upper,
                )
            else:
                # Snapshot exists, but check if README.md is missing
                readme_path = os.path.join(pk_logs_dir, name, "README.md")
                if not os.path.isfile(readme_path):
                    # Load CI info from snapshot if available
                    snapshot_ci_lower: Optional[dict[str, float]] = None
                    snapshot_ci_upper: Optional[dict[str, float]] = None
                    try:
                        with open(snapshot_path, "r", encoding="utf-8") as f:
                            snapshot_data = json.load(f)
                        snapshot_ci_lower = snapshot_data.get("ci_lower")
                        snapshot_ci_upper = snapshot_data.get("ci_upper")
                    except Exception:
                        pass
                    _save_exp_readme(
                        exp_dir=os.path.join(pk_logs_dir, name),
                        exp_name=name,
                        elo=existing,
                        model_count=len(expected_models),
                        battle_count=len(battles_all),
                        ci_lower=snapshot_ci_lower,
                        ci_upper=snapshot_ci_upper,
                    )
        else:
            # Non-milestone experiment: check if elo_snapshot.json or README.md is missing
            exp_dir = os.path.join(pk_logs_dir, name)
            snapshot_path = os.path.join(exp_dir, "elo_snapshot.json")
            readme_path = os.path.join(exp_dir, "README.md")
            expected_models = sorted(models_seen_upto)

            if expected_models:
                existing_snapshot = _load_elo_snapshot(snapshot_path)
                need_snapshot = existing_snapshot is None or any(m not in existing_snapshot for m in expected_models)
                need_readme = not os.path.isfile(readme_path)

                if need_snapshot or need_readme:
                    bootstrap_result = compute_bootstrap_bt_elo(
                        battles_all,
                        models=expected_models,
                        num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
                        scale=SCALE,
                        base=BASE,
                        init_rating=INIT_RATING,
                    )
                    if need_snapshot:
                        _save_elo_snapshot(
                            snapshot_path,
                            exp_name=name,
                            elo=bootstrap_result.ratings,
                            model_count=len(expected_models),
                            battle_count=len(battles_all),
                            ci_lower=bootstrap_result.ci_lower,
                            ci_upper=bootstrap_result.ci_upper,
                            ci_width=bootstrap_result.ci_width,
                            std=bootstrap_result.std,
                            num_bootstrap=bootstrap_result.num_bootstrap,
                        )
                    if need_readme:
                        _save_exp_readme(
                            exp_dir=exp_dir,
                            exp_name=name,
                            elo=bootstrap_result.ratings,
                            model_count=len(expected_models),
                            battle_count=len(battles_all),
                            ci_lower=bootstrap_result.ci_lower,
                            ci_upper=bootstrap_result.ci_upper,
                        )

    # Load anchors from the latest milestone snapshot (it should exist now, if milestone had any models).
    latest_snapshot_path = os.path.join(pk_logs_dir, latest_milestone_name, "elo_snapshot.json")
    anchor_elo = _load_elo_snapshot(latest_snapshot_path) or {}

    include_models = list(models) if models is not None else sorted(models_seen_all)
    anchor_elo = {m: float(v) for m, v in anchor_elo.items() if m in set(include_models)}

    # Final ratings: anchored insertion from latest milestone snapshot.
    # Compute bootstrap CI for final ratings
    if anchor_elo:
        bootstrap_result = compute_bootstrap_bt_elo(
            battles_after_latest,
            models=include_models,
            fixed_ratings=anchor_elo,
            num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
            scale=SCALE,
            base=BASE,
            init_rating=INIT_RATING,
        )
    else:
        bootstrap_result = compute_bootstrap_bt_elo(
            battles_all,
            models=include_models,
            num_bootstrap=DEFAULT_NUM_BOOTSTRAP,
            scale=SCALE,
            base=BASE,
            init_rating=INIT_RATING,
        )

    for m in bootstrap_result.ratings:
        stats = state.get_model_stats(m)
        stats.elo = float(bootstrap_result.ratings[m])
        stats.ci_lower = bootstrap_result.ci_lower.get(m)
        stats.ci_upper = bootstrap_result.ci_upper.get(m)

    state.last_updated = iso_timestamp()
    return state
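End to end, a rebuild-and-persist flow looks like the following (paths are illustrative; every directory under `pk_logs` must end in `_yyyymmdd` or discovery raises `ValueError`):

```python
state = rebuild_state_from_logs("arena/pk_logs")
for name, stats in sorted(state.models.items(), key=lambda kv: -kv[1].elo):
    print(f"{name:30s} elo={stats.elo:7.1f} battles={stats.total_battles}")
save_state(state, "arena/state.json")
```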