genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +22 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2430 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.1.dist-info/METADATA +178 -0
- genarena-0.1.1.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
- genarena-0.1.1.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/experiments.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Copyright 2026 Ruihang Li.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0.
|
|
3
|
+
# See LICENSE file in the project root for details.
|
|
4
|
+
|
|
5
|
+
"""Experiment name utilities for GenArena.
|
|
6
|
+
|
|
7
|
+
This module centralizes experiment naming rules and milestone detection.
|
|
8
|
+
|
|
9
|
+
Key rules:
|
|
10
|
+
- exp_name must end with a date suffix: `_yyyymmdd` (8 digits)
|
|
11
|
+
- milestone experiments are identified by a fixed prefix constant
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from datetime import date, datetime
|
|
20
|
+
from typing import List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
# Fixed milestone prefix (must be constant across the codebase)
|
|
23
|
+
MILESTONE_PREFIX: str = "GenArena_"
|
|
24
|
+
|
|
25
|
+
# exp_name must end with "_yyyymmdd"
|
|
26
|
+
_EXP_DATE_SUFFIX_RE = re.compile(r"^(?P<prefix>.+)_(?P<date>\d{8})$")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_exp_date_suffix(exp_name: str) -> Optional[date]:
    """Extract the trailing `_yyyymmdd` date from an experiment name.

    Returns the parsed ``date``, or None when the name lacks the required
    suffix or the eight digits do not form a real calendar date.
    """
    match = re.match(r"^(?P<prefix>.+)_(?P<date>\d{8})$", exp_name or "")
    if match is None:
        return None
    try:
        parsed = datetime.strptime(match.group("date"), "%Y%m%d")
    except ValueError:
        # Eight digits, but not a valid calendar date (e.g. month 13).
        return None
    return parsed.date()
def is_valid_exp_name(exp_name: str) -> bool:
    """Check whether exp_name carries a parseable `_yyyymmdd` date suffix."""
    parsed = parse_exp_date_suffix(exp_name)
    return parsed is not None
def validate_exp_name(exp_name: str) -> Tuple[bool, str]:
    """Validate an experiment name.

    Returns an ``(ok, error_message)`` pair; the message is "" when valid.
    """
    error = ""
    if not exp_name:
        error = "exp_name is empty"
    elif _EXP_DATE_SUFFIX_RE.match(exp_name) is None:
        # Shape is wrong: no `_` + eight digits at the end.
        error = "exp_name must end with `_yyyymmdd` (e.g., `MyExp_20260128`)"
    elif parse_exp_date_suffix(exp_name) is None:
        # Shape is right but the digits are not a real date.
        error = "exp_name has an invalid date suffix; expected a real calendar date in `_yyyymmdd`"
    return (error == ""), error
def require_valid_exp_name(exp_name: str) -> None:
    """Guard helper: raise ValueError when exp_name fails validation."""
    is_ok, error_message = validate_exp_name(exp_name)
    if is_ok:
        return
    raise ValueError(error_message)
def is_milestone_exp(exp_name: str) -> bool:
    """Tell whether the experiment name marks a milestone run.

    Milestones are identified purely by the fixed ``MILESTONE_PREFIX``.
    """
    name = exp_name if exp_name else ""
    return name.startswith(MILESTONE_PREFIX)
@dataclass(frozen=True)
class ExperimentInfo:
    """Parsed experiment directory info.

    Immutable record pairing an experiment directory name with the date
    parsed from its `_yyyymmdd` suffix.
    """

    name: str  # full directory name, e.g. "MyExp_20260128"
    date: date  # calendar date parsed from the `_yyyymmdd` suffix
def discover_experiments(models_root: str) -> List[ExperimentInfo]:
    """Discover valid experiment directories under a models root directory.

    Args:
        models_root: `arena_dir/<subset>/models`

    Returns:
        List of ExperimentInfo for directories whose name matches
        `_yyyymmdd`, sorted by (date, name) for deterministic ordering.
    """
    if not os.path.isdir(models_root):
        return []

    found: List[ExperimentInfo] = []
    for entry in os.listdir(models_root):
        # Hidden entries and plain files are never experiments.
        if entry.startswith(".") or not os.path.isdir(os.path.join(models_root, entry)):
            continue
        suffix_date = parse_exp_date_suffix(entry)
        if suffix_date is not None:
            found.append(ExperimentInfo(name=entry, date=suffix_date))

    return sorted(found, key=lambda info: (info.date, info.name))
def pick_latest_experiment_name(models_root: str) -> str:
    """Pick the latest experiment name under models_root by `_yyyymmdd` suffix.

    Args:
        models_root: `arena_dir/<subset>/models`

    Returns:
        Name of the experiment directory with the most recent date suffix
        (ties broken by lexicographic name order).

    Raises:
        ValueError: if no valid experiment directories exist.
    """
    infos = discover_experiments(models_root)
    if not infos:
        raise ValueError(
            f"No valid experiments found under models dir: {models_root}. "
            # Plain string: no placeholders, so no f-prefix needed (F541).
            "Expected subdirectories named like `<something>_yyyymmdd`."
        )
    # discover_experiments() returns entries sorted by (date, name), so the
    # last element is the latest experiment.
    return infos[-1].name
genarena/leaderboard.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# Copyright 2026 Ruihang Li.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0.
|
|
3
|
+
# See LICENSE file in the project root for details.
|
|
4
|
+
|
|
5
|
+
"""Leaderboard generation module."""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from genarena.state import ArenaState
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class LeaderboardEntry:
    """One row of the leaderboard for a single model."""

    rank: int
    model: str
    elo: float
    wins: int
    losses: int
    ties: int
    total_battles: int
    win_rate: float
    ci_lower: Optional[float] = None
    ci_upper: Optional[float] = None

    @property
    def ci_width(self) -> Optional[float]:
        """95% CI width, or None if CI not computed."""
        if self.ci_lower is None:
            return None
        if self.ci_upper is None:
            return None
        return self.ci_upper - self.ci_lower

    @property
    def ci_str(self) -> str:
        """Format CI as string (e.g., '±7.5' or 'N/A')."""
        width = self.ci_width
        return "N/A" if width is None else f"±{width / 2:.1f}"
def generate_leaderboard(
    state: ArenaState,
    title: Optional[str] = None,
    show_ci: bool = True,
) -> str:
    """
    Generate a Markdown leaderboard from arena state.

    Args:
        state: ArenaState with model statistics
        title: Optional title for the leaderboard
        show_ci: Whether to show CI column (default: True)

    Returns:
        Markdown formatted leaderboard string
    """
    # Reuse the shared builder (sorted by ELO descending, 1-based ranks
    # assigned) instead of duplicating its logic here.
    entries = get_leaderboard_entries(state)

    # Only render the CI column when at least one entry has CI data.
    show_ci = show_ci and any(e.ci_lower is not None for e in entries)

    lines = []

    # Title
    lines.append(f"# {title}" if title else "# ELO Leaderboard")
    lines.append("")

    # Summary
    lines.append(f"**Total Models:** {len(entries)}")
    lines.append(f"**Total Battles:** {state.total_battles}")
    if state.last_updated:
        lines.append(f"**Last Updated:** {state.last_updated}")
    lines.append("")

    # Table header
    if show_ci:
        lines.append("| Rank | Model | ELO | 95% CI | Win Rate | W/L/T | Battles |")
        lines.append("|------|-------|-----|--------|----------|-------|---------|")
    else:
        lines.append("| Rank | Model | ELO | Win Rate | W/L/T | Battles |")
        lines.append("|------|-------|-----|----------|-------|---------|")

    # Table rows
    for entry in entries:
        win_rate_pct = f"{entry.win_rate * 100:.1f}%"
        wlt = f"{entry.wins}/{entry.losses}/{entry.ties}"
        elo_str = f"{entry.elo:.0f}"

        if show_ci:
            lines.append(
                f"| {entry.rank} | {entry.model} | {elo_str} | {entry.ci_str} | "
                f"{win_rate_pct} | {wlt} | {entry.total_battles} |"
            )
        else:
            lines.append(
                f"| {entry.rank} | {entry.model} | {elo_str} | {win_rate_pct} | "
                f"{wlt} | {entry.total_battles} |"
            )

    lines.append("")

    # Footer (plain string: no placeholders, so no f-prefix — was F541)
    lines.append("---")
    lines.append("*Generated by GenArena Arena Evaluation*")

    return "\n".join(lines)
def save_leaderboard(
    state: ArenaState,
    path: str,
    title: Optional[str] = None
) -> None:
    """
    Generate and save leaderboard to a file.

    Args:
        state: ArenaState with model statistics
        path: Path to save the Markdown file (typically README.md)
        title: Optional title for the leaderboard
    """
    markdown = generate_leaderboard(state, title)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(markdown)
def generate_experiment_readme(
    exp_name: str,
    elo: dict[str, float],
    model_count: int,
    battle_count: int,
    ci_lower: Optional[dict[str, float]] = None,
    ci_upper: Optional[dict[str, float]] = None,
) -> str:
    """
    Generate README.md content for an experiment directory.

    Shows the cumulative ELO leaderboard up to (and including) this experiment.

    Args:
        exp_name: Experiment name (e.g., "MyExp_20260128")
        elo: Dict mapping model name to ELO rating
        model_count: Number of models
        battle_count: Total battles up to this experiment
        ci_lower: Optional dict of CI lower bounds
        ci_upper: Optional dict of CI upper bounds

    Returns:
        Markdown formatted README content
    """
    # CI columns are rendered only when both bound dicts were supplied and
    # the lower-bound dict is non-empty (bool() covers None and empty).
    with_ci = bool(ci_lower) and ci_upper is not None

    out: list[str] = [
        f"# {exp_name}",
        "",
        f"**Models:** {model_count}",
        f"**Cumulative Battles:** {battle_count}",
        "",
    ]

    # Table header
    if with_ci:
        out.append("| Rank | Model | ELO | 95% CI |")
        out.append("|------|-------|-----|--------|")
    else:
        out.append("| Rank | Model | ELO |")
        out.append("|------|-------|-----|")

    # Highest ELO first; rank is the 1-based position in that ordering.
    ranked = sorted(elo.items(), key=lambda item: item[1], reverse=True)
    for rank, (model, rating) in enumerate(ranked, start=1):
        cells = [str(rank), model, f"{rating:.0f}"]
        if with_ci:
            if model in ci_lower and model in ci_upper:
                half_width = (ci_upper[model] - ci_lower[model]) / 2
                cells.append(f"±{half_width:.1f}")
            else:
                # Model lacks CI data even though CI columns are shown.
                cells.append("N/A")
        out.append("| " + " | ".join(cells) + " |")

    out.extend(["", "---", "*Generated by GenArena*"])
    return "\n".join(out)
def print_leaderboard(state: ArenaState, title: Optional[str] = None) -> None:
    """
    Print leaderboard to stdout.

    Args:
        state: ArenaState with model statistics
        title: Optional title for the leaderboard
    """
    print(generate_leaderboard(state, title))
def get_leaderboard_entries(state: ArenaState) -> list[LeaderboardEntry]:
    """
    Get leaderboard entries as a list.

    Args:
        state: ArenaState with model statistics

    Returns:
        List of LeaderboardEntry objects sorted by ELO (descending), with
        1-based ranks assigned in that order.
    """
    unranked = [
        LeaderboardEntry(
            rank=0,  # placeholder; real rank assigned after sorting
            model=model,
            elo=stats.elo,
            wins=stats.wins,
            losses=stats.losses,
            ties=stats.ties,
            total_battles=stats.total_battles,
            win_rate=stats.win_rate,
            ci_lower=stats.ci_lower,
            ci_upper=stats.ci_upper,
        )
        for model, stats in state.models.items()
    ]

    ranked = sorted(unranked, key=lambda entry: entry.elo, reverse=True)
    for position, entry in enumerate(ranked, start=1):
        entry.rank = position
    return ranked