genarena-0.0.1-py3-none-any.whl → genarena-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/experiments.py
@@ -0,0 +1,121 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """Experiment name utilities for GenArena.
+
+ This module centralizes experiment naming rules and milestone detection.
+
+ Key rules:
+ - exp_name must end with a date suffix: `_yyyymmdd` (8 digits)
+ - milestone experiments are identified by a fixed prefix constant
+ """
+
+ from __future__ import annotations
+
+ import os
+ import re
+ from dataclasses import dataclass
+ from datetime import date, datetime
+ from typing import List, Optional, Tuple
+
+ # Fixed milestone prefix (must be constant across the codebase)
+ MILESTONE_PREFIX: str = "GenArena_"
+
+ # exp_name must end with "_yyyymmdd"
+ _EXP_DATE_SUFFIX_RE = re.compile(r"^(?P<prefix>.+)_(?P<date>\d{8})$")
+
+
+ def parse_exp_date_suffix(exp_name: str) -> Optional[date]:
+     """Parse the `_yyyymmdd` suffix from exp_name.
+
+     Returns None if exp_name does not match the required format or if the date is invalid.
+     """
+     m = _EXP_DATE_SUFFIX_RE.match(exp_name or "")
+     if not m:
+         return None
+     datestr = m.group("date")
+     try:
+         return datetime.strptime(datestr, "%Y%m%d").date()
+     except ValueError:
+         return None
+
+
+ def is_valid_exp_name(exp_name: str) -> bool:
+     """Return True iff exp_name ends with a valid `_yyyymmdd` date suffix."""
+     return parse_exp_date_suffix(exp_name) is not None
+
+
+ def validate_exp_name(exp_name: str) -> Tuple[bool, str]:
+     """Validate exp_name. Returns (ok, error_message)."""
+     if not exp_name:
+         return False, "exp_name is empty"
+     if not _EXP_DATE_SUFFIX_RE.match(exp_name):
+         return False, "exp_name must end with `_yyyymmdd` (e.g., `MyExp_20260128`)"
+     if parse_exp_date_suffix(exp_name) is None:
+         return False, "exp_name has an invalid date suffix; expected a real calendar date in `_yyyymmdd`"
+     return True, ""
+
+
+ def require_valid_exp_name(exp_name: str) -> None:
+     """Raise ValueError if exp_name is invalid."""
+     ok, msg = validate_exp_name(exp_name)
+     if not ok:
+         raise ValueError(msg)
+
+
+ def is_milestone_exp(exp_name: str) -> bool:
+     """Return True if exp_name is a milestone experiment (fixed prefix)."""
+     return (exp_name or "").startswith(MILESTONE_PREFIX)
+
+
+ @dataclass(frozen=True)
+ class ExperimentInfo:
+     """Parsed experiment directory info."""
+
+     name: str
+     date: date
+
+
+ def discover_experiments(models_root: str) -> List[ExperimentInfo]:
+     """Discover valid experiment directories under a models root directory.
+
+     Args:
+         models_root: `arena_dir/<subset>/models`
+
+     Returns:
+         List of ExperimentInfo for directories whose name matches `_yyyymmdd`.
+     """
+     infos: List[ExperimentInfo] = []
+     if not os.path.isdir(models_root):
+         return infos
+
+     for name in os.listdir(models_root):
+         if name.startswith("."):
+             continue
+         path = os.path.join(models_root, name)
+         if not os.path.isdir(path):
+             continue
+         d = parse_exp_date_suffix(name)
+         if d is None:
+             continue
+         infos.append(ExperimentInfo(name=name, date=d))
+
+     # Sort by (date, name) for deterministic ordering
+     infos.sort(key=lambda x: (x.date, x.name))
+     return infos
+
+
+ def pick_latest_experiment_name(models_root: str) -> str:
+     """Pick the latest experiment name under models_root by `_yyyymmdd` suffix.
+
+     Raises ValueError if no valid experiment directories exist.
+     """
+     infos = discover_experiments(models_root)
+     if not infos:
+         raise ValueError(
+             f"No valid experiments found under models dir: {models_root}. "
+             f"Expected subdirectories named like `<something>_yyyymmdd`."
+         )
+     return infos[-1].name
+
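Taken together, these helpers form a small naming API: validate a name, detect milestones by prefix, and discover and sort experiment directories by their date suffix. The snippet below is a minimal usage sketch against the functions defined in this hunk; the `arena_dir/demo/models` path and the example names are hypothetical.

    from genarena.experiments import (
        MILESTONE_PREFIX,
        discover_experiments,
        is_milestone_exp,
        pick_latest_experiment_name,
        validate_exp_name,
    )

    # Validate a user-supplied experiment name before creating its directory.
    ok, err = validate_exp_name("MyExp_20260128")
    assert ok, err

    # Milestone runs are recognized purely by the fixed prefix constant.
    print(is_milestone_exp(MILESTONE_PREFIX + "baseline_20260128"))  # True

    # Enumerate experiment directories under a (hypothetical) models root and
    # pick the most recent one by its `_yyyymmdd` suffix.
    models_root = "arena_dir/demo/models"
    for info in discover_experiments(models_root):
        print(info.name, info.date)

    try:
        latest = pick_latest_experiment_name(models_root)
        print("latest experiment:", latest)
    except ValueError as exc:
        # Raised when no directory under models_root ends with a valid date suffix.
        print(exc)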
genarena/leaderboard.py
@@ -0,0 +1,270 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """Leaderboard generation module."""
+
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Optional
+
+ from genarena.state import ArenaState
+
+
+ @dataclass
+ class LeaderboardEntry:
+     """Entry in the leaderboard."""
+
+     rank: int
+     model: str
+     elo: float
+     wins: int
+     losses: int
+     ties: int
+     total_battles: int
+     win_rate: float
+     ci_lower: Optional[float] = None
+     ci_upper: Optional[float] = None
+
+     @property
+     def ci_width(self) -> Optional[float]:
+         """95% CI width, or None if CI not computed."""
+         if self.ci_lower is None or self.ci_upper is None:
+             return None
+         return self.ci_upper - self.ci_lower
+
+     @property
+     def ci_str(self) -> str:
+         """Format CI as string (e.g., '±7.5' or 'N/A')."""
+         width = self.ci_width
+         if width is None:
+             return "N/A"
+         return f"±{width / 2:.1f}"
+
+
+ def generate_leaderboard(
+     state: ArenaState,
+     title: Optional[str] = None,
+     show_ci: bool = True,
+ ) -> str:
+     """
+     Generate a Markdown leaderboard from arena state.
+
+     Args:
+         state: ArenaState with model statistics
+         title: Optional title for the leaderboard
+         show_ci: Whether to show CI column (default: True)
+
+     Returns:
+         Markdown formatted leaderboard string
+     """
+     # Build list of entries sorted by ELO
+     entries: list[LeaderboardEntry] = []
+
+     for model, stats in state.models.items():
+         entries.append(LeaderboardEntry(
+             rank=0,  # Will be set after sorting
+             model=model,
+             elo=stats.elo,
+             wins=stats.wins,
+             losses=stats.losses,
+             ties=stats.ties,
+             total_battles=stats.total_battles,
+             win_rate=stats.win_rate,
+             ci_lower=stats.ci_lower,
+             ci_upper=stats.ci_upper,
+         ))
+
+     # Sort by ELO descending
+     entries.sort(key=lambda e: e.elo, reverse=True)
+
+     # Assign ranks
+     for i, entry in enumerate(entries):
+         entry.rank = i + 1
+
+     # Check if any entry has CI information
+     has_ci = any(e.ci_lower is not None for e in entries)
+     show_ci = show_ci and has_ci
+
+     # Generate Markdown
+     lines = []
+
+     # Title
+     if title:
+         lines.append(f"# {title}")
+     else:
+         lines.append("# ELO Leaderboard")
+     lines.append("")
+
+     # Summary
+     lines.append(f"**Total Models:** {len(entries)}")
+     lines.append(f"**Total Battles:** {state.total_battles}")
+     if state.last_updated:
+         lines.append(f"**Last Updated:** {state.last_updated}")
+     lines.append("")
+
+     # Table header
+     if show_ci:
+         lines.append("| Rank | Model | ELO | 95% CI | Win Rate | W/L/T | Battles |")
+         lines.append("|------|-------|-----|--------|----------|-------|---------|")
+     else:
+         lines.append("| Rank | Model | ELO | Win Rate | W/L/T | Battles |")
+         lines.append("|------|-------|-----|----------|-------|---------|")
+
+     # Table rows
+     for entry in entries:
+         win_rate_pct = f"{entry.win_rate * 100:.1f}%"
+         wlt = f"{entry.wins}/{entry.losses}/{entry.ties}"
+         elo_str = f"{entry.elo:.0f}"
+
+         if show_ci:
+             lines.append(
+                 f"| {entry.rank} | {entry.model} | {elo_str} | {entry.ci_str} | "
+                 f"{win_rate_pct} | {wlt} | {entry.total_battles} |"
+             )
+         else:
+             lines.append(
+                 f"| {entry.rank} | {entry.model} | {elo_str} | {win_rate_pct} | "
+                 f"{wlt} | {entry.total_battles} |"
+             )
+
+     lines.append("")
+
+     # Footer
+     lines.append("---")
+     lines.append(f"*Generated by GenArena Arena Evaluation*")
+
+     return "\n".join(lines)
+
+
+ def save_leaderboard(
+     state: ArenaState,
+     path: str,
+     title: Optional[str] = None
+ ) -> None:
+     """
+     Generate and save leaderboard to a file.
+
+     Args:
+         state: ArenaState with model statistics
+         path: Path to save the Markdown file (typically README.md)
+         title: Optional title for the leaderboard
+     """
+     content = generate_leaderboard(state, title)
+
+     with open(path, "w", encoding="utf-8") as f:
+         f.write(content)
+
+
+ def generate_experiment_readme(
+     exp_name: str,
+     elo: dict[str, float],
+     model_count: int,
+     battle_count: int,
+     ci_lower: Optional[dict[str, float]] = None,
+     ci_upper: Optional[dict[str, float]] = None,
+ ) -> str:
+     """
+     Generate README.md content for an experiment directory.
+
+     Shows the cumulative ELO leaderboard up to (and including) this experiment.
+
+     Args:
+         exp_name: Experiment name (e.g., "MyExp_20260128")
+         elo: Dict mapping model name to ELO rating
+         model_count: Number of models
+         battle_count: Total battles up to this experiment
+         ci_lower: Optional dict of CI lower bounds
+         ci_upper: Optional dict of CI upper bounds
+
+     Returns:
+         Markdown formatted README content
+     """
+     lines = []
+
+     # Title
+     lines.append(f"# {exp_name}")
+     lines.append("")
+
+     # Summary
+     lines.append(f"**Models:** {model_count}")
+     lines.append(f"**Cumulative Battles:** {battle_count}")
+     lines.append("")
+
+     # Build sorted entries
+     entries = sorted(elo.items(), key=lambda x: x[1], reverse=True)
+
+     # Check if CI info is available
+     has_ci = ci_lower is not None and ci_upper is not None and len(ci_lower) > 0
+
+     # Table header
+     if has_ci:
+         lines.append("| Rank | Model | ELO | 95% CI |")
+         lines.append("|------|-------|-----|--------|")
+     else:
+         lines.append("| Rank | Model | ELO |")
+         lines.append("|------|-------|-----|")
+
+     # Table rows
+     for rank, (model, elo_val) in enumerate(entries, start=1):
+         elo_str = f"{elo_val:.0f}"
+         if has_ci and model in ci_lower and model in ci_upper:
+             width = ci_upper[model] - ci_lower[model]
+             ci_str = f"±{width / 2:.1f}"
+             lines.append(f"| {rank} | {model} | {elo_str} | {ci_str} |")
+         elif has_ci:
+             lines.append(f"| {rank} | {model} | {elo_str} | N/A |")
+         else:
+             lines.append(f"| {rank} | {model} | {elo_str} |")
+
+     lines.append("")
+     lines.append("---")
+     lines.append("*Generated by GenArena*")
+
+     return "\n".join(lines)
+
+
+ def print_leaderboard(state: ArenaState, title: Optional[str] = None) -> None:
+     """
+     Print leaderboard to stdout.
+
+     Args:
+         state: ArenaState with model statistics
+         title: Optional title for the leaderboard
+     """
+     content = generate_leaderboard(state, title)
+     print(content)
+
+
+ def get_leaderboard_entries(state: ArenaState) -> list[LeaderboardEntry]:
+     """
+     Get leaderboard entries as a list.
+
+     Args:
+         state: ArenaState with model statistics
+
+     Returns:
+         List of LeaderboardEntry objects sorted by ELO
+     """
+     entries: list[LeaderboardEntry] = []
+
+     for model, stats in state.models.items():
+         entries.append(LeaderboardEntry(
+             rank=0,
+             model=model,
+             elo=stats.elo,
+             wins=stats.wins,
+             losses=stats.losses,
+             ties=stats.ties,
+             total_battles=stats.total_battles,
+             win_rate=stats.win_rate,
+             ci_lower=stats.ci_lower,
+             ci_upper=stats.ci_upper,
+         ))
+
+     entries.sort(key=lambda e: e.elo, reverse=True)
+
+     for i, entry in enumerate(entries):
+         entry.rank = i + 1
+
+     return entries
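For a quick sense of the Markdown these functions emit, the sketch below calls `generate_experiment_readme` with made-up model names and ratings; `generate_leaderboard`, `save_leaderboard`, `print_leaderboard`, and `get_leaderboard_entries` follow the same pattern but require an `ArenaState` from `genarena/state.py`, which is not shown in this hunk.

    from genarena.leaderboard import generate_experiment_readme

    # Cumulative ELO snapshot for a hypothetical experiment; the model names,
    # ratings, and confidence bounds are made up for illustration.
    elo = {"model-a": 1042.0, "model-b": 998.5, "model-c": 959.5}
    ci_lower = {"model-a": 1030.0, "model-b": 986.0, "model-c": 948.0}
    ci_upper = {"model-a": 1054.0, "model-b": 1011.0, "model-c": 971.0}

    readme = generate_experiment_readme(
        exp_name="MyExp_20260128",
        elo=elo,
        model_count=len(elo),
        battle_count=300,
        ci_lower=ci_lower,
        ci_upper=ci_upper,
    )
    print(readme)  # Markdown table: rank, model, ELO, ±(CI width / 2)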