genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +22 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2430 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.1.dist-info/METADATA +178 -0
- genarena-0.1.1.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
- genarena-0.1.1.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/sync/submit.py
ADDED
@@ -0,0 +1,837 @@
# Copyright 2026 Ruihang Li.
# Licensed under the Apache License, Version 2.0.
# See LICENSE file in the project root for details.

"""
Submission functionality for GenArena.

This module provides the ability for users to submit their evaluation results
to the official leaderboard via GitHub PR.

Workflow:
1. Validate local submission data
2. Upload data to user's HuggingFace repository
3. Create submission metadata JSON
4. Fork official repo and create PR via GitHub CLI
"""

import hashlib
import json
import logging
import os
import subprocess
import tempfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Optional

from genarena import __version__
from genarena.experiments import is_valid_exp_name
from genarena.logs import load_battle_records
from genarena.sync.packer import (
    TempPackingContext,
    pack_exp_dir,
    pack_directory,
    IMAGE_EXTENSIONS,
)

logger = logging.getLogger(__name__)

# Default official submissions repository
DEFAULT_OFFICIAL_REPO = "genarena/submissions"

# URL to fetch official models list
OFFICIAL_MODELS_URL = (
    "https://raw.githubusercontent.com/genarena/submissions/main/official_models.json"
)


@dataclass
class ValidationResult:
    """Result of local submission validation."""

    valid: bool
    exp_name: str
    subset: str
    models: list[str] = field(default_factory=list)
    new_models: list[str] = field(default_factory=list)
    existing_models: list[str] = field(default_factory=list)
    total_battles: int = 0
    battles_per_pair: dict[str, int] = field(default_factory=dict)
    elo_ratings: dict[str, float] = field(default_factory=dict)
    elo_ci: dict[str, tuple[float, float]] = field(default_factory=dict)
    evaluation_config: dict[str, Any] = field(default_factory=dict)
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)


@dataclass
class UploadResult:
    """Result of HuggingFace upload."""

    hf_repo: str
    hf_revision: str
    models_zip_path: str
    models_zip_sha256: str
    models_zip_size: int
    pk_logs_zip_path: str
    pk_logs_zip_sha256: str
    pk_logs_zip_size: int


def fetch_official_models(subset: str, timeout: int = 10) -> set[str]:
    """
    Fetch official models list from GitHub.

    Args:
        subset: Subset name to get models for
        timeout: Request timeout in seconds

    Returns:
        Set of official model names for the subset
    """
    import urllib.request
    import urllib.error

    try:
        with urllib.request.urlopen(OFFICIAL_MODELS_URL, timeout=timeout) as resp:
            data = json.load(resp)
            return set(data.get("subsets", {}).get(subset, {}).get("models", []))
    except urllib.error.URLError as e:
        logger.warning(f"Failed to fetch official models list: {e}")
        return set()
    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse official models list: {e}")
        return set()
    except Exception as e:
        logger.warning(f"Unexpected error fetching official models: {e}")
        return set()
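
# Shape of official_models.json, as consumed above and produced by
# generate_official_models_json() below (values illustrative):
#
#   {
#     "last_updated": "2026-01-01T00:00:00+00:00",
#     "description": "List of models currently on the official GenArena leaderboard",
#     "subsets": {
#       "<subset>": {"models": ["model-a", "model-b"], "model_count": 2,
#                    "total_battles": 1000}
#     }
#   }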


def _load_experiment_config(exp_dir: str) -> dict[str, Any]:
    """Load experiment configuration from config.json."""
    config_path = os.path.join(exp_dir, "config.json")
    if not os.path.isfile(config_path):
        return {}
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, IOError):
        return {}


def validate_local_submission(
    arena_dir: str,
    subset: str,
    exp_name: str,
    skip_official_check: bool = False,
) -> ValidationResult:
    """
    Validate local submission data.

    Checks:
    1. exp_name format (_yyyymmdd suffix)
    2. pk_logs directory exists and has battle records
    3. models directory exists and has model outputs
    4. All models in battles have corresponding outputs
    5. At least one model is new (not in official leaderboard)

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        skip_official_check: Skip checking against official models (for testing)

    Returns:
        ValidationResult with validation status and details
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Check exp_name format
    if not is_valid_exp_name(exp_name):
        errors.append(
            f"Invalid exp_name format: '{exp_name}' must end with _yyyymmdd"
        )

    # Check paths exist
    pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
    exp_dir = os.path.join(pk_logs_dir, exp_name)
    models_root = os.path.join(arena_dir, subset, "models")
    exp_models_dir = os.path.join(models_root, exp_name)

    if not os.path.isdir(exp_dir):
        errors.append(f"pk_logs directory not found: {exp_dir}")

    if not os.path.isdir(exp_models_dir):
        errors.append(f"models directory not found: {exp_models_dir}")

    if errors:
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Load battle records
    records = load_battle_records(pk_logs_dir, exp_name=exp_name)
    if not records:
        errors.append("No battle records found in pk_logs")
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Extract models and battle statistics
    models: set[str] = set()
    battles_per_pair: dict[str, int] = {}

    for r in records:
        model_a = r.get("model_a", "")
        model_b = r.get("model_b", "")
        if model_a and model_b:
            models.add(model_a)
            models.add(model_b)
            # Ensure consistent pair key (sorted)
            pair_key = f"{min(model_a, model_b)}_vs_{max(model_a, model_b)}"
            battles_per_pair[pair_key] = battles_per_pair.get(pair_key, 0) + 1

    models_list = sorted(models)

    # Check model outputs exist
    for model in models_list:
        model_dir = os.path.join(exp_models_dir, model)
        if not os.path.isdir(model_dir):
            errors.append(f"Model output directory not found: {model_dir}")
        else:
            # Check if there are any images
            has_images = False
            for f in os.listdir(model_dir):
                ext = os.path.splitext(f)[1].lower()
                if ext in IMAGE_EXTENSIONS:
                    has_images = True
                    break
            if not has_images:
                errors.append(f"No image files found in model directory: {model_dir}")

    # Check against official models
    if not skip_official_check:
        official_models = fetch_official_models(subset)
        new_models = [m for m in models_list if m not in official_models]
        existing_models = [m for m in models_list if m in official_models]

        if not new_models:
            errors.append(
                "No new models found. All models already exist in official leaderboard. "
                "Submissions must include at least one new model."
            )
    else:
        new_models = models_list
        existing_models = []
        warnings.append("Skipped official models check (--skip-official-check)")

    # Calculate ELO (only if no critical errors so far)
    elo_ratings: dict[str, float] = {}
    elo_ci: dict[str, tuple[float, float]] = {}

    if not errors:
        try:
            from genarena.bt_elo import compute_bootstrap_bt_elo

            battles = [
                (r["model_a"], r["model_b"], r["final_winner"])
                for r in records
                if r.get("model_a") and r.get("model_b") and r.get("final_winner")
            ]

            if battles:
                bt_result = compute_bootstrap_bt_elo(battles, num_bootstrap=100)
                elo_ratings = bt_result.ratings
                for model in models_list:
                    if model in bt_result.ci_lower and model in bt_result.ci_upper:
                        elo_ci[model] = (
                            bt_result.ci_lower[model],
                            bt_result.ci_upper[model],
                        )
        except Exception as e:
            warnings.append(f"Failed to calculate ELO: {e}")

    # Load evaluation config
    evaluation_config = _load_experiment_config(exp_dir)

    return ValidationResult(
        valid=len(errors) == 0,
        exp_name=exp_name,
        subset=subset,
        models=models_list,
        new_models=new_models,
        existing_models=existing_models,
        total_battles=len(records),
        battles_per_pair=battles_per_pair,
        elo_ratings=elo_ratings,
        elo_ci=elo_ci,
        evaluation_config=evaluation_config,
        errors=errors,
        warnings=warnings,
    )
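
# On-disk layout that validate_local_submission() expects (model and file
# names are placeholders; any extension in IMAGE_EXTENSIONS counts):
#
#   <arena_dir>/<subset>/pk_logs/<exp_name>/              battle records + config.json
#   <arena_dir>/<subset>/models/<exp_name>/<model>/*.png  model outputs (images)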


def upload_submission_data(
    arena_dir: str,
    subset: str,
    exp_name: str,
    hf_repo: str,
    hf_revision: str = "main",
    show_progress: bool = True,
) -> UploadResult:
    """
    Pack and upload submission data to HuggingFace.

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        hf_repo: HuggingFace repository ID (e.g., "username/repo-name")
        hf_revision: Repository revision/branch (default: "main")
        show_progress: Show upload progress

    Returns:
        UploadResult with upload details

    Raises:
        RuntimeError: If upload fails
    """
    from huggingface_hub import HfApi

    api = HfApi()

    # Paths
    exp_models_dir = os.path.join(arena_dir, subset, "models", exp_name)
    exp_dir = os.path.join(arena_dir, subset, "pk_logs", exp_name)

    with TempPackingContext() as ctx:
        # Pack models
        models_zip_path = ctx.get_temp_zip_path(f"{subset}/models/{exp_name}.zip")
        success, msg = pack_directory(
            exp_models_dir, models_zip_path, file_extensions=IMAGE_EXTENSIONS
        )
        if not success:
            raise RuntimeError(f"Failed to pack models: {msg}")

        # Calculate SHA256 for models
        with open(models_zip_path, "rb") as f:
            models_sha256 = hashlib.sha256(f.read()).hexdigest()
        models_size = os.path.getsize(models_zip_path)

        # Pack pk_logs
        logs_zip_path = ctx.get_temp_zip_path(f"{subset}/pk_logs/{exp_name}.zip")
        success, msg = pack_exp_dir(exp_dir, logs_zip_path)
        if not success:
            raise RuntimeError(f"Failed to pack pk_logs: {msg}")

        # Calculate SHA256 for logs
        with open(logs_zip_path, "rb") as f:
            logs_sha256 = hashlib.sha256(f.read()).hexdigest()
        logs_size = os.path.getsize(logs_zip_path)

        # Upload to HF
        hf_models_path = f"{subset}/models/{exp_name}.zip"
        hf_logs_path = f"{subset}/pk_logs/{exp_name}.zip"

        logger.info(f"Uploading models ZIP ({models_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=models_zip_path,
            path_in_repo=hf_models_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        logger.info(f"Uploading pk_logs ZIP ({logs_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=logs_zip_path,
            path_in_repo=hf_logs_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        return UploadResult(
            hf_repo=hf_repo,
            hf_revision=hf_revision,
            models_zip_path=hf_models_path,
            models_zip_sha256=models_sha256,
            models_zip_size=models_size,
            pk_logs_zip_path=hf_logs_path,
            pk_logs_zip_sha256=logs_sha256,
            pk_logs_zip_size=logs_size,
        )
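
# Layout in the HuggingFace dataset repo after a successful upload:
#
#   <subset>/models/<exp_name>.zip    packed model outputs (image files only)
#   <subset>/pk_logs/<exp_name>.zip   packed battle logs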


def create_submission_metadata(
    validation: ValidationResult,
    upload: UploadResult,
    github_username: str,
    title: str = "",
    description: str = "",
    contact: str = "",
) -> dict[str, Any]:
    """
    Create submission metadata JSON.

    Args:
        validation: ValidationResult from validate_local_submission
        upload: UploadResult from upload_submission_data
        github_username: GitHub username of submitter
        title: Submission title
        description: Submission description
        contact: Optional contact email

    Returns:
        Submission metadata dictionary
    """
    # Generate submission ID
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    hash_input = f"{timestamp}{validation.exp_name}{github_username}"
    short_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:8]
    submission_id = f"sub_{timestamp}_{short_hash}"

    # Build submitter info
    submitter: dict[str, str] = {"github_username": github_username}
    if contact:
        submitter["contact"] = contact

    # Build evaluation config (extract key fields)
    eval_config = validation.evaluation_config
    evaluation_config_summary = {
        "judge_model": eval_config.get("judge_model", "unknown"),
        "prompt_module": eval_config.get("prompt", "unknown"),
        "temperature": eval_config.get("temperature", 0.0),
        "position_debiasing": True,  # Always true in genarena
    }

    # Build model pairs list
    model_pairs = [
        [min(p.split("_vs_")[0], p.split("_vs_")[1]),
         max(p.split("_vs_")[0], p.split("_vs_")[1])]
        for p in validation.battles_per_pair.keys()
    ]

    return {
        "schema_version": "1.0",
        "submission_id": submission_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "submitter": submitter,
        "experiment": {
            "exp_name": validation.exp_name,
            "subset": validation.subset,
            "models": validation.models,
            "new_models": validation.new_models,
            "existing_models": validation.existing_models,
            "model_pairs": model_pairs,
            "total_battles": validation.total_battles,
            "battles_per_pair": validation.battles_per_pair,
        },
        "data_location": {
            "hf_repo_id": upload.hf_repo,
            "hf_revision": upload.hf_revision,
            "files": {
                "models_zip": {
                    "path": upload.models_zip_path,
                    "sha256": upload.models_zip_sha256,
                    "size_bytes": upload.models_zip_size,
                },
                "pk_logs_zip": {
                    "path": upload.pk_logs_zip_path,
                    "sha256": upload.pk_logs_zip_sha256,
                    "size_bytes": upload.pk_logs_zip_size,
                },
            },
        },
        "elo_preview": {
            "ratings": validation.elo_ratings,
            "ci_95": {m: list(ci) for m, ci in validation.elo_ci.items()},
        },
        "evaluation_config": evaluation_config_summary,
        "title": title or f"Submit {validation.exp_name}",
        "description": description,
        "verification": {
            "local_validation_passed": validation.valid,
            "genarena_version": __version__,
        },
    }
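
# A generated submission_id looks like "sub_20260101T120000_1a2b3c4d": a UTC
# timestamp plus the first 8 hex chars of
# sha256(timestamp + exp_name + github_username) (values illustrative).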


def _get_github_username() -> Optional[str]:
    """Get GitHub username from gh CLI."""
    try:
        result = subprocess.run(
            ["gh", "api", "user", "-q", ".login"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode == 0:
            return result.stdout.strip()
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass
    return None


def _check_gh_cli() -> tuple[bool, str]:
    """Check if GitHub CLI is available and authenticated."""
    try:
        # Check if gh is installed
        result = subprocess.run(
            ["gh", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            return False, "GitHub CLI (gh) is not installed"

        # Check if authenticated
        result = subprocess.run(
            ["gh", "auth", "status"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            return False, "GitHub CLI is not authenticated. Run 'gh auth login' first."

        return True, "GitHub CLI is ready"
    except FileNotFoundError:
        return False, "GitHub CLI (gh) is not installed. Install it from https://cli.github.com"
    except subprocess.TimeoutExpired:
        return False, "GitHub CLI timed out"


def _generate_pr_body(submission: dict[str, Any]) -> str:
    """Generate PR description body."""
    exp = submission["experiment"]
    elo = submission["elo_preview"]["ratings"]
    eval_config = submission["evaluation_config"]

    body = f"""## Submission Details

**Experiment:** `{exp['exp_name']}`
**Subset:** `{exp['subset']}`
**New Models:** {', '.join(f'`{m}`' for m in exp['new_models']) or 'None'}
**Total Battles:** {exp['total_battles']:,}
**Model Pairs:** {len(exp['model_pairs'])}

### Evaluation Configuration

| Setting | Value |
|---------|-------|
| Judge Model | `{eval_config.get('judge_model', 'N/A')}` |
| Prompt Module | `{eval_config.get('prompt_module', 'N/A')}` |
| Temperature | {eval_config.get('temperature', 'N/A')} |
| Position Debiasing | {'Yes' if eval_config.get('position_debiasing') else 'No'} |

### ELO Preview

| Model | ELO | 95% CI |
|-------|-----|--------|
"""
    ci_data = submission["elo_preview"].get("ci_95", {})
    for model in sorted(elo.keys(), key=lambda m: -elo[m]):
        ci = ci_data.get(model, [None, None])
        ci_str = f"[{ci[0]:.1f}, {ci[1]:.1f}]" if ci[0] is not None else "N/A"
        body += f"| {model} | {elo[model]:.1f} | {ci_str} |\n"

    body += f"""
### Data Location

- **HuggingFace Repo:** `{submission['data_location']['hf_repo_id']}`
- **Models ZIP:** `{submission['data_location']['files']['models_zip']['path']}`
  - SHA256: `{submission['data_location']['files']['models_zip']['sha256'][:16]}...`
  - Size: {submission['data_location']['files']['models_zip']['size_bytes'] / 1024 / 1024:.1f} MB
- **Logs ZIP:** `{submission['data_location']['files']['pk_logs_zip']['path']}`
  - SHA256: `{submission['data_location']['files']['pk_logs_zip']['sha256'][:16]}...`
  - Size: {submission['data_location']['files']['pk_logs_zip']['size_bytes'] / 1024:.1f} KB

### Description

{submission.get('description') or submission.get('title', 'No description provided.')}

---
*Submitted via genarena v{submission['verification']['genarena_version']}*
"""
    return body


def create_submission_pr(
    submission: dict[str, Any],
    official_repo: str = DEFAULT_OFFICIAL_REPO,
    title: Optional[str] = None,
) -> str:
    """
    Fork official repo and create PR with submission.

    Args:
        submission: Submission metadata dictionary
        official_repo: Official submissions repository (default: genarena/submissions)
        title: PR title (optional, auto-generated if not provided)

    Returns:
        PR URL

    Raises:
        RuntimeError: If PR creation fails
    """
    submission_id = submission["submission_id"]
    filename = f"{submission_id}.json"

    # Get GitHub username
    gh_username = _get_github_username()
    if not gh_username:
        raise RuntimeError("Failed to get GitHub username. Ensure gh CLI is authenticated.")

    # Fork the repo (idempotent - won't fail if already forked)
    logger.info(f"Forking {official_repo}...")
    result = subprocess.run(
        ["gh", "repo", "fork", official_repo, "--clone=false"],
        capture_output=True,
        text=True,
        timeout=60,
    )
    # Note: fork may "fail" if already forked, but that's OK

    # Clone forked repo to temp directory
    with tempfile.TemporaryDirectory() as tmpdir:
        fork_repo = f"{gh_username}/submissions"
        logger.info(f"Cloning {fork_repo}...")

        result = subprocess.run(
            ["gh", "repo", "clone", fork_repo, tmpdir],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to clone fork: {result.stderr}")

        # Sync with upstream
        logger.info("Syncing with upstream...")
        subprocess.run(
            ["gh", "repo", "sync", fork_repo, "--source", official_repo],
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Pull latest changes
        subprocess.run(
            ["git", "pull", "origin", "main"],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Create branch
        branch_name = f"submit/{submission_id}"
        logger.info(f"Creating branch {branch_name}...")
        result = subprocess.run(
            ["git", "checkout", "-b", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to create branch: {result.stderr}")

        # Write submission file
        submissions_dir = os.path.join(tmpdir, "submissions", "pending")
        os.makedirs(submissions_dir, exist_ok=True)
        submission_path = os.path.join(submissions_dir, filename)

        with open(submission_path, "w", encoding="utf-8") as f:
            json.dump(submission, f, indent=2, ensure_ascii=False)

        # Commit
        logger.info("Committing submission...")
        subprocess.run(["git", "add", "."], cwd=tmpdir, check=True)

        commit_msg = title or f"Submit {submission['experiment']['exp_name']}"
        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to commit: {result.stderr}")

        # Push
        logger.info("Pushing to fork...")
        result = subprocess.run(
            ["git", "push", "-u", "origin", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to push: {result.stderr}")

        # Create PR
        logger.info("Creating PR...")
        pr_title = title or f"[Submission] {submission['experiment']['exp_name']}"
        pr_body = _generate_pr_body(submission)

        result = subprocess.run(
            [
                "gh", "pr", "create",
                "--repo", official_repo,
                "--head", f"{gh_username}:{branch_name}",
                "--title", pr_title,
                "--body", pr_body,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )

        if result.returncode != 0:
            raise RuntimeError(f"Failed to create PR: {result.stderr}")

        pr_url = result.stdout.strip()
        return pr_url
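
# Net effect: a branch "submit/<submission_id>" on the user's fork adding
# "submissions/pending/<submission_id>.json", plus a PR against the official
# repo whose body is rendered by _generate_pr_body().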


def print_validation_summary(validation: ValidationResult) -> None:
    """Print validation summary to console."""
    print("\nValidation Results:")
    print("-" * 40)

    if validation.valid:
        print("Status: PASSED")
    else:
        print("Status: FAILED")

    print(f"\nExperiment: {validation.exp_name}")
    print(f"Subset: {validation.subset}")
    print(f"Models: {len(validation.models)}")
    print(f"  New models: {', '.join(validation.new_models) or 'None'}")
    print(f"  Existing models: {', '.join(validation.existing_models) or 'None'}")
    print(f"Total battles: {validation.total_battles:,}")
    print(f"Model pairs: {len(validation.battles_per_pair)}")

    if validation.elo_ratings:
        print("\nELO Preview:")
        sorted_models = sorted(
            validation.elo_ratings.keys(),
            key=lambda m: -validation.elo_ratings[m]
        )
        for model in sorted_models:
            elo = validation.elo_ratings[model]
            ci = validation.elo_ci.get(model)
            ci_str = f" [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
            new_marker = " (new)" if model in validation.new_models else ""
            print(f"  {model}: {elo:.1f}{ci_str}{new_marker}")

    if validation.evaluation_config:
        config = validation.evaluation_config
        print("\nEvaluation Config:")
        print(f"  Judge model: {config.get('judge_model', 'N/A')}")
        print(f"  Prompt: {config.get('prompt', 'N/A')}")
        print(f"  Temperature: {config.get('temperature', 'N/A')}")

    if validation.warnings:
        print("\nWarnings:")
        for w in validation.warnings:
            print(f"  - {w}")

    if validation.errors:
        print("\nErrors:")
        for e in validation.errors:
            print(f"  - {e}")

    print()


def generate_official_models_json(
    arena_dir: str,
    output_path: Optional[str] = None,
) -> dict[str, Any]:
    """
    Generate official_models.json from arena state files.

    This function scans all subsets in the arena directory and extracts
    the list of models from each subset's state.json file.

    Args:
        arena_dir: Path to the official arena directory
        output_path: Optional path to write the JSON file

    Returns:
        The official_models.json content as a dictionary
    """
    from genarena.sync.packer import discover_subsets
    from genarena.state import load_state

    result: dict[str, Any] = {
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "description": "List of models currently on the official GenArena leaderboard",
        "subsets": {},
    }

    # Discover all subsets
    subsets = discover_subsets(arena_dir)

    for subset in subsets:
        state_path = os.path.join(arena_dir, subset, "arena", "state.json")
        if not os.path.isfile(state_path):
            continue

        state = load_state(state_path)
        if not state.models:
            continue

        # Get sorted list of model names
        models = sorted(state.models.keys())

        result["subsets"][subset] = {
            "models": models,
            "model_count": len(models),
            "total_battles": state.total_battles,
        }

    # Write to file if output_path specified
    if output_path:
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        logger.info(f"Wrote official_models.json to {output_path}")

    return result


def print_official_models_summary(data: dict[str, Any]) -> None:
    """Print summary of official models."""
    print("\n=== Official Models ===\n")
    print(f"Last Updated: {data.get('last_updated', 'N/A')}")
    print()

    subsets = data.get("subsets", {})
    if not subsets:
        print("No subsets found.")
        return

    for subset, info in sorted(subsets.items()):
        models = info.get("models", [])
        print(f"Subset: {subset}")
        print(f"  Models ({len(models)}):")
        for model in models:
            print(f"    - {model}")
        print(f"  Total Battles: {info.get('total_battles', 0):,}")
        print()
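
# e.g. generate_official_models_json("<official_arena_dir>", "official_models.json")
# writes the JSON consumed by fetch_official_models() above (paths illustrative).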