genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/sync/submit.py (new file)
@@ -0,0 +1,837 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """
+ Submission functionality for GenArena.
+
+ This module lets users submit their evaluation results to the official
+ leaderboard via a GitHub pull request.
+
+ Workflow:
+ 1. Validate local submission data
+ 2. Upload data to the user's HuggingFace repository
+ 3. Create submission metadata JSON
+ 4. Fork the official repo and create a PR via the GitHub CLI
+ """
+
+ import hashlib
+ import json
+ import logging
+ import os
+ import subprocess
+ import tempfile
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Optional
+
+ from genarena import __version__
+ from genarena.experiments import is_valid_exp_name
+ from genarena.logs import load_battle_records
+ from genarena.sync.packer import (
+     TempPackingContext,
+     pack_exp_dir,
+     pack_directory,
+     IMAGE_EXTENSIONS,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Default official submissions repository
+ DEFAULT_OFFICIAL_REPO = "genarena/submissions"
+
+ # URL to fetch the official models list
+ OFFICIAL_MODELS_URL = (
+     "https://raw.githubusercontent.com/genarena/submissions/main/official_models.json"
+ )
+
+
+ @dataclass
+ class ValidationResult:
+     """Result of local submission validation."""
+
+     valid: bool
+     exp_name: str
+     subset: str
+     models: list[str] = field(default_factory=list)
+     new_models: list[str] = field(default_factory=list)
+     existing_models: list[str] = field(default_factory=list)
+     total_battles: int = 0
+     battles_per_pair: dict[str, int] = field(default_factory=dict)
+     elo_ratings: dict[str, float] = field(default_factory=dict)
+     elo_ci: dict[str, tuple[float, float]] = field(default_factory=dict)
+     evaluation_config: dict[str, Any] = field(default_factory=dict)
+     errors: list[str] = field(default_factory=list)
+     warnings: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class UploadResult:
+     """Result of HuggingFace upload."""
+
+     hf_repo: str
+     hf_revision: str
+     models_zip_path: str
+     models_zip_sha256: str
+     models_zip_size: int
+     pk_logs_zip_path: str
+     pk_logs_zip_sha256: str
+     pk_logs_zip_size: int
+
+
+ def fetch_official_models(subset: str, timeout: int = 10) -> set[str]:
+     """
+     Fetch the official models list from GitHub.
+
+     Args:
+         subset: Subset name to get models for
+         timeout: Request timeout in seconds
+
+     Returns:
+         Set of official model names for the subset
+     """
+     import urllib.request
+     import urllib.error
+
+     try:
+         with urllib.request.urlopen(OFFICIAL_MODELS_URL, timeout=timeout) as resp:
+             data = json.load(resp)
+             return set(data.get("subsets", {}).get(subset, {}).get("models", []))
+     except urllib.error.URLError as e:
+         logger.warning(f"Failed to fetch official models list: {e}")
+         return set()
+     except json.JSONDecodeError as e:
+         logger.warning(f"Failed to parse official models list: {e}")
+         return set()
+     except Exception as e:
+         logger.warning(f"Unexpected error fetching official models: {e}")
+         return set()
+
+
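# Shape of the official_models.json payload the fetcher above parses (field
# names taken from the code above; the subset and model names here are
# hypothetical):
#
#   {"subsets": {"mmrb2": {"models": ["model-x", "model-y"]}}}
#
# A fetch then reduces to a plain set, and any network or parse failure
# degrades to an empty set instead of raising:
official = fetch_official_models("mmrb2")  # -> {"model-x", "model-y"} on success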
+ def _load_experiment_config(exp_dir: str) -> dict[str, Any]:
+     """Load experiment configuration from config.json."""
+     config_path = os.path.join(exp_dir, "config.json")
+     if not os.path.isfile(config_path):
+         return {}
+     try:
+         with open(config_path, "r", encoding="utf-8") as f:
+             return json.load(f)
+     except (json.JSONDecodeError, IOError):
+         return {}
+
+
+ def validate_local_submission(
+     arena_dir: str,
+     subset: str,
+     exp_name: str,
+     skip_official_check: bool = False,
+ ) -> ValidationResult:
+     """
+     Validate local submission data.
+
+     Checks:
+     1. exp_name format (_yyyymmdd suffix)
+     2. pk_logs directory exists and has battle records
+     3. models directory exists and has model outputs
+     4. All models in battles have corresponding outputs
+     5. At least one model is new (not in official leaderboard)
+
+     Args:
+         arena_dir: Arena directory path
+         subset: Subset name
+         exp_name: Experiment name
+         skip_official_check: Skip checking against official models (for testing)
+
+     Returns:
+         ValidationResult with validation status and details
+     """
+     errors: list[str] = []
+     warnings: list[str] = []
+
+     # Check exp_name format
+     if not is_valid_exp_name(exp_name):
+         errors.append(
+             f"Invalid exp_name format: '{exp_name}' must end with _yyyymmdd"
+         )
+
+     # Check paths exist
+     pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
+     exp_dir = os.path.join(pk_logs_dir, exp_name)
+     models_root = os.path.join(arena_dir, subset, "models")
+     exp_models_dir = os.path.join(models_root, exp_name)
+
+     if not os.path.isdir(exp_dir):
+         errors.append(f"pk_logs directory not found: {exp_dir}")
+
+     if not os.path.isdir(exp_models_dir):
+         errors.append(f"models directory not found: {exp_models_dir}")
+
+     if errors:
+         return ValidationResult(
+             valid=False,
+             exp_name=exp_name,
+             subset=subset,
+             errors=errors,
+             warnings=warnings,
+         )
+
+     # Load battle records
+     records = load_battle_records(pk_logs_dir, exp_name=exp_name)
+     if not records:
+         errors.append("No battle records found in pk_logs")
+         return ValidationResult(
+             valid=False,
+             exp_name=exp_name,
+             subset=subset,
+             errors=errors,
+             warnings=warnings,
+         )
+
+     # Extract models and battle statistics
+     models: set[str] = set()
+     battles_per_pair: dict[str, int] = {}
+
+     for r in records:
+         model_a = r.get("model_a", "")
+         model_b = r.get("model_b", "")
+         if model_a and model_b:
+             models.add(model_a)
+             models.add(model_b)
+             # Ensure consistent pair key (sorted)
+             pair_key = f"{min(model_a, model_b)}_vs_{max(model_a, model_b)}"
+             battles_per_pair[pair_key] = battles_per_pair.get(pair_key, 0) + 1
+
+     models_list = sorted(models)
+
+     # Check model outputs exist
+     for model in models_list:
+         model_dir = os.path.join(exp_models_dir, model)
+         if not os.path.isdir(model_dir):
+             errors.append(f"Model output directory not found: {model_dir}")
+         else:
+             # Check if there are any images
+             has_images = False
+             for f in os.listdir(model_dir):
+                 ext = os.path.splitext(f)[1].lower()
+                 if ext in IMAGE_EXTENSIONS:
+                     has_images = True
+                     break
+             if not has_images:
+                 errors.append(f"No image files found in model directory: {model_dir}")
+
+     # Check against official models. Note that a fetch failure yields an
+     # empty official set, so every model is then treated as new.
+     if not skip_official_check:
+         official_models = fetch_official_models(subset)
+         new_models = [m for m in models_list if m not in official_models]
+         existing_models = [m for m in models_list if m in official_models]
+
+         if not new_models:
+             errors.append(
+                 "No new models found. All models already exist in official leaderboard. "
+                 "Submissions must include at least one new model."
+             )
+     else:
+         new_models = models_list
+         existing_models = []
+         warnings.append("Skipped official models check (--skip-official-check)")
+
+     # Calculate ELO (only if no critical errors so far)
+     elo_ratings: dict[str, float] = {}
+     elo_ci: dict[str, tuple[float, float]] = {}
+
+     if not errors:
+         try:
+             from genarena.bt_elo import compute_bootstrap_bt_elo
+
+             battles = [
+                 (r["model_a"], r["model_b"], r["final_winner"])
+                 for r in records
+                 if r.get("model_a") and r.get("model_b") and r.get("final_winner")
+             ]
+
+             if battles:
+                 bt_result = compute_bootstrap_bt_elo(battles, num_bootstrap=100)
+                 elo_ratings = bt_result.ratings
+                 for model in models_list:
+                     if model in bt_result.ci_lower and model in bt_result.ci_upper:
+                         elo_ci[model] = (
+                             bt_result.ci_lower[model],
+                             bt_result.ci_upper[model],
+                         )
+         except Exception as e:
+             warnings.append(f"Failed to calculate ELO: {e}")
+
+     # Load evaluation config
+     evaluation_config = _load_experiment_config(exp_dir)
+
+     return ValidationResult(
+         valid=len(errors) == 0,
+         exp_name=exp_name,
+         subset=subset,
+         models=models_list,
+         new_models=new_models,
+         existing_models=existing_models,
+         total_battles=len(records),
+         battles_per_pair=battles_per_pair,
+         elo_ratings=elo_ratings,
+         elo_ci=elo_ci,
+         evaluation_config=evaluation_config,
+         errors=errors,
+         warnings=warnings,
+     )
+
+
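# A minimal end-use sketch of the validator; the arena path, subset, and
# experiment name below are hypothetical, not part of the package:
validation = validate_local_submission(
    arena_dir="./arena",
    subset="mmrb2",
    exp_name="my_models_20250101",  # must end with _yyyymmdd
    skip_official_check=True,  # offline run; skips the GitHub fetch
)
print_validation_summary(validation)  # defined later in this module
if not validation.valid:
    raise SystemExit(1)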
+ def upload_submission_data(
+     arena_dir: str,
+     subset: str,
+     exp_name: str,
+     hf_repo: str,
+     hf_revision: str = "main",
+     show_progress: bool = True,
+ ) -> UploadResult:
+     """
+     Pack and upload submission data to HuggingFace.
+
+     Args:
+         arena_dir: Arena directory path
+         subset: Subset name
+         exp_name: Experiment name
+         hf_repo: HuggingFace repository ID (e.g., "username/repo-name")
+         hf_revision: Repository revision/branch (default: "main")
+         show_progress: Show upload progress
+
+     Returns:
+         UploadResult with upload details
+
+     Raises:
+         RuntimeError: If upload fails
+     """
+     from huggingface_hub import HfApi
+
+     api = HfApi()
+
+     # Paths
+     exp_models_dir = os.path.join(arena_dir, subset, "models", exp_name)
+     exp_dir = os.path.join(arena_dir, subset, "pk_logs", exp_name)
+
+     with TempPackingContext() as ctx:
+         # Pack models
+         models_zip_path = ctx.get_temp_zip_path(f"{subset}/models/{exp_name}.zip")
+         success, msg = pack_directory(
+             exp_models_dir, models_zip_path, file_extensions=IMAGE_EXTENSIONS
+         )
+         if not success:
+             raise RuntimeError(f"Failed to pack models: {msg}")
+
+         # Calculate SHA256 for models (reads the whole ZIP into memory)
+         with open(models_zip_path, "rb") as f:
+             models_sha256 = hashlib.sha256(f.read()).hexdigest()
+         models_size = os.path.getsize(models_zip_path)
+
+         # Pack pk_logs
+         logs_zip_path = ctx.get_temp_zip_path(f"{subset}/pk_logs/{exp_name}.zip")
+         success, msg = pack_exp_dir(exp_dir, logs_zip_path)
+         if not success:
+             raise RuntimeError(f"Failed to pack pk_logs: {msg}")
+
+         # Calculate SHA256 for logs
+         with open(logs_zip_path, "rb") as f:
+             logs_sha256 = hashlib.sha256(f.read()).hexdigest()
+         logs_size = os.path.getsize(logs_zip_path)
+
+         # Upload to HF
+         hf_models_path = f"{subset}/models/{exp_name}.zip"
+         hf_logs_path = f"{subset}/pk_logs/{exp_name}.zip"
+
+         logger.info(f"Uploading models ZIP ({models_size / 1024 / 1024:.1f} MB)...")
+         api.upload_file(
+             path_or_fileobj=models_zip_path,
+             path_in_repo=hf_models_path,
+             repo_id=hf_repo,
+             repo_type="dataset",
+             revision=hf_revision,
+         )
+
+         logger.info(f"Uploading pk_logs ZIP ({logs_size / 1024 / 1024:.1f} MB)...")
+         api.upload_file(
+             path_or_fileobj=logs_zip_path,
+             path_in_repo=hf_logs_path,
+             repo_id=hf_repo,
+             repo_type="dataset",
+             revision=hf_revision,
+         )
+
+         return UploadResult(
+             hf_repo=hf_repo,
+             hf_revision=hf_revision,
+             models_zip_path=hf_models_path,
+             models_zip_sha256=models_sha256,
+             models_zip_size=models_size,
+             pk_logs_zip_path=hf_logs_path,
+             pk_logs_zip_sha256=logs_sha256,
+             pk_logs_zip_size=logs_size,
+         )
+
+
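# The two digests above read each ZIP fully into memory with a single
# read() call. If submission archives grow large, the standard chunked
# hashlib pattern keeps memory flat; a sketch with a hypothetical helper,
# not something upload_submission_data does today:
def _sha256_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash a file in 1 MiB chunks instead of one read() call."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()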
+ def create_submission_metadata(
+     validation: ValidationResult,
+     upload: UploadResult,
+     github_username: str,
+     title: str = "",
+     description: str = "",
+     contact: str = "",
+ ) -> dict[str, Any]:
+     """
+     Create submission metadata JSON.
+
+     Args:
+         validation: ValidationResult from validate_local_submission
+         upload: UploadResult from upload_submission_data
+         github_username: GitHub username of submitter
+         title: Submission title
+         description: Submission description
+         contact: Optional contact email
+
+     Returns:
+         Submission metadata dictionary
+     """
+     # Generate submission ID
+     timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
+     hash_input = f"{timestamp}{validation.exp_name}{github_username}"
+     short_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:8]
+     submission_id = f"sub_{timestamp}_{short_hash}"
+
+     # Build submitter info
+     submitter: dict[str, str] = {"github_username": github_username}
+     if contact:
+         submitter["contact"] = contact
+
+     # Build evaluation config (extract key fields)
+     eval_config = validation.evaluation_config
+     evaluation_config_summary = {
+         "judge_model": eval_config.get("judge_model", "unknown"),
+         "prompt_module": eval_config.get("prompt", "unknown"),
+         "temperature": eval_config.get("temperature", 0.0),
+         "position_debiasing": True,  # Always true in genarena
+     }
+
+     # Build model pairs list (pair keys are already sorted, "a_vs_b")
+     model_pairs = [
+         [min(p.split("_vs_")[0], p.split("_vs_")[1]),
+          max(p.split("_vs_")[0], p.split("_vs_")[1])]
+         for p in validation.battles_per_pair.keys()
+     ]
+
+     return {
+         "schema_version": "1.0",
+         "submission_id": submission_id,
+         "created_at": datetime.now(timezone.utc).isoformat(),
+         "submitter": submitter,
+         "experiment": {
+             "exp_name": validation.exp_name,
+             "subset": validation.subset,
+             "models": validation.models,
+             "new_models": validation.new_models,
+             "existing_models": validation.existing_models,
+             "model_pairs": model_pairs,
+             "total_battles": validation.total_battles,
+             "battles_per_pair": validation.battles_per_pair,
+         },
+         "data_location": {
+             "hf_repo_id": upload.hf_repo,
+             "hf_revision": upload.hf_revision,
+             "files": {
+                 "models_zip": {
+                     "path": upload.models_zip_path,
+                     "sha256": upload.models_zip_sha256,
+                     "size_bytes": upload.models_zip_size,
+                 },
+                 "pk_logs_zip": {
+                     "path": upload.pk_logs_zip_path,
+                     "sha256": upload.pk_logs_zip_sha256,
+                     "size_bytes": upload.pk_logs_zip_size,
+                 },
+             },
+         },
+         "elo_preview": {
+             "ratings": validation.elo_ratings,
+             "ci_95": {m: list(ci) for m, ci in validation.elo_ci.items()},
+         },
+         "evaluation_config": evaluation_config_summary,
+         "title": title or f"Submit {validation.exp_name}",
+         "description": description,
+         "verification": {
+             "local_validation_passed": validation.valid,
+             "genarena_version": __version__,
+         },
+     }
+
+
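# Continuing the sketches above: writing the metadata to disk is handy for
# eyeballing a submission before opening the PR (names are hypothetical):
metadata = create_submission_metadata(
    validation, upload, github_username="octocat", title="Add model-x"
)
with open(f"{metadata['submission_id']}.json", "w", encoding="utf-8") as fp:
    json.dump(metadata, fp, indent=2, ensure_ascii=False)
# A submission_id looks like "sub_20250101T120000_ab12cd34": a UTC timestamp
# plus the first 8 hex chars of a SHA-256 over timestamp + exp_name + user.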
+ def _get_github_username() -> Optional[str]:
+     """Get GitHub username from gh CLI."""
+     try:
+         result = subprocess.run(
+             ["gh", "api", "user", "-q", ".login"],
+             capture_output=True,
+             text=True,
+             timeout=30,
+         )
+         if result.returncode == 0:
+             return result.stdout.strip()
+     except (subprocess.TimeoutExpired, FileNotFoundError):
+         pass
+     return None
+
+
+ def _check_gh_cli() -> tuple[bool, str]:
+     """Check if GitHub CLI is available and authenticated."""
+     try:
+         # Check if gh is installed
+         result = subprocess.run(
+             ["gh", "--version"],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode != 0:
+             return False, "GitHub CLI (gh) is not installed"
+
+         # Check if authenticated
+         result = subprocess.run(
+             ["gh", "auth", "status"],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode != 0:
+             return False, "GitHub CLI is not authenticated. Run 'gh auth login' first."
+
+         return True, "GitHub CLI is ready"
+     except FileNotFoundError:
+         return False, "GitHub CLI (gh) is not installed. Install it from https://cli.github.com"
+     except subprocess.TimeoutExpired:
+         return False, "GitHub CLI timed out"
+
+
+ def _generate_pr_body(submission: dict[str, Any]) -> str:
+     """Generate PR description body."""
+     exp = submission["experiment"]
+     elo = submission["elo_preview"]["ratings"]
+     eval_config = submission["evaluation_config"]
+
+     body = f"""## Submission Details
+
+ **Experiment:** `{exp['exp_name']}`
+ **Subset:** `{exp['subset']}`
+ **New Models:** {', '.join(f'`{m}`' for m in exp['new_models']) or 'None'}
+ **Total Battles:** {exp['total_battles']:,}
+ **Model Pairs:** {len(exp['model_pairs'])}
+
+ ### Evaluation Configuration
+
+ | Setting | Value |
+ |---------|-------|
+ | Judge Model | `{eval_config.get('judge_model', 'N/A')}` |
+ | Prompt Module | `{eval_config.get('prompt_module', 'N/A')}` |
+ | Temperature | {eval_config.get('temperature', 'N/A')} |
+ | Position Debiasing | {'Yes' if eval_config.get('position_debiasing') else 'No'} |
+
+ ### ELO Preview
+
+ | Model | ELO | 95% CI |
+ |-------|-----|--------|
+ """
+     ci_data = submission["elo_preview"].get("ci_95", {})
+     for model in sorted(elo.keys(), key=lambda m: -elo[m]):
+         ci = ci_data.get(model, [None, None])
+         ci_str = f"[{ci[0]:.1f}, {ci[1]:.1f}]" if ci[0] is not None else "N/A"
+         body += f"| {model} | {elo[model]:.1f} | {ci_str} |\n"
+
+     body += f"""
+ ### Data Location
+
+ - **HuggingFace Repo:** `{submission['data_location']['hf_repo_id']}`
+ - **Models ZIP:** `{submission['data_location']['files']['models_zip']['path']}`
+   - SHA256: `{submission['data_location']['files']['models_zip']['sha256'][:16]}...`
+   - Size: {submission['data_location']['files']['models_zip']['size_bytes'] / 1024 / 1024:.1f} MB
+ - **Logs ZIP:** `{submission['data_location']['files']['pk_logs_zip']['path']}`
+   - SHA256: `{submission['data_location']['files']['pk_logs_zip']['sha256'][:16]}...`
+   - Size: {submission['data_location']['files']['pk_logs_zip']['size_bytes'] / 1024:.1f} KB
+
+ ### Description
+
+ {submission.get('description') or submission.get('title', 'No description provided.')}
+
+ ---
+ *Submitted via genarena v{submission['verification']['genarena_version']}*
+ """
+     return body
+
+
+ def create_submission_pr(
+     submission: dict[str, Any],
+     official_repo: str = DEFAULT_OFFICIAL_REPO,
+     title: Optional[str] = None,
+ ) -> str:
+     """
+     Fork the official repo and create a PR with the submission.
+
+     Args:
+         submission: Submission metadata dictionary
+         official_repo: Official submissions repository (default: genarena/submissions)
+         title: PR title (optional, auto-generated if not provided)
+
+     Returns:
+         PR URL
+
+     Raises:
+         RuntimeError: If PR creation fails
+     """
+     submission_id = submission["submission_id"]
+     filename = f"{submission_id}.json"
+
+     # Get GitHub username
+     gh_username = _get_github_username()
+     if not gh_username:
+         raise RuntimeError("Failed to get GitHub username. Ensure gh CLI is authenticated.")
+
+     # Fork the repo (idempotent - won't fail if already forked)
+     logger.info(f"Forking {official_repo}...")
+     subprocess.run(
+         ["gh", "repo", "fork", official_repo, "--clone=false"],
+         capture_output=True,
+         text=True,
+         timeout=60,
+     )
+     # Note: fork may "fail" if already forked, but that's OK
+
+     # Clone forked repo to temp directory
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # The fork keeps the upstream repo name under the user's account
+         fork_repo = f"{gh_username}/{official_repo.split('/')[-1]}"
+         logger.info(f"Cloning {fork_repo}...")
+
+         result = subprocess.run(
+             ["gh", "repo", "clone", fork_repo, tmpdir],
+             capture_output=True,
+             text=True,
+             timeout=120,
+         )
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to clone fork: {result.stderr}")
+
+         # Sync with upstream
+         logger.info("Syncing with upstream...")
+         subprocess.run(
+             ["gh", "repo", "sync", fork_repo, "--source", official_repo],
+             capture_output=True,
+             text=True,
+             timeout=60,
+         )
+
+         # Pull latest changes
+         subprocess.run(
+             ["git", "pull", "origin", "main"],
+             cwd=tmpdir,
+             capture_output=True,
+             text=True,
+             timeout=60,
+         )
+
+         # Create branch
+         branch_name = f"submit/{submission_id}"
+         logger.info(f"Creating branch {branch_name}...")
+         result = subprocess.run(
+             ["git", "checkout", "-b", branch_name],
+             cwd=tmpdir,
+             capture_output=True,
+             text=True,
+         )
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to create branch: {result.stderr}")
+
+         # Write submission file
+         submissions_dir = os.path.join(tmpdir, "submissions", "pending")
+         os.makedirs(submissions_dir, exist_ok=True)
+         submission_path = os.path.join(submissions_dir, filename)
+
+         with open(submission_path, "w", encoding="utf-8") as f:
+             json.dump(submission, f, indent=2, ensure_ascii=False)
+
+         # Commit
+         logger.info("Committing submission...")
+         subprocess.run(["git", "add", "."], cwd=tmpdir, check=True)
+
+         commit_msg = title or f"Submit {submission['experiment']['exp_name']}"
+         result = subprocess.run(
+             ["git", "commit", "-m", commit_msg],
+             cwd=tmpdir,
+             capture_output=True,
+             text=True,
+         )
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to commit: {result.stderr}")
+
+         # Push
+         logger.info("Pushing to fork...")
+         result = subprocess.run(
+             ["git", "push", "-u", "origin", branch_name],
+             cwd=tmpdir,
+             capture_output=True,
+             text=True,
+             timeout=120,
+         )
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to push: {result.stderr}")
+
+         # Create PR
+         logger.info("Creating PR...")
+         pr_title = title or f"[Submission] {submission['experiment']['exp_name']}"
+         pr_body = _generate_pr_body(submission)
+
+         result = subprocess.run(
+             [
+                 "gh", "pr", "create",
+                 "--repo", official_repo,
+                 "--head", f"{gh_username}:{branch_name}",
+                 "--title", pr_title,
+                 "--body", pr_body,
+             ],
+             capture_output=True,
+             text=True,
+             timeout=60,
+         )
+
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to create PR: {result.stderr}")
+
+         pr_url = result.stdout.strip()
+         return pr_url
+
+
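# Putting the pieces together: the four-step flow from the module docstring,
# as a hedged end-to-end sketch (paths and repo names are hypothetical):
ok, msg = _check_gh_cli()
if not ok:
    raise SystemExit(msg)

validation = validate_local_submission("./arena", "mmrb2", "my_models_20250101")
print_validation_summary(validation)
if validation.valid:
    upload = upload_submission_data(
        "./arena", "mmrb2", "my_models_20250101", hf_repo="octocat/genarena-data"
    )
    metadata = create_submission_metadata(
        validation, upload, github_username=_get_github_username() or "octocat"
    )
    print(create_submission_pr(metadata))  # prints the PR URL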
+ def print_validation_summary(validation: ValidationResult) -> None:
+     """Print validation summary to console."""
+     print("\nValidation Results:")
+     print("-" * 40)
+
+     if validation.valid:
+         print("Status: PASSED")
+     else:
+         print("Status: FAILED")
+
+     print(f"\nExperiment: {validation.exp_name}")
+     print(f"Subset: {validation.subset}")
+     print(f"Models: {len(validation.models)}")
+     print(f"  New models: {', '.join(validation.new_models) or 'None'}")
+     print(f"  Existing models: {', '.join(validation.existing_models) or 'None'}")
+     print(f"Total battles: {validation.total_battles:,}")
+     print(f"Model pairs: {len(validation.battles_per_pair)}")
+
+     if validation.elo_ratings:
+         print("\nELO Preview:")
+         sorted_models = sorted(
+             validation.elo_ratings.keys(),
+             key=lambda m: -validation.elo_ratings[m]
+         )
+         for model in sorted_models:
+             elo = validation.elo_ratings[model]
+             ci = validation.elo_ci.get(model)
+             ci_str = f" [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
+             new_marker = " (new)" if model in validation.new_models else ""
+             print(f"  {model}: {elo:.1f}{ci_str}{new_marker}")
+
+     if validation.evaluation_config:
+         config = validation.evaluation_config
+         print("\nEvaluation Config:")
+         print(f"  Judge model: {config.get('judge_model', 'N/A')}")
+         print(f"  Prompt: {config.get('prompt', 'N/A')}")
+         print(f"  Temperature: {config.get('temperature', 'N/A')}")
+
+     if validation.warnings:
+         print("\nWarnings:")
+         for w in validation.warnings:
+             print(f"  - {w}")
+
+     if validation.errors:
+         print("\nErrors:")
+         for e in validation.errors:
+             print(f"  - {e}")
+
+     print()
+
+
+ def generate_official_models_json(
+     arena_dir: str,
+     output_path: Optional[str] = None,
+ ) -> dict[str, Any]:
+     """
+     Generate official_models.json from arena state files.
+
+     This function scans all subsets in the arena directory and extracts
+     the list of models from each subset's state.json file.
+
+     Args:
+         arena_dir: Path to the official arena directory
+         output_path: Optional path to write the JSON file
+
+     Returns:
+         The official_models.json content as a dictionary
+     """
+     from genarena.sync.packer import discover_subsets
+     from genarena.state import load_state
+
+     result: dict[str, Any] = {
+         "last_updated": datetime.now(timezone.utc).isoformat(),
+         "description": "List of models currently on the official GenArena leaderboard",
+         "subsets": {},
+     }
+
+     # Discover all subsets
+     subsets = discover_subsets(arena_dir)
+
+     for subset in subsets:
+         state_path = os.path.join(arena_dir, subset, "arena", "state.json")
+         if not os.path.isfile(state_path):
+             continue
+
+         state = load_state(state_path)
+         if not state.models:
+             continue
+
+         # Get sorted list of model names
+         models = sorted(state.models.keys())
+
+         result["subsets"][subset] = {
+             "models": models,
+             "model_count": len(models),
+             "total_battles": state.total_battles,
+         }
+
+     # Write to file if output_path specified
+     if output_path:
+         os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+         with open(output_path, "w", encoding="utf-8") as f:
+             json.dump(result, f, indent=2, ensure_ascii=False)
+         logger.info(f"Wrote official_models.json to {output_path}")
+
+     return result
+
+
+ def print_official_models_summary(data: dict[str, Any]) -> None:
+     """Print summary of official models."""
+     print("\n=== Official Models ===\n")
+     print(f"Last Updated: {data.get('last_updated', 'N/A')}")
+     print()
+
+     subsets = data.get("subsets", {})
+     if not subsets:
+         print("No subsets found.")
+         return
+
+     for subset, info in sorted(subsets.items()):
+         models = info.get("models", [])
+         print(f"Subset: {subset}")
+         print(f"  Models ({len(models)}):")
+         for model in models:
+             print(f"    - {model}")
+         print(f"  Total Battles: {info.get('total_battles', 0):,}")
+         print()
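# Maintainer-side counterpart of fetch_official_models: regenerate and review
# the published model list from a local arena checkout (path hypothetical):
data = generate_official_models_json("./arena", output_path="official_models.json")
print_official_models_summary(data)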