genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/cli.py ADDED
@@ -0,0 +1,1581 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """CLI entry point for genarena."""
6
+
7
+ import argparse
8
+ import json
9
+ import logging
10
+ import os
11
+ import sys
12
+ from typing import Optional
13
+
14
+ from genarena import __version__
15
+ from genarena.arena import Arena, ArenaConfig, get_all_subsets_status
16
+ from genarena.data import discover_subsets
17
+ from genarena.leaderboard import print_leaderboard
18
+ from genarena.sampling import SamplingConfig
19
+ from genarena.state import load_state
20
+
21
+
22
def setup_logging(verbose: bool = False) -> None:
    """
    Set up logging for the CLI via logging.basicConfig.

    Args:
        verbose: When truthy, log at DEBUG level; otherwise log at INFO level
    """
    log_level = logging.INFO
    if verbose:
        log_level = logging.DEBUG

    # Timestamped "<time> [LEVEL] logger: message" records.
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=log_level,
    )
36
+
37
+
38
def parse_models(models_str: Optional[str]) -> Optional[list[str]]:
    """
    Split a comma-separated string into a list of trimmed, non-empty names.

    Args:
        models_str: Comma-separated model names or None

    Returns:
        None when the input is None or empty; otherwise the list of names
        with surrounding whitespace removed and blank entries dropped
        (which may be an empty list, e.g. for ",")
    """
    if not models_str:
        return None

    names = []
    for piece in models_str.split(","):
        cleaned = piece.strip()
        if cleaned:
            names.append(cleaned)
    return names
51
+
52
+
53
def cmd_run(args: argparse.Namespace) -> int:
    """
    Execute the 'run' subcommand.

    Runs the arena for the requested subset (or for every subset returned by
    discover_subsets when none is given), updates the leaderboard after each
    subset, and finally calls the auto commit/push helper for the arena
    directory.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 when no subsets are found or a subset
        raises an exception)
    """
    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    # Discover subsets if not specified
    if args.subset:
        subsets = [args.subset]
    else:
        subsets = discover_subsets(args.data_dir)
        if not subsets:
            logger.error(f"No subsets found in {args.data_dir}")
            return 1
        logger.info(f"Discovered subsets: {subsets}")

    # Parse models
    models = parse_models(args.models)

    # Build sampling configuration
    if args.sampling_mode == "full":
        sampling_config = SamplingConfig.full_mode(sample_size=args.sample_size)
    else:
        sampling_config = SamplingConfig.adaptive_mode(
            target_ci_width=args.target_ci_width,
            min_samples=args.min_samples,
            max_samples=args.max_samples,
        )
        # Override milestone_min_samples if provided. Compare against None
        # (the flag's "not given" value) instead of relying on truthiness so
        # that an explicit 0 is honoured rather than silently ignored.
        # NOTE(review): nesting under the adaptive branch is assumed from the
        # statement layout — confirm against the packaged file.
        if args.milestone_min_samples is not None:
            sampling_config.milestone_min_samples = args.milestone_min_samples

    # Run arena for each subset
    for subset in subsets:
        logger.info(f"Running arena for subset: {subset}")

        config = ArenaConfig(
            arena_dir=args.arena_dir,
            data_dir=args.data_dir,
            subset=subset,
            models=models,
            exp_name=args.exp_name,
            sample_size=args.sample_size,
            num_threads=args.num_threads,
            num_processes=args.num_processes,
            parallel_swap_calls=args.parallel_swap_calls,
            enable_progress_bar=args.enable_progress_bar,
            sampling=sampling_config,
            judge_model=args.judge_model,
            temperature=args.temperature,
            prompt=args.prompt,
            timeout=args.timeout,
            max_retries=args.max_retries,
            base_urls=args.base_urls,
            api_keys=args.api_keys,
            enable_audit_log=not args.no_audit_log,
            clean_orphaned_logs=not args.no_clean_orphaned_logs,
            verbose=args.verbose,
        )

        try:
            arena = Arena(config)
            state = arena.run()
            arena.update_leaderboard()

            logger.info(
                f"Subset {subset} completed: "
                f"{state.total_battles} total battles, "
                f"{len(state.models)} models"
            )
        except Exception as e:
            # Top-level CLI boundary: report the failure and stop at the
            # first failing subset; the traceback is shown only with --verbose.
            logger.error(f"Error running arena for subset {subset}: {e}")
            if args.verbose:
                import traceback
                traceback.print_exc()
            return 1

    # Auto commit and push if Git is initialized
    from genarena.sync.auto_commit import auto_commit_and_push
    auto_commit_and_push(args.arena_dir, "run")

    return 0
142
+
143
+
144
def cmd_status(args: argparse.Namespace) -> int:
    """
    Execute the 'status' subcommand.

    Prints one summary per subset returned by get_all_subsets_status: model
    count (with up to three model names), total battles and last update.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (always 0)
    """
    setup_logging(getattr(args, 'verbose', False))

    # Get status for all subsets
    all_statuses = get_all_subsets_status(args.arena_dir, args.data_dir)
    if not all_statuses:
        print("No subsets found.")
        return 0

    print("\n=== Arena Status ===\n")
    print(f"Arena Directory: {args.arena_dir}")
    print(f"Data Directory: {args.data_dir}")
    print()

    for entry in all_statuses:
        print(f"Subset: {entry['subset']}")

        # Show at most three model names, with an ellipsis when more exist.
        total_models = entry['total_models']
        shown = ', '.join(entry['models'][:3])
        suffix = '...' if len(entry['models']) > 3 else ''
        print(f" Models: {total_models} ({shown}{suffix})")

        print(f" Total Battles: {entry['total_battles']}")
        print(f" Last Updated: {entry['last_updated'] or 'Never'}")
        print()

    return 0
176
+
177
+
178
def cmd_leaderboard(args: argparse.Namespace) -> int:
    """
    Execute the 'leaderboard' subcommand.

    Loads <arena_dir>/<subset>/arena/state.json and prints its leaderboard.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (1 when the state file does not exist, otherwise 0)
    """
    setup_logging(getattr(args, 'verbose', False))

    # Locate the state file for the requested subset
    state_file = os.path.join(args.arena_dir, args.subset, "arena", "state.json")
    state_exists = os.path.isfile(state_file)
    if not state_exists:
        print(f"No arena state found for subset '{args.subset}'.")
        print("Run battles first with: genarena run --arena_dir <path> --data_dir <path>")
        return 1

    arena_state = load_state(state_file)
    if not arena_state.models:
        print(f"No battles recorded for subset '{args.subset}'.")
        return 0

    # Print leaderboard
    print_leaderboard(arena_state, f"{args.subset.capitalize()} Leaderboard")
    return 0
209
+
210
+
211
def cmd_serve(args: argparse.Namespace) -> int:
    """
    Execute the 'serve' subcommand to start the visualization server.

    Both directories must exist before run_server is called.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (1 when a directory is missing, 0 once run_server returns)
    """
    # Deferred import so Flask is only loaded when this command is used
    from genarena.visualize.app import run_server

    # Validate directories
    arena_dir = args.arena_dir
    if not os.path.isdir(arena_dir):
        print(f"Error: Arena directory does not exist: {arena_dir}")
        return 1

    data_dir = args.data_dir
    if not os.path.isdir(data_dir):
        print(f"Error: Data directory does not exist: {data_dir}")
        return 1

    # Start server
    server_options = {
        "arena_dir": arena_dir,
        "data_dir": data_dir,
        "host": args.host,
        "port": args.port,
        "debug": args.debug,
    }
    run_server(**server_options)

    return 0
243
+
244
+
245
+ # === init command ===
246
+
247
def cmd_init(args: argparse.Namespace) -> int:
    """
    Execute the 'init' subcommand.

    Downloads benchmark data and arena data from official repositories by
    delegating to init_arena, then prints the message it returns.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 on conflicting options or failure)
    """
    setup_logging(getattr(args, "verbose", False))

    from genarena.sync.init_ops import init_arena

    # --data-only and --arena-only are mutually exclusive
    data_only = getattr(args, "data_only", False)
    arena_only = getattr(args, "arena_only", False)
    if data_only and arena_only:
        print("Error: --data-only and --arena-only cannot be used together")
        return 1

    # Parse subsets
    subsets = None
    if args.subsets:
        subsets = parse_models(args.subsets)

    print("=== GenArena Init ===\n")

    init_options = {
        "arena_dir": args.arena_dir,
        "data_dir": args.data_dir,
        "subsets": subsets,
        "benchmark_repo": args.benchmark_repo,
        "arena_repo": args.arena_repo,
        "revision": args.revision,
        "overwrite": getattr(args, "overwrite", False),
        "init_git": getattr(args, "git", False),
        "data_only": data_only,
        "arena_only": arena_only,
        "show_progress": True,
    }
    ok, report = init_arena(**init_options)

    print(report)
    if ok:
        return 0
    return 1
292
+
293
+
294
+ # === Git commands ===
295
+
296
def cmd_git_init(args: argparse.Namespace) -> int:
    """
    Execute the 'git init' subcommand.

    Does nothing beyond printing a notice when is_git_initialized already
    reports true for the arena directory.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    from genarena.sync.git_ops import git_init, is_git_initialized

    arena_dir = args.arena_dir
    if is_git_initialized(arena_dir):
        print(f"Git repository already initialized in {arena_dir}")
        return 0

    ok, report = git_init(arena_dir)
    print(report)
    if ok:
        return 0
    return 1
315
+
316
+
317
def cmd_git_commit(args: argparse.Namespace) -> int:
    """
    Execute the 'git commit' subcommand.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 when Git is not initialized or the
        commit reports failure)
    """
    from genarena.sync.git_ops import git_commit, is_git_initialized

    arena_dir = args.arena_dir
    if not is_git_initialized(arena_dir):
        print(f"Error: Git repository not initialized in {arena_dir}")
        print("Run 'genarena git init --arena_dir <path>' first.")
        return 1

    # A missing or empty message is passed on as None
    commit_message = getattr(args, "message", None) or None
    ok, report = git_commit(arena_dir, message=commit_message)
    print(report)
    if ok:
        return 0
    return 1
338
+
339
+
340
def cmd_git_remote(args: argparse.Namespace) -> int:
    """
    Execute the 'git remote' subcommand.

    Without a URL the current 'origin' remote is printed; with a URL the
    remote is configured through git_remote_add.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    from genarena.sync.git_ops import git_remote_add, git_remote_get_url, is_git_initialized

    arena_dir = args.arena_dir
    if not is_git_initialized(arena_dir):
        print(f"Error: Git repository not initialized in {arena_dir}")
        print("Run 'genarena git init --arena_dir <path>' first.")
        return 1

    if args.url:
        ok, report = git_remote_add(
            arena_dir,
            args.url,
            force=getattr(args, "force", False),
        )
        print(report)
        if ok:
            return 0
        return 1

    # If no URL provided, just show current remote
    current_url = git_remote_get_url(arena_dir)
    if current_url:
        print(f"Remote 'origin': {current_url}")
    else:
        print("No remote configured.")
    return 0
370
+
371
+
372
def cmd_git_push(args: argparse.Namespace) -> int:
    """
    Execute the 'git push' subcommand.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 when Git is not initialized or the push
        reports failure)
    """
    from genarena.sync.git_ops import git_push, is_git_initialized

    arena_dir = args.arena_dir
    if not is_git_initialized(arena_dir):
        print(f"Error: Git repository not initialized in {arena_dir}")
        print("Run 'genarena git init --arena_dir <path>' first.")
        return 1

    ok, report = git_push(arena_dir)
    print(report)
    if ok:
        return 0
    return 1
392
+
393
+
394
def cmd_git_sync(args: argparse.Namespace) -> int:
    """
    Execute the 'git sync' subcommand (commit + push).

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 when Git is not initialized or the sync
        reports failure)
    """
    from genarena.sync.git_ops import git_sync, is_git_initialized

    arena_dir = args.arena_dir
    if not is_git_initialized(arena_dir):
        print(f"Error: Git repository not initialized in {arena_dir}")
        print("Run 'genarena git init --arena_dir <path>' first.")
        return 1

    ok, report = git_sync(arena_dir)
    print(report)
    if ok:
        return 0
    return 1
414
+
415
+
416
+ # === Huggingface commands ===
417
+
418
def cmd_hf_upload(args: argparse.Namespace) -> int:
    """
    Execute the 'hf upload' subcommand.

    Echoes the active filters, then delegates to upload_arena_data and
    prints the message it returns.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    setup_logging(getattr(args, "verbose", False))

    from genarena.sync.hf_ops import upload_arena_data

    def _optional_list(attr_name):
        # Missing or empty attribute -> None, otherwise the parsed name list
        raw_value = getattr(args, attr_name, None)
        return parse_models(raw_value) if raw_value else None

    # Parse filters
    subsets = _optional_list("subsets")
    models = _optional_list("models")
    experiments = _optional_list("experiments")
    overwrite = getattr(args, "overwrite", False)
    max_retries = getattr(args, "max_retries", 3)

    print(f"Uploading arena data to {args.repo_id}...")
    active_filters = (
        ("Subsets", subsets),
        ("Models", models),
        ("Experiments", experiments),
    )
    for label, values in active_filters:
        if values:
            print(f" {label}: {', '.join(values)}")
    print(f" Max retries per file: {max_retries}")
    print()

    ok, report = upload_arena_data(
        arena_dir=args.arena_dir,
        repo_id=args.repo_id,
        subsets=subsets,
        models=models,
        experiments=experiments,
        overwrite=overwrite,
        show_progress=True,
        max_retries=max_retries,
    )

    print()
    print(report)
    if ok:
        return 0
    return 1
463
+
464
+
465
def cmd_hf_pull(args: argparse.Namespace) -> int:
    """
    Execute the 'hf pull' subcommand.

    Echoes the active filters, then delegates to pull_arena_data and prints
    the message it returns. The revision falls back to "main" when it is
    missing or empty.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    setup_logging(getattr(args, "verbose", False))

    from genarena.sync.hf_ops import pull_arena_data

    def _optional_list(attr_name):
        # Missing or empty attribute -> None, otherwise the parsed name list
        raw_value = getattr(args, attr_name, None)
        return parse_models(raw_value) if raw_value else None

    # Parse filters
    subsets = _optional_list("subsets")
    models = _optional_list("models")
    experiments = _optional_list("experiments")
    overwrite = getattr(args, "overwrite", False)
    revision = getattr(args, "revision", None) or "main"

    print(f"Pulling arena data from {args.repo_id} (revision: {revision})...")
    active_filters = (
        ("Subsets", subsets),
        ("Models", models),
        ("Experiments", experiments),
    )
    for label, values in active_filters:
        if values:
            print(f" {label}: {', '.join(values)}")
    print()

    ok, report = pull_arena_data(
        arena_dir=args.arena_dir,
        repo_id=args.repo_id,
        subsets=subsets,
        models=models,
        experiments=experiments,
        revision=revision,
        overwrite=overwrite,
        show_progress=True,
    )

    print()
    print(report)
    if ok:
        return 0
    return 1
509
+
510
+
511
def cmd_hf_list(args: argparse.Namespace) -> int:
    """
    Execute the 'hf list' subcommand.

    Prints whatever list_repo_contents returns for the repository; the
    revision falls back to "main" when it is missing or empty.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    from genarena.sync.hf_ops import list_repo_contents

    revision = getattr(args, "revision", None) or "main"

    ok, listing = list_repo_contents(repo_id=args.repo_id, revision=revision)
    print(listing)
    if ok:
        return 0
    return 1
532
+
533
+
534
+ # === submit command ===
535
+
536
def cmd_submit(args: argparse.Namespace) -> int:
    """
    Execute the 'submit' subcommand.

    Steps: validate the local submission, (unless --dry-run) ask for
    confirmation, upload the data to HuggingFace, build the submission
    metadata and open a PR against the official repository.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, a completed dry run, or a cancelled
        submission; 1 on any failure)
    """
    verbose = getattr(args, "verbose", False)
    setup_logging(verbose)
    logger = logging.getLogger(__name__)

    from genarena.sync.submit import (
        validate_local_submission,
        upload_submission_data,
        create_submission_metadata,
        create_submission_pr,
        print_validation_summary,
        _check_gh_cli,
        _get_github_username,
        DEFAULT_OFFICIAL_REPO,
    )

    # Check GitHub CLI (unless dry-run)
    if not args.dry_run:
        gh_ok, gh_msg = _check_gh_cli()
        if not gh_ok:
            print(f"Error: {gh_msg}")
            return 1

    print("=== GenArena Submission ===\n")

    # Step 1: Validate local data
    print("Validating local data...")
    validation = validate_local_submission(
        arena_dir=args.arena_dir,
        subset=args.subset,
        exp_name=args.exp_name,
        skip_official_check=getattr(args, "skip_official_check", False),
    )

    print_validation_summary(validation)

    if not validation.valid:
        print("Validation failed. Please fix the errors above.")
        return 1

    # Dry run - stop here
    if args.dry_run:
        print("Dry run complete. No data was uploaded or PR created.")
        return 0

    # Confirm before proceeding
    if not args.yes:
        try:
            confirm = input("Proceed with submission? [y/N] ")
        except EOFError:
            # No interactive input available (closed/piped stdin): fall back
            # to the prompt's default answer "N" instead of crashing.
            print()
            confirm = ""
        # Accept "y"/"yes" in any case and ignore surrounding whitespace.
        if confirm.strip().lower() not in ("y", "yes"):
            print("Submission cancelled.")
            return 0

    # Step 2: Upload to HuggingFace
    print(f"\nUploading to HuggingFace ({args.hf_repo})...")
    try:
        upload = upload_submission_data(
            arena_dir=args.arena_dir,
            subset=args.subset,
            exp_name=args.exp_name,
            hf_repo=args.hf_repo,
        )
        print(f" Models ZIP: {upload.models_zip_size / 1024 / 1024:.1f} MB uploaded")
        print(f" Logs ZIP: {upload.pk_logs_zip_size / 1024:.1f} KB uploaded")
    except Exception as e:
        # Top-level CLI boundary: report and exit; traceback only if verbose
        print(f"Error uploading to HuggingFace: {e}")
        if verbose:
            import traceback
            traceback.print_exc()
        return 1

    # Step 3: Create submission metadata
    print("\nCreating submission metadata...")
    gh_username = _get_github_username()
    if not gh_username:
        print("Error: Failed to get GitHub username")
        return 1

    submission = create_submission_metadata(
        validation=validation,
        upload=upload,
        github_username=gh_username,
        title=args.title if hasattr(args, "title") and args.title else "",
        description=args.description if hasattr(args, "description") and args.description else "",
    )
    print(f" Submission ID: {submission['submission_id']}")

    # Step 4: Create PR
    print("\nCreating PR...")
    official_repo = getattr(args, "official_repo", None) or DEFAULT_OFFICIAL_REPO
    try:
        pr_url = create_submission_pr(
            submission=submission,
            official_repo=official_repo,
            title=args.title if hasattr(args, "title") else None,
        )
        print("\nSubmission created successfully!")
        print(f"PR URL: {pr_url}")
        print("\nNext steps:")
        print(" 1. The validation bot will automatically check your submission")
        print(" 2. A maintainer will review and merge if approved")
        print(" 3. Your models will appear on the official leaderboard after integration")
    except Exception as e:
        # Top-level CLI boundary: report and exit; traceback only if verbose
        print(f"Error creating PR: {e}")
        if verbose:
            import traceback
            traceback.print_exc()
        return 1

    return 0
653
+
654
+
655
def cmd_export_models(args: argparse.Namespace) -> int:
    """
    Execute the 'export-models' subcommand.

    Generates official_models.json from arena state. When no output path is
    given, the generated data is printed to stdout as indented JSON.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success, 1 when any step raises an exception)
    """
    from genarena.sync.submit import (
        generate_official_models_json,
        print_official_models_summary,
    )

    # A missing or empty --output value means "print JSON to stdout"
    destination = getattr(args, "output", None) or None

    try:
        # Generate official_models.json
        models_data = generate_official_models_json(
            arena_dir=args.arena_dir,
            output_path=destination,
        )

        # Print summary
        print_official_models_summary(models_data)

        if not destination:
            # Print JSON to stdout if no output file specified
            print("\n--- JSON Output ---\n")
            print(json.dumps(models_data, indent=2))
        else:
            print(f"Wrote to: {destination}")

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0
696
+
697
+
698
+ # === deploy commands ===
699
+
700
def cmd_deploy_upload(args: argparse.Namespace) -> int:
    """
    Execute the 'deploy upload' subcommand.

    Uploads arena data to HuggingFace for Spaces deployment.
    Parquet benchmark data is downloaded from rhli/genarena during Docker build.

    Prints the effective configuration, delegates to upload_for_deploy and
    prints the message it returns, followed by next steps on success.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (0 for success)
    """
    setup_logging(getattr(args, "verbose", False))

    from genarena.sync.deploy_ops import upload_for_deploy, DEFAULT_NUM_WORKERS, DEFAULT_WORKER_TIMEOUT

    subsets = None
    if args.subsets:
        subsets = parse_models(args.subsets)
    max_retries = getattr(args, "max_retries", 3)
    num_workers = getattr(args, "num_workers", DEFAULT_NUM_WORKERS)
    worker_timeout = getattr(args, "timeout", DEFAULT_WORKER_TIMEOUT)

    rule = "=" * 60

    # Configuration banner
    print(rule)
    print(" GenArena HuggingFace Spaces Deployment")
    print(rule)
    print(f" Arena dir: {args.arena_dir}")
    print(f" Arena repo: {args.arena_repo}")
    print(f" Space repo: {args.space_repo}")
    if subsets:
        print(f" Subsets: {', '.join(subsets)}")
    upload_mode = 'overwrite' if args.overwrite else 'incremental'
    print(f" Mode: {upload_mode}")
    print(f" Max retries: {max_retries}")
    print(f" Workers: {num_workers}")
    print(f" Timeout: {worker_timeout}s per worker")
    print()
    print(" Note: Parquet data is downloaded from rhli/genarena during build")
    print(rule)
    print()

    ok, report = upload_for_deploy(
        arena_dir=args.arena_dir,
        arena_repo=args.arena_repo,
        space_repo=args.space_repo,
        subsets=subsets,
        overwrite=args.overwrite,
        show_progress=True,
        max_retries=max_retries,
        num_workers=num_workers,
        worker_timeout=worker_timeout,
    )

    # Result banner
    print()
    print(rule)
    if ok:
        print(" Deployment upload completed successfully!")
    else:
        print(" Deployment upload failed!")
    print(rule)
    print()
    print(report)

    if not ok:
        return 1

    print()
    print("Next steps:")
    print(f" 1. Go to https://huggingface.co/spaces/{args.space_repo}")
    print(" 2. Set environment variable: HF_ARENA_REPO=" + args.arena_repo)
    print(" 3. The Space should automatically rebuild and deploy")
    return 0
769
+
770
+
771
def cmd_deploy_info(args: argparse.Namespace) -> int:
    """
    Execute the 'deploy info' subcommand.

    Prints a static help text with deployment configuration and
    instructions; the arguments are not read.

    Args:
        args: Parsed arguments

    Returns:
        Exit code (always 0)
    """
    # NOTE(review): leading indentation and column alignment inside this text
    # could not be recovered from the diff rendering — confirm against the
    # packaged file before applying.
    info_text = """
GenArena HuggingFace Spaces Deployment
======================================

OVERVIEW
--------
Deploy GenArena Explorer to HuggingFace Spaces for public access.

Data is split across repositories:
- Space repo: Contains Dockerfile, app code, and genarena package
- Arena repo: Contains battle logs and model output images (served via CDN)
- Data repo (rhli/genarena): Parquet benchmark data (downloaded during build)

PREREQUISITES
-------------
1. Create a HuggingFace account and get an API token
2. Set the HF_TOKEN environment variable:
export HF_TOKEN='your_token_here'

3. Create two repositories on HuggingFace:
- A Dataset repository for arena data (e.g., 'your-org/genarena-arena')
- A Space repository for deployment (e.g., 'your-org/genarena-explorer')
Select "Docker" as the SDK when creating the Space

USAGE
-----
Upload all data for deployment:

genarena deploy upload \\
--arena_dir ./arena \\
--arena_repo your-org/genarena-arena \\
--space_repo your-org/genarena-explorer

Options:
--subsets Comma-separated list of subsets to upload (default: all)
--overwrite Overwrite existing files (default: incremental upload)
--max-retries Max retry attempts per file (default: 3)

INCREMENTAL UPLOADS
-------------------
By default, the upload is incremental - existing files are skipped.
This allows you to resume interrupted uploads or add new data.

Use --overwrite to force re-upload of all files.

ENVIRONMENT VARIABLES
---------------------
The Space uses these environment variables:
- HF_ARENA_REPO: Arena data repository (required)
Set this in the Space settings after deployment.
- HF_DATA_REPO: Parquet data repository (default: rhli/genarena)
Override if using a different benchmark dataset.

DIRECTORY STRUCTURE
-------------------
Space repo:
├── Dockerfile # Docker build file (downloads rhli/genarena)
├── README.md # Space configuration
├── genarena/ # Python package
│ └── deploy/app.py # Startup script
└── pyproject.toml # Package config

Arena repo (Dataset):
└── <subset>/
├── pk_logs/ # Battle logs
├── models/ # Model output images
└── arena/ # State files

Data repo (rhli/genarena - downloaded at build time):
└── <subset>/
└── data-*.parquet # Benchmark prompts

For more information, see:
https://github.com/genarena/genarena
"""
    print(info_text)
    return 0
859
+
860
+
861
+ def create_parser() -> argparse.ArgumentParser:
862
+ """
863
+ Create the argument parser.
864
+
865
+ Returns:
866
+ Configured ArgumentParser
867
+ """
868
+ parser = argparse.ArgumentParser(
869
+ prog="genarena",
870
+ description="GenArena Arena Evaluation - VLM-based pairwise image generation evaluation"
871
+ )
872
+
873
+ parser.add_argument(
874
+ "--version",
875
+ action="version",
876
+ version=f"%(prog)s {__version__}"
877
+ )
878
+
879
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
880
+
881
+ # === run command ===
882
+ run_parser = subparsers.add_parser(
883
+ "run",
884
+ help="Run pairwise evaluation battles"
885
+ )
886
+
887
+ # Required arguments
888
+ run_parser.add_argument(
889
+ "--arena_dir",
890
+ required=True,
891
+ help="Arena directory path"
892
+ )
893
+ run_parser.add_argument(
894
+ "--data_dir",
895
+ required=True,
896
+ help="Parquet dataset directory path"
897
+ )
898
+
899
+ # Optional arguments
900
+ run_parser.add_argument(
901
+ "--subset",
902
+ default=None,
903
+ help="Subset name (default: process all subsets)"
904
+ )
905
+ run_parser.add_argument(
906
+ "--models",
907
+ default=None,
908
+ help="Comma-separated list of models to include (default: all)"
909
+ )
910
+ run_parser.add_argument(
911
+ "--sample_size",
912
+ type=int,
913
+ default=None,
914
+ help="Number of samples per model pair (default: all)"
915
+ )
916
+ run_parser.add_argument(
917
+ "--num_threads",
918
+ type=int,
919
+ default=8,
920
+ help="Number of parallel threads (default: 8)"
921
+ )
922
+ run_parser.add_argument(
923
+ "--num_processes",
924
+ type=int,
925
+ default=1,
926
+ help="Number of parallel processes (default: 1). When >1, work is sharded by parquet file."
927
+ )
928
+ run_parser.add_argument(
929
+ "--parallel_swap_calls",
930
+ action="store_true",
931
+ help="Run original+swapped VLM calls in parallel within a battle (may increase 429 risk)."
932
+ )
933
+ run_parser.add_argument(
934
+ "--enable_progress_bar",
935
+ action="store_true",
936
+ help="Show a progress bar by VLM API call count (total may be unknown and grow dynamically). "
937
+ "When enabled, noisy httpx/httpcore request logs are silenced."
938
+ )
939
+ run_parser.add_argument(
940
+ "--judge_model",
941
+ default="Qwen/Qwen3-VL-32B-Instruct-FP8",
942
+ help="VLM judge model name (default: Qwen/Qwen3-VL-32B-Instruct-FP8)"
943
+ )
944
+ run_parser.add_argument(
945
+ "--temperature",
946
+ type=float,
947
+ default=0.0,
948
+ help="VLM temperature (default: 0 for greedy mode)"
949
+ )
950
+ run_parser.add_argument(
951
+ "--prompt",
952
+ default="mmrb2",
953
+ help="Prompt module name (default: mmrb2)"
954
+ )
955
+ run_parser.add_argument(
956
+ "--exp_name",
957
+ default=None,
958
+ help="Experiment name (must end with `_yyyymmdd`). "
959
+ "If omitted, uses the latest `models/<exp_name>` by date suffix."
960
+ )
961
+ run_parser.add_argument(
962
+ "--timeout",
963
+ type=int,
964
+ default=120,
965
+ help="API timeout in seconds (default: 120)"
966
+ )
967
+ run_parser.add_argument(
968
+ "--max_retries",
969
+ type=int,
970
+ default=3,
971
+ help="Maximum retry attempts (default: 3)"
972
+ )
973
+ run_parser.add_argument(
974
+ "--verbose",
975
+ action="store_true",
976
+ help="Enable verbose logging"
977
+ )
978
+ run_parser.add_argument(
979
+ "--no-audit-log",
980
+ action="store_true",
981
+ help="Disable audit logging (raw VLM outputs)"
982
+ )
983
+ run_parser.add_argument(
984
+ "--base_urls",
985
+ default=None,
986
+ help="Comma-separated VLM API base URLs for multi-endpoint support (default: from OPENAI_BASE_URL(S) env)"
987
+ )
988
+ run_parser.add_argument(
989
+ "--api_keys",
990
+ default=None,
991
+ help="Comma-separated API keys for multi-endpoint support (default: from OPENAI_API_KEY env)"
992
+ )
993
+ run_parser.add_argument(
994
+ "--no-clean-orphaned-logs",
995
+ action="store_true",
996
+ help="Disable auto-deletion of battle logs for removed models (default: enabled)"
997
+ )
998
+
999
+ # Sampling configuration
1000
+ run_parser.add_argument(
1001
+ "--sampling_mode",
1002
+ choices=["adaptive", "full"],
1003
+ default="adaptive",
1004
+ help="Sampling mode: 'adaptive' (CI-based, default) or 'full' (all samples)"
1005
+ )
1006
+ run_parser.add_argument(
1007
+ "--target_ci_width",
1008
+ type=float,
1009
+ default=15.0,
1010
+ help="Target 95%% CI width for adaptive mode (default: 15.0, i.e., ±7.5 Elo)"
1011
+ )
1012
+ run_parser.add_argument(
1013
+ "--min_samples",
1014
+ type=int,
1015
+ default=100,
1016
+ help="Minimum samples per pair before checking CI in adaptive mode (default: 100)"
1017
+ )
1018
+ run_parser.add_argument(
1019
+ "--max_samples",
1020
+ type=int,
1021
+ default=1500,
1022
+ help="Maximum samples per pair in adaptive mode (default: 1500)"
1023
+ )
1024
+ run_parser.add_argument(
1025
+ "--milestone_min_samples",
1026
+ type=int,
1027
+ default=None,
1028
+ help="Minimum samples per pair for milestone experiments (default: 1000)"
1029
+ )
1030
+
1031
+ run_parser.set_defaults(func=cmd_run)
1032
+
1033
+ # === init command ===
1034
+ init_parser = subparsers.add_parser(
1035
+ "init",
1036
+ help="Initialize arena and download data from official repositories"
1037
+ )
1038
+
1039
+ init_parser.add_argument(
1040
+ "--arena_dir",
1041
+ default="./arena",
1042
+ help="Arena directory path (default: ./arena)"
1043
+ )
1044
+ init_parser.add_argument(
1045
+ "--data_dir",
1046
+ default="./data",
1047
+ help="Benchmark data directory path (default: ./data)"
1048
+ )
1049
+ init_parser.add_argument(
1050
+ "--subsets",
1051
+ default=None,
1052
+ help="Comma-separated list of subsets to download (default: all available)"
1053
+ )
1054
+ init_parser.add_argument(
1055
+ "--git",
1056
+ action="store_true",
1057
+ help="Initialize Git repository in arena_dir after downloading"
1058
+ )
1059
+ init_parser.add_argument(
1060
+ "--data-only",
1061
+ action="store_true",
1062
+ dest="data_only",
1063
+ help="Only download benchmark Parquet data (skip arena data)"
1064
+ )
1065
+ init_parser.add_argument(
1066
+ "--arena-only",
1067
+ action="store_true",
1068
+ dest="arena_only",
1069
+ help="Only download arena data (skip benchmark data)"
1070
+ )
1071
+ init_parser.add_argument(
1072
+ "--benchmark-repo",
1073
+ default="rhli/genarena",
1074
+ dest="benchmark_repo",
1075
+ help="HuggingFace repository for benchmark data (default: rhli/genarena)"
1076
+ )
1077
+ init_parser.add_argument(
1078
+ "--arena-repo",
1079
+ default="rhli/genarena-battlefield",
1080
+ dest="arena_repo",
1081
+ help="HuggingFace repository for arena data (default: rhli/genarena-battlefield)"
1082
+ )
1083
+ init_parser.add_argument(
1084
+ "--revision",
1085
+ default="main",
1086
+ help="HuggingFace revision/branch (default: main)"
1087
+ )
1088
+ init_parser.add_argument(
1089
+ "--overwrite",
1090
+ action="store_true",
1091
+ help="Overwrite existing files"
1092
+ )
1093
+ init_parser.add_argument(
1094
+ "--verbose",
1095
+ action="store_true",
1096
+ help="Enable verbose output"
1097
+ )
1098
+
1099
+ init_parser.set_defaults(func=cmd_init)
1100
+
1101
+ # === status command ===
1102
+ status_parser = subparsers.add_parser(
1103
+ "status",
1104
+ help="Show arena status summary"
1105
+ )
1106
+
1107
+ status_parser.add_argument(
1108
+ "--arena_dir",
1109
+ required=True,
1110
+ help="Arena directory path"
1111
+ )
1112
+ status_parser.add_argument(
1113
+ "--data_dir",
1114
+ required=True,
1115
+ help="Parquet dataset directory path"
1116
+ )
1117
+ status_parser.add_argument(
1118
+ "--verbose",
1119
+ action="store_true",
1120
+ help="Enable verbose output"
1121
+ )
1122
+
1123
+ status_parser.set_defaults(func=cmd_status)
1124
+
1125
+ # === leaderboard command ===
1126
+ lb_parser = subparsers.add_parser(
1127
+ "leaderboard",
1128
+ help="Display leaderboard for a subset"
1129
+ )
1130
+
1131
+ lb_parser.add_argument(
1132
+ "--arena_dir",
1133
+ required=True,
1134
+ help="Arena directory path"
1135
+ )
1136
+ lb_parser.add_argument(
1137
+ "--subset",
1138
+ required=True,
1139
+ help="Subset name to display leaderboard for"
1140
+ )
1141
+ lb_parser.add_argument(
1142
+ "--verbose",
1143
+ action="store_true",
1144
+ help="Enable verbose output"
1145
+ )
1146
+
1147
+ lb_parser.set_defaults(func=cmd_leaderboard)
1148
+
1149
+ # === serve command ===
1150
+ serve_parser = subparsers.add_parser(
1151
+ "serve",
1152
+ help="Start the battle visualization web server"
1153
+ )
1154
+
1155
+ serve_parser.add_argument(
1156
+ "--arena_dir",
1157
+ required=True,
1158
+ help="Arena directory path"
1159
+ )
1160
+ serve_parser.add_argument(
1161
+ "--data_dir",
1162
+ required=True,
1163
+ help="Parquet dataset directory path"
1164
+ )
1165
+ serve_parser.add_argument(
1166
+ "--host",
1167
+ default="0.0.0.0",
1168
+ help="Host to bind the server to (default: 0.0.0.0)"
1169
+ )
1170
+ serve_parser.add_argument(
1171
+ "--port",
1172
+ type=int,
1173
+ default=8080,
1174
+ help="Port to listen on (default: 8080)"
1175
+ )
1176
+ serve_parser.add_argument(
1177
+ "--debug",
1178
+ action="store_true",
1179
+ help="Enable Flask debug mode"
1180
+ )
1181
+
1182
+ serve_parser.set_defaults(func=cmd_serve)
1183
+
1184
+ # === git command group ===
1185
+ git_parser = subparsers.add_parser(
1186
+ "git",
1187
+ help="Git version control commands"
1188
+ )
1189
+ git_subparsers = git_parser.add_subparsers(dest="git_command", help="Git subcommands")
1190
+
1191
+ # git init
1192
+ git_init_parser = git_subparsers.add_parser(
1193
+ "init",
1194
+ help="Initialize Git repository for arena directory"
1195
+ )
1196
+ git_init_parser.add_argument(
1197
+ "--arena_dir",
1198
+ required=True,
1199
+ help="Arena directory path"
1200
+ )
1201
+ git_init_parser.set_defaults(func=cmd_git_init)
1202
+
1203
+ # git commit
1204
+ git_commit_parser = git_subparsers.add_parser(
1205
+ "commit",
1206
+ help="Commit changes to local Git repository"
1207
+ )
1208
+ git_commit_parser.add_argument(
1209
+ "--arena_dir",
1210
+ required=True,
1211
+ help="Arena directory path"
1212
+ )
1213
+ git_commit_parser.add_argument(
1214
+ "--message", "-m",
1215
+ default=None,
1216
+ help="Custom commit message (default: auto-generated)"
1217
+ )
1218
+ git_commit_parser.set_defaults(func=cmd_git_commit)
1219
+
1220
+ # git remote
1221
+ git_remote_parser = git_subparsers.add_parser(
1222
+ "remote",
1223
+ help="Configure remote repository"
1224
+ )
1225
+ git_remote_parser.add_argument(
1226
+ "--arena_dir",
1227
+ required=True,
1228
+ help="Arena directory path"
1229
+ )
1230
+ git_remote_parser.add_argument(
1231
+ "--url",
1232
+ default=None,
1233
+ help="Remote repository URL"
1234
+ )
1235
+ git_remote_parser.add_argument(
1236
+ "--force", "-f",
1237
+ action="store_true",
1238
+ help="Force overwrite existing remote URL"
1239
+ )
1240
+ git_remote_parser.set_defaults(func=cmd_git_remote)
1241
+
1242
+ # git push
1243
+ git_push_parser = git_subparsers.add_parser(
1244
+ "push",
1245
+ help="Push commits to remote repository"
1246
+ )
1247
+ git_push_parser.add_argument(
1248
+ "--arena_dir",
1249
+ required=True,
1250
+ help="Arena directory path"
1251
+ )
1252
+ git_push_parser.set_defaults(func=cmd_git_push)
1253
+
1254
+ # git sync
1255
+ git_sync_parser = git_subparsers.add_parser(
1256
+ "sync",
1257
+ help="Commit all changes and push to remote (one-click sync)"
1258
+ )
1259
+ git_sync_parser.add_argument(
1260
+ "--arena_dir",
1261
+ required=True,
1262
+ help="Arena directory path"
1263
+ )
1264
+ git_sync_parser.set_defaults(func=cmd_git_sync)
1265
+
1266
+ # === hf command group ===
1267
+ hf_parser = subparsers.add_parser(
1268
+ "hf",
1269
+ help="Huggingface Dataset repository commands"
1270
+ )
1271
+ hf_subparsers = hf_parser.add_subparsers(dest="hf_command", help="Huggingface subcommands")
1272
+
1273
+ # hf upload
1274
+ hf_upload_parser = hf_subparsers.add_parser(
1275
+ "upload",
1276
+ help="Upload arena data to Huggingface Dataset repository"
1277
+ )
1278
+ hf_upload_parser.add_argument(
1279
+ "--arena_dir",
1280
+ required=True,
1281
+ help="Arena directory path"
1282
+ )
1283
+ hf_upload_parser.add_argument(
1284
+ "--repo_id",
1285
+ required=True,
1286
+ help="Huggingface repository ID (e.g., 'username/repo-name')"
1287
+ )
1288
+ hf_upload_parser.add_argument(
1289
+ "--subsets",
1290
+ default=None,
1291
+ help="Comma-separated list of subsets to upload (default: all)"
1292
+ )
1293
+ hf_upload_parser.add_argument(
1294
+ "--models",
1295
+ default=None,
1296
+ help="Comma-separated list of models to upload (default: all)"
1297
+ )
1298
+ hf_upload_parser.add_argument(
1299
+ "--experiments",
1300
+ default=None,
1301
+ help="Comma-separated list of experiments (exp_name) to upload (default: all). "
1302
+ "In v2 layout, model outputs are uploaded as one ZIP per exp_name."
1303
+ )
1304
+ hf_upload_parser.add_argument(
1305
+ "--overwrite",
1306
+ action="store_true",
1307
+ help="Overwrite existing files in the repository"
1308
+ )
1309
+ hf_upload_parser.add_argument(
1310
+ "--verbose",
1311
+ action="store_true",
1312
+ help="Enable verbose output"
1313
+ )
1314
+ hf_upload_parser.add_argument(
1315
+ "--max-retries",
1316
+ type=int,
1317
+ default=3,
1318
+ help="Maximum retry attempts per file on connection failure (default: 3)"
1319
+ )
1320
+ hf_upload_parser.set_defaults(func=cmd_hf_upload)
1321
+
1322
+ # hf pull
1323
+ hf_pull_parser = hf_subparsers.add_parser(
1324
+ "pull",
1325
+ help="Pull arena data from Huggingface Dataset repository"
1326
+ )
1327
+ hf_pull_parser.add_argument(
1328
+ "--arena_dir",
1329
+ required=True,
1330
+ help="Arena directory path to save data to"
1331
+ )
1332
+ hf_pull_parser.add_argument(
1333
+ "--repo_id",
1334
+ required=True,
1335
+ help="Huggingface repository ID (e.g., 'username/repo-name')"
1336
+ )
1337
+ hf_pull_parser.add_argument(
1338
+ "--subsets",
1339
+ default=None,
1340
+ help="Comma-separated list of subsets to download (default: all)"
1341
+ )
1342
+ hf_pull_parser.add_argument(
1343
+ "--models",
1344
+ default=None,
1345
+ help="Comma-separated list of models to download (default: all)"
1346
+ )
1347
+ hf_pull_parser.add_argument(
1348
+ "--experiments",
1349
+ default=None,
1350
+ help="Comma-separated list of experiments (exp_name) to download (default: all). "
1351
+ "In v2 layout, model outputs are downloaded as one ZIP per exp_name."
1352
+ )
1353
+ hf_pull_parser.add_argument(
1354
+ "--revision",
1355
+ default="main",
1356
+ help="Repository revision/branch to download from (default: main)"
1357
+ )
1358
+ hf_pull_parser.add_argument(
1359
+ "--overwrite",
1360
+ action="store_true",
1361
+ help="Overwrite existing local files"
1362
+ )
1363
+ hf_pull_parser.add_argument(
1364
+ "--verbose",
1365
+ action="store_true",
1366
+ help="Enable verbose output"
1367
+ )
1368
+ hf_pull_parser.set_defaults(func=cmd_hf_pull)
1369
+
1370
+ # hf list
1371
+ hf_list_parser = hf_subparsers.add_parser(
1372
+ "list",
1373
+ help="List contents of a Huggingface Dataset repository"
1374
+ )
1375
+ hf_list_parser.add_argument(
1376
+ "--repo_id",
1377
+ required=True,
1378
+ help="Huggingface repository ID (e.g., 'username/repo-name')"
1379
+ )
1380
+ hf_list_parser.add_argument(
1381
+ "--revision",
1382
+ default="main",
1383
+ help="Repository revision/branch (default: main)"
1384
+ )
1385
+ hf_list_parser.set_defaults(func=cmd_hf_list)
1386
+
1387
+ # === submit command ===
1388
+ submit_parser = subparsers.add_parser(
1389
+ "submit",
1390
+ help="Submit evaluation results to official leaderboard via GitHub PR"
1391
+ )
1392
+
1393
+ submit_parser.add_argument(
1394
+ "--arena_dir",
1395
+ required=True,
1396
+ help="Arena directory path"
1397
+ )
1398
+ submit_parser.add_argument(
1399
+ "--subset",
1400
+ required=True,
1401
+ help="Subset name (e.g., 'basic')"
1402
+ )
1403
+ submit_parser.add_argument(
1404
+ "--exp_name",
1405
+ required=True,
1406
+ help="Experiment name (must end with _yyyymmdd)"
1407
+ )
1408
+ submit_parser.add_argument(
1409
+ "--hf_repo",
1410
+ required=True,
1411
+ help="Your HuggingFace Dataset repository ID (e.g., 'username/my-genarena-results')"
1412
+ )
1413
+ submit_parser.add_argument(
1414
+ "--official_repo",
1415
+ default=None,
1416
+ help="Official submissions repository (default: genarena/submissions)"
1417
+ )
1418
+ submit_parser.add_argument(
1419
+ "--title",
1420
+ default=None,
1421
+ help="PR title (default: auto-generated)"
1422
+ )
1423
+ submit_parser.add_argument(
1424
+ "--description",
1425
+ default=None,
1426
+ help="PR description"
1427
+ )
1428
+ submit_parser.add_argument(
1429
+ "--yes", "-y",
1430
+ action="store_true",
1431
+ help="Skip confirmation prompt"
1432
+ )
1433
+ submit_parser.add_argument(
1434
+ "--dry-run",
1435
+ action="store_true",
1436
+ help="Validate only, do not upload or create PR"
1437
+ )
1438
+ submit_parser.add_argument(
1439
+ "--skip-official-check",
1440
+ action="store_true",
1441
+ help="Skip checking against official models (for testing)"
1442
+ )
1443
+ submit_parser.add_argument(
1444
+ "--verbose",
1445
+ action="store_true",
1446
+ help="Enable verbose output"
1447
+ )
1448
+
1449
+ submit_parser.set_defaults(func=cmd_submit)
1450
+
1451
+ # === export-models command ===
1452
+ export_models_parser = subparsers.add_parser(
1453
+ "export-models",
1454
+ help="Export official_models.json from arena state (for maintainers)"
1455
+ )
1456
+
1457
+ export_models_parser.add_argument(
1458
+ "--arena_dir",
1459
+ required=True,
1460
+ help="Arena directory path"
1461
+ )
1462
+ export_models_parser.add_argument(
1463
+ "--output", "-o",
1464
+ default=None,
1465
+ help="Output file path (default: print to stdout)"
1466
+ )
1467
+
1468
+ export_models_parser.set_defaults(func=cmd_export_models)
1469
+
1470
+ # === deploy command group ===
1471
+ deploy_parser = subparsers.add_parser(
1472
+ "deploy",
1473
+ help="Deployment commands for HuggingFace Spaces"
1474
+ )
1475
+ deploy_subparsers = deploy_parser.add_subparsers(dest="deploy_command", help="Deploy subcommands")
1476
+
1477
+ # deploy upload
1478
+ deploy_upload_parser = deploy_subparsers.add_parser(
1479
+ "upload",
1480
+ help="Upload all data for HuggingFace Spaces deployment (incremental)"
1481
+ )
1482
+ deploy_upload_parser.add_argument(
1483
+ "--arena_dir",
1484
+ required=True,
1485
+ help="Arena directory path"
1486
+ )
1487
+ deploy_upload_parser.add_argument(
1488
+ "--arena_repo",
1489
+ required=True,
1490
+ help="HuggingFace Dataset repo for arena data (e.g., 'your-org/leaderboard-data')"
1491
+ )
1492
+ deploy_upload_parser.add_argument(
1493
+ "--space_repo",
1494
+ required=True,
1495
+ help="HuggingFace Space repo for deployment (e.g., 'your-org/leaderboard')"
1496
+ )
1497
+ deploy_upload_parser.add_argument(
1498
+ "--subsets",
1499
+ default=None,
1500
+ help="Comma-separated list of subsets to upload (default: all)"
1501
+ )
1502
+ deploy_upload_parser.add_argument(
1503
+ "--overwrite",
1504
+ action="store_true",
1505
+ help="Overwrite existing files (default: skip existing for incremental upload)"
1506
+ )
1507
+ deploy_upload_parser.add_argument(
1508
+ "--max-retries",
1509
+ type=int,
1510
+ default=3,
1511
+ help="Maximum retry attempts per file on connection failure (default: 3)"
1512
+ )
1513
+ deploy_upload_parser.add_argument(
1514
+ "--verbose",
1515
+ action="store_true",
1516
+ help="Enable verbose output"
1517
+ )
1518
+ deploy_upload_parser.add_argument(
1519
+ "--num_workers",
1520
+ type=int,
1521
+ default=16,
1522
+ help="Number of parallel workers for upload (default: 16)"
1523
+ )
1524
+ deploy_upload_parser.add_argument(
1525
+ "--timeout",
1526
+ type=int,
1527
+ default=300,
1528
+ help="Timeout in seconds for each worker (default: 300)"
1529
+ )
1530
+ deploy_upload_parser.set_defaults(func=cmd_deploy_upload)
1531
+
1532
+ # deploy info
1533
+ deploy_info_parser = deploy_subparsers.add_parser(
1534
+ "info",
1535
+ help="Show deployment configuration and instructions"
1536
+ )
1537
+ deploy_info_parser.set_defaults(func=cmd_deploy_info)
1538
+
1539
+ return parser
1540
+
1541
+
1542
def main() -> int:
    """
    Parse the command line and dispatch to the selected command handler.

    Returns:
        Exit code: 0 when only help text is shown (no command, or a command
        group invoked without a subcommand); otherwise the value returned by
        the handler stored in ``args.func``.
    """
    cli = create_parser()
    parsed = cli.parse_args()
    command = parsed.command

    # Nothing selected at the top level: show the general help.
    if command is None:
        cli.print_help()
        return 0

    # Command groups keep their chosen subcommand under a dedicated attribute.
    group_attrs = {
        "git": "git_command",
        "hf": "hf_command",
        "deploy": "deploy_command",
    }
    sub_attr = group_attrs.get(command)
    if sub_attr is not None and getattr(parsed, sub_attr, None) is None:
        # Group given without a subcommand: re-parse with --help so the
        # group's own help is printed. argparse's help action normally exits
        # the process itself; the return below is the fallback exit code.
        cli.parse_args([command, "--help"])
        return 0

    return parsed.func(parsed)
1578
+
1579
+
1580
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())