genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,431 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """
6
+ Initialization operations for GenArena.
7
+
8
+ This module provides functionality for one-click initialization of arena
9
+ directories, including downloading benchmark data and official arena data
10
+ from HuggingFace repositories.
11
+ """
12
+
13
+ import logging
14
+ import os
15
+ from typing import Optional
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Default HuggingFace repository IDs: the first is the default source for
+ # benchmark data, the second for arena data, unless callers pass their own.
20
+ DEFAULT_BENCHMARK_REPO = "rhli/genarena"
21
+ DEFAULT_ARENA_REPO = "rhli/genarena-battlefield"
22
+
23
+
24
+ def _format_size(size_bytes: int) -> str:
25
+ """Format file size in human-readable format."""
26
+ if size_bytes < 1024:
27
+ return f"{size_bytes} B"
28
+ elif size_bytes < 1024 * 1024:
29
+ return f"{size_bytes / 1024:.1f} KB"
30
+ elif size_bytes < 1024 * 1024 * 1024:
31
+ return f"{size_bytes / 1024 / 1024:.1f} MB"
32
+ else:
33
+ return f"{size_bytes / 1024 / 1024 / 1024:.2f} GB"
34
+
35
+
36
def discover_repo_subsets(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> list[str]:
    """
    List the subset names available in a HuggingFace dataset repository.

    A subset is the top-level directory of any ``.parquet`` file in the
    repository. Parquet files at the repository root, hidden directories
    (leading ``.``) and the directories ``data`` and ``raw`` are ignored.

    Args:
        repo_id: HuggingFace repository ID
        token: HuggingFace token (optional for public repos)
        revision: Repository revision/branch

    Returns:
        Sorted list of subset names. Empty if the repository listing fails
        (the failure is logged as a warning) or no subset is found.
    """
    from huggingface_hub import HfApi

    client = HfApi(token=token)

    try:
        repo_files = client.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
    except Exception as e:
        logger.warning(f"Failed to list repo files: {e}")
        return []

    ignored_dirs = ("data", "raw")
    found: set[str] = set()

    for path in repo_files:
        if not path.endswith(".parquet"):
            continue

        # Expected layouts: <subset>/data-*.parquet or <subset>/*.parquet
        top_dir, slash, _rest = path.partition("/")
        if not slash:
            # Parquet file at the repository root: belongs to no subset.
            continue
        if top_dir.startswith(".") or top_dir in ignored_dirs:
            continue

        found.add(top_dir)

    return sorted(found)
82
+
83
+
84
def download_benchmark_data(
    data_dir: str,
    repo_id: str = DEFAULT_BENCHMARK_REPO,
    subsets: Optional[list[str]] = None,
    revision: str = "main",
    overwrite: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str, dict]:
    """
    Download benchmark Parquet data from HuggingFace.

    Expected repository structure:
        <subset>/data-00000-of-00001.parquet
        <subset>/data-00001-of-00001.parquet
        ...

    Downloads to:
        data_dir/<subset>/data-*.parquet

    Args:
        data_dir: Local directory to save data
        repo_id: HuggingFace repository ID
        subsets: List of subsets to download (None = all available)
        revision: Repository revision/branch
        overwrite: If True, existing local files are not skipped
        show_progress: If True, show a tqdm progress bar when tqdm is installed

    Returns:
        Tuple of (success, message, stats_dict). ``success`` is True when no
        file failed or at least one file was downloaded. ``stats_dict`` holds
        the keys ``downloaded_files``, ``skipped_files``, ``failed_files``,
        ``total_bytes`` and ``subsets`` (per-subset ``files``, ``bytes``,
        ``downloaded`` and ``skipped`` counters).
    """
    from huggingface_hub import HfApi, hf_hub_download

    from genarena.sync.hf_ops import get_hf_token

    token = get_hf_token()
    api = HfApi(token=token)

    stats = {
        "downloaded_files": 0,
        "skipped_files": 0,
        "failed_files": 0,
        "total_bytes": 0,
        "subsets": {},
    }

    # List the repository before discovering subsets: discover_repo_subsets()
    # swallows listing errors and returns [], which would otherwise surface a
    # network/auth failure as a misleading "No subsets found" message.
    try:
        all_files = list(api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        ))
    except Exception as e:
        return False, f"Failed to list repository files: {e}", stats

    # Discover available subsets if not specified
    if subsets is None:
        logger.info(f"Discovering subsets in {repo_id}...")
        subsets = discover_repo_subsets(repo_id, token, revision)
        if not subsets:
            return False, f"No subsets found in repository {repo_id}", stats
        logger.info(f"Found subsets: {', '.join(subsets)}")

    # Collect the files of each requested subset. The subset name travels with
    # every entry so per-subset stats are updated under the same key they were
    # registered with, even if the subset name itself contains a "/".
    files_to_download: list[tuple[str, str, str]] = []  # (subset, remote_path, local_path)

    for subset in subsets:
        prefix = f"{subset}/"
        subset_files = [
            f for f in all_files
            if f.startswith(prefix) and f.endswith(".parquet")
        ]

        if not subset_files:
            logger.warning(f"No parquet files found for subset '{subset}'")
            continue

        stats["subsets"][subset] = {
            "files": len(subset_files),
            "bytes": 0,
            "downloaded": 0,
            "skipped": 0,
        }

        # Local path mirrors the remote layout: data_dir/<subset>/filename.parquet
        files_to_download.extend(
            (subset, remote_path, os.path.join(data_dir, remote_path))
            for remote_path in subset_files
        )

    if not files_to_download:
        return False, "No parquet files found for the specified subsets", stats

    # Create data directory
    os.makedirs(data_dir, exist_ok=True)

    errors: list[str] = []

    # The progress bar is optional: fall back to plain iteration without tqdm.
    files_iter = files_to_download
    if show_progress:
        try:
            from tqdm import tqdm
        except ImportError:
            pass
        else:
            files_iter = tqdm(files_to_download, desc="Downloading", unit="file")

    for subset, remote_path, local_path in files_iter:
        subset_stats = stats["subsets"][subset]

        # Keep existing files unless the caller asked to overwrite them
        if os.path.exists(local_path) and not overwrite:
            logger.debug(f"Skipping existing file: {local_path}")
            stats["skipped_files"] += 1
            subset_stats["skipped"] += 1
            continue

        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        try:
            # NOTE(review): local_dir_use_symlinks appears to be deprecated in
            # newer huggingface_hub releases — confirm against the supported
            # version range before removing it.
            downloaded_path = hf_hub_download(
                repo_id=repo_id,
                filename=remote_path,
                repo_type="dataset",
                revision=revision,
                token=token,
                local_dir=data_dir,
                local_dir_use_symlinks=False,
            )
            file_size = os.path.getsize(downloaded_path)
        except Exception as e:
            # Best effort: record the failure and continue with the next file
            logger.error(f"Failed to download {remote_path}: {e}")
            errors.append(f"{remote_path}: {e}")
            stats["failed_files"] += 1
            continue

        stats["downloaded_files"] += 1
        stats["total_bytes"] += file_size
        subset_stats["downloaded"] += 1
        subset_stats["bytes"] += file_size

        logger.debug(f"Downloaded: {remote_path} ({_format_size(file_size)})")

    # Build summary message
    lines = [
        "Benchmark data download complete:",
        f" Downloaded: {stats['downloaded_files']} files ({_format_size(stats['total_bytes'])})",
        f" Skipped: {stats['skipped_files']} files (already exist)",
        f" Failed: {stats['failed_files']} files",
    ]

    if stats["subsets"]:
        lines.append(" Subsets:")
        for subset, info in stats["subsets"].items():
            lines.append(
                f" - {subset}: {info['downloaded']} downloaded, "
                f"{info['skipped']} skipped ({_format_size(info['bytes'])})"
            )

    if errors:
        lines.append(" Errors:")
        # Only the first few errors are listed to keep the summary readable
        for err in errors[:5]:
            lines.append(f" - {err}")
        if len(errors) > 5:
            lines.append(f" ... and {len(errors) - 5} more errors")

    # A partial download still counts as success; only "nothing downloaded and
    # at least one failure" is reported as failure.
    success = stats["failed_files"] == 0 or stats["downloaded_files"] > 0
    return success, "\n".join(lines), stats
257
+
258
+
259
def init_arena(
    arena_dir: str = "./arena",
    data_dir: str = "./data",
    subsets: Optional[list[str]] = None,
    benchmark_repo: str = DEFAULT_BENCHMARK_REPO,
    arena_repo: str = DEFAULT_ARENA_REPO,
    revision: str = "main",
    overwrite: bool = False,
    init_git: bool = False,
    data_only: bool = False,
    arena_only: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str]:
    """
    One-click arena initialization.

    This function:
    1. Downloads benchmark Parquet data from HuggingFace (unless arena_only)
    2. Downloads arena data (model outputs + logs) from HuggingFace (unless data_only)
    3. Initializes Git repository in arena_dir (if init_git and not data_only)

    Progress is printed to stdout as each step runs.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to benchmark data directory
        subsets: List of subsets to download (None = all available)
        benchmark_repo: HuggingFace repo for benchmark data
        arena_repo: HuggingFace repo for arena data
        revision: HuggingFace revision/branch
        overwrite: If True, overwrite existing files
        init_git: If True, initialize Git repository in arena_dir
        data_only: If True, only download benchmark data
        arena_only: If True, only download arena data
        show_progress: If True, show progress information

    Returns:
        Tuple of (success, summary_message). ``success`` is False when the
        benchmark or arena download reports failure; a failed Git
        initialization is only mentioned in the summary.
    """
    from genarena.sync.hf_ops import pull_arena_data
    from genarena.sync.git_ops import git_init, is_git_initialized

    lines: list[str] = []
    all_success = True
    benchmark_stats: dict = {}

    # Resolve absolute paths
    arena_dir = os.path.abspath(arena_dir)
    data_dir = os.path.abspath(data_dir)

    # Decide up front which steps run so every "[Step i/n]" banner uses the
    # same total. Git is only initialized when the arena step runs, so it must
    # not be counted in data-only mode.
    run_benchmark = not arena_only
    run_arena = not data_only
    run_git = init_git and not data_only
    total_steps = sum((run_benchmark, run_arena, run_git))
    step_num = 0

    # Step 1: Download benchmark data
    if run_benchmark:
        step_num += 1

        print(f"[Step {step_num}/{total_steps}] Downloading benchmark data from {benchmark_repo}...")
        print(f" Target directory: {data_dir}")
        if subsets:
            print(f" Subsets: {', '.join(subsets)}")
        print()

        success, msg, benchmark_stats = download_benchmark_data(
            data_dir=data_dir,
            repo_id=benchmark_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        # Indent continuation lines of the multi-line message (chr(10) is "\n")
        print(f" {msg.replace(chr(10), chr(10) + ' ')}")
        print()

        if not success:
            all_success = False
            lines.append("Benchmark data download failed")
        else:
            lines.append(
                f"Benchmark data: {benchmark_stats.get('downloaded_files', 0)} files "
                f"({_format_size(benchmark_stats.get('total_bytes', 0))})"
            )

    # Step 2: Download arena data
    if run_arena:
        step_num += 1

        print(f"[Step {step_num}/{total_steps}] Downloading arena data from {arena_repo}...")
        print(f" Target directory: {arena_dir}")
        if subsets:
            print(f" Subsets: {', '.join(subsets)}")
        print()

        # Create arena directory
        os.makedirs(arena_dir, exist_ok=True)

        success, msg = pull_arena_data(
            arena_dir=arena_dir,
            repo_id=arena_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        print(f" {msg.replace(chr(10), chr(10) + ' ')}")
        print()

        if not success:
            all_success = False
            lines.append(f"Arena data download failed: {msg}")
        else:
            lines.append(f"Arena data: downloaded to {arena_dir}")

    # Step 3: Initialize Git
    if run_git:
        step_num += 1
        print(f"[Step {step_num}/{total_steps}] Initializing Git repository...")

        if is_git_initialized(arena_dir):
            print(f" Git repository already initialized at {arena_dir}")
            lines.append("Git: already initialized")
        else:
            success, msg = git_init(arena_dir)
            print(f" {msg}")
            if success:
                lines.append("Git: initialized")
            else:
                # NOTE(review): a Git failure does not flip all_success; this
                # keeps the existing contract — confirm that it is intended.
                lines.append(f"Git: initialization failed - {msg}")
        print()

    # Build final summary
    summary_lines = [
        "=== Summary ===",
    ]

    if not arena_only:
        summary_lines.append(f"Data directory: {data_dir}")
    if not data_only:
        summary_lines.append(f"Arena directory: {arena_dir}")

    if subsets:
        summary_lines.append(f"Subsets: {', '.join(subsets)}")
    elif benchmark_stats.get("subsets"):
        # No explicit selection: report what the benchmark download found
        summary_lines.append(f"Subsets: {', '.join(benchmark_stats['subsets'].keys())}")

    for line in lines:
        summary_lines.append(f" {line}")

    # Add next steps
    summary_lines.append("")
    summary_lines.append("Next steps:")

    if not data_only:
        summary_lines.append(" # View current status")
        summary_lines.append(f" genarena status --arena_dir {arena_dir} --data_dir {data_dir}")
        summary_lines.append("")
        summary_lines.append(" # Run evaluation battles")
        example_subset = subsets[0] if subsets else "basic"
        summary_lines.append(
            f" genarena run --arena_dir {arena_dir} --data_dir {data_dir} --subset {example_subset}"
        )
        summary_lines.append("")
        summary_lines.append(" # View leaderboard")
        summary_lines.append(f" genarena leaderboard --arena_dir {arena_dir} --subset {example_subset}")
    else:
        summary_lines.append(" # Initialize arena directory")
        summary_lines.append(" genarena init --arena_dir <path> --arena-only")

    return all_success, "\n".join(summary_lines)