genarena 0.0.1-py3-none-any.whl → 0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +25 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2335 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.0.dist-info/METADATA +178 -0
- genarena-0.1.0.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
- genarena-0.1.0.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/sync/init_ops.py
@@ -0,0 +1,431 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""
+Initialization operations for GenArena.
+
+This module provides functionality for one-click initialization of arena
+directories, including downloading benchmark data and official arena data
+from HuggingFace repositories.
+"""
+
+import logging
+import os
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Default repository configurations
+DEFAULT_BENCHMARK_REPO = "rhli/genarena"
+DEFAULT_ARENA_REPO = "rhli/genarena-battlefield"
+
+
+def _format_size(size_bytes: int) -> str:
+    """Format file size in human-readable format."""
+    if size_bytes < 1024:
+        return f"{size_bytes} B"
+    elif size_bytes < 1024 * 1024:
+        return f"{size_bytes / 1024:.1f} KB"
+    elif size_bytes < 1024 * 1024 * 1024:
+        return f"{size_bytes / 1024 / 1024:.1f} MB"
+    else:
+        return f"{size_bytes / 1024 / 1024 / 1024:.2f} GB"
+
+
+def discover_repo_subsets(
+    repo_id: str,
+    token: Optional[str] = None,
+    revision: str = "main",
+) -> list[str]:
+    """
+    Discover available subsets in a HuggingFace repository.
+
+    Looks for directories containing parquet files or known subset patterns.
+
+    Args:
+        repo_id: HuggingFace repository ID
+        token: HuggingFace token (optional for public repos)
+        revision: Repository revision/branch
+
+    Returns:
+        List of subset names found in the repository
+    """
+    from huggingface_hub import HfApi
+
+    api = HfApi(token=token)
+
+    try:
+        files = api.list_repo_files(
+            repo_id=repo_id,
+            repo_type="dataset",
+            revision=revision,
+        )
+    except Exception as e:
+        logger.warning(f"Failed to list repo files: {e}")
+        return []
+
+    # Find directories that contain parquet files
+    subsets: set[str] = set()
+    for f in files:
+        # Look for patterns like: <subset>/data-*.parquet or <subset>/*.parquet
+        if f.endswith(".parquet"):
+            parts = f.split("/")
+            if len(parts) >= 2:
+                # First directory is the subset name
+                subset = parts[0]
+                # Skip hidden directories and common non-subset directories
+                if not subset.startswith(".") and subset not in ("data", "raw"):
+                    subsets.add(subset)
+
+    return sorted(subsets)
+
+
+def download_benchmark_data(
+    data_dir: str,
+    repo_id: str = DEFAULT_BENCHMARK_REPO,
+    subsets: Optional[list[str]] = None,
+    revision: str = "main",
+    overwrite: bool = False,
+    show_progress: bool = True,
+) -> tuple[bool, str, dict]:
+    """
+    Download benchmark Parquet data from HuggingFace.
+
+    Expected repository structure:
+        <subset>/data-00000-of-00001.parquet
+        <subset>/data-00001-of-00001.parquet
+        ...
+
+    Downloads to:
+        data_dir/<subset>/data-*.parquet
+
+    Args:
+        data_dir: Local directory to save data
+        repo_id: HuggingFace repository ID
+        subsets: List of subsets to download (None = all available)
+        revision: Repository revision/branch
+        overwrite: If True, overwrite existing files
+        show_progress: If True, show progress information
+
+    Returns:
+        Tuple of (success, message, stats_dict)
+    """
+    from huggingface_hub import HfApi, hf_hub_download
+
+    from genarena.sync.hf_ops import get_hf_token
+
+    token = get_hf_token()
+    api = HfApi(token=token)
+
+    stats = {
+        "downloaded_files": 0,
+        "skipped_files": 0,
+        "failed_files": 0,
+        "total_bytes": 0,
+        "subsets": {},
+    }
+
+    # Discover available subsets if not specified
+    if subsets is None:
+        logger.info(f"Discovering subsets in {repo_id}...")
+        subsets = discover_repo_subsets(repo_id, token, revision)
+        if not subsets:
+            return False, f"No subsets found in repository {repo_id}", stats
+        logger.info(f"Found subsets: {', '.join(subsets)}")
+
+    # List all files in the repo
+    try:
+        all_files = list(api.list_repo_files(
+            repo_id=repo_id,
+            repo_type="dataset",
+            revision=revision,
+        ))
+    except Exception as e:
+        return False, f"Failed to list repository files: {e}", stats
+
+    # Filter files for requested subsets
+    files_to_download: list[tuple[str, str]] = []  # (remote_path, local_path)
+
+    for subset in subsets:
+        subset_files = [
+            f for f in all_files
+            if f.startswith(f"{subset}/") and f.endswith(".parquet")
+        ]
+
+        if not subset_files:
+            logger.warning(f"No parquet files found for subset '{subset}'")
+            continue
+
+        stats["subsets"][subset] = {
+            "files": len(subset_files),
+            "bytes": 0,
+            "downloaded": 0,
+            "skipped": 0,
+        }
+
+        for remote_path in subset_files:
+            # Construct local path: data_dir/<subset>/filename.parquet
+            local_path = os.path.join(data_dir, remote_path)
+            files_to_download.append((remote_path, local_path))
+
+    if not files_to_download:
+        return False, "No parquet files found for the specified subsets", stats
+
+    # Create data directory
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Download files
+    errors: list[str] = []
+
+    if show_progress:
+        try:
+            from tqdm import tqdm
+            files_iter = tqdm(files_to_download, desc="Downloading", unit="file")
+        except ImportError:
+            files_iter = files_to_download
+    else:
+        files_iter = files_to_download
+
+    for remote_path, local_path in files_iter:
+        subset = remote_path.split("/")[0]
+
+        # Check if file exists
+        if os.path.exists(local_path) and not overwrite:
+            logger.debug(f"Skipping existing file: {local_path}")
+            stats["skipped_files"] += 1
+            if subset in stats["subsets"]:
+                stats["subsets"][subset]["skipped"] += 1
+            continue
+
+        # Create directory
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+        try:
+            # Download file
+            downloaded_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=remote_path,
+                repo_type="dataset",
+                revision=revision,
+                token=token,
+                local_dir=data_dir,
+                local_dir_use_symlinks=False,
+            )
+
+            # Get file size
+            file_size = os.path.getsize(downloaded_path)
+            stats["downloaded_files"] += 1
+            stats["total_bytes"] += file_size
+
+            if subset in stats["subsets"]:
+                stats["subsets"][subset]["downloaded"] += 1
+                stats["subsets"][subset]["bytes"] += file_size
+
+            logger.debug(f"Downloaded: {remote_path} ({_format_size(file_size)})")
+
+        except Exception as e:
+            logger.error(f"Failed to download {remote_path}: {e}")
+            errors.append(f"{remote_path}: {e}")
+            stats["failed_files"] += 1
+
+    # Build summary message
+    lines = [
+        f"Benchmark data download complete:",
+        f"  Downloaded: {stats['downloaded_files']} files ({_format_size(stats['total_bytes'])})",
+        f"  Skipped: {stats['skipped_files']} files (already exist)",
+        f"  Failed: {stats['failed_files']} files",
+    ]
+
+    if stats["subsets"]:
+        lines.append("  Subsets:")
+        for subset, info in stats["subsets"].items():
+            lines.append(
+                f"    - {subset}: {info['downloaded']} downloaded, "
+                f"{info['skipped']} skipped ({_format_size(info['bytes'])})"
+            )
+
+    if errors:
+        lines.append("  Errors:")
+        for err in errors[:5]:
+            lines.append(f"    - {err}")
+        if len(errors) > 5:
+            lines.append(f"    ... and {len(errors) - 5} more errors")
+
+    success = stats["failed_files"] == 0 or stats["downloaded_files"] > 0
+    return success, "\n".join(lines), stats
+
+
+def init_arena(
+    arena_dir: str = "./arena",
+    data_dir: str = "./data",
+    subsets: Optional[list[str]] = None,
+    benchmark_repo: str = DEFAULT_BENCHMARK_REPO,
+    arena_repo: str = DEFAULT_ARENA_REPO,
+    revision: str = "main",
+    overwrite: bool = False,
+    init_git: bool = False,
+    data_only: bool = False,
+    arena_only: bool = False,
+    show_progress: bool = True,
+) -> tuple[bool, str]:
+    """
+    One-click arena initialization.
+
+    This function:
+    1. Downloads benchmark Parquet data from HuggingFace (unless arena_only)
+    2. Downloads arena data (model outputs + logs) from HuggingFace (unless data_only)
+    3. Initializes Git repository in arena_dir (if init_git)
+
+    Args:
+        arena_dir: Path to arena directory
+        data_dir: Path to benchmark data directory
+        subsets: List of subsets to download (None = all available)
+        benchmark_repo: HuggingFace repo for benchmark data
+        arena_repo: HuggingFace repo for arena data
+        revision: HuggingFace revision/branch
+        overwrite: If True, overwrite existing files
+        init_git: If True, initialize Git repository in arena_dir
+        data_only: If True, only download benchmark data
+        arena_only: If True, only download arena data
+        show_progress: If True, show progress information
+
+    Returns:
+        Tuple of (success, summary_message)
+    """
+    from genarena.sync.hf_ops import pull_arena_data, get_hf_token
+    from genarena.sync.git_ops import git_init, is_git_initialized
+
+    lines: list[str] = []
+    all_success = True
+    benchmark_stats: dict = {}
+    arena_stats: dict = {}
+
+    # Resolve absolute paths
+    arena_dir = os.path.abspath(arena_dir)
+    data_dir = os.path.abspath(data_dir)
+
+    # Step 1: Download benchmark data
+    if not arena_only:
+        step_num = 1
+        total_steps = 2 if not data_only else 1
+        if init_git:
+            total_steps += 1
+
+        print(f"[Step {step_num}/{total_steps}] Downloading benchmark data from {benchmark_repo}...")
+        print(f"  Target directory: {data_dir}")
+        if subsets:
+            print(f"  Subsets: {', '.join(subsets)}")
+        print()
+
+        success, msg, benchmark_stats = download_benchmark_data(
+            data_dir=data_dir,
+            repo_id=benchmark_repo,
+            subsets=subsets,
+            revision=revision,
+            overwrite=overwrite,
+            show_progress=show_progress,
+        )
+
+        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
+        print()
+
+        if not success:
+            all_success = False
+            lines.append(f"Benchmark data download failed")
+        else:
+            lines.append(
+                f"Benchmark data: {benchmark_stats.get('downloaded_files', 0)} files "
+                f"({_format_size(benchmark_stats.get('total_bytes', 0))})"
+            )
+
+    # Step 2: Download arena data
+    if not data_only:
+        step_num = 1 if arena_only else 2
+        total_steps = 1 if arena_only else 2
+        if init_git:
+            total_steps += 1
+
+        print(f"[Step {step_num}/{total_steps}] Downloading arena data from {arena_repo}...")
+        print(f"  Target directory: {arena_dir}")
+        if subsets:
+            print(f"  Subsets: {', '.join(subsets)}")
+        print()
+
+        # Create arena directory
+        os.makedirs(arena_dir, exist_ok=True)
+
+        success, msg = pull_arena_data(
+            arena_dir=arena_dir,
+            repo_id=arena_repo,
+            subsets=subsets,
+            revision=revision,
+            overwrite=overwrite,
+            show_progress=show_progress,
+        )
+
+        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
+        print()
+
+        if not success:
+            all_success = False
+            lines.append(f"Arena data download failed: {msg}")
+        else:
+            lines.append(f"Arena data: downloaded to {arena_dir}")
+
+    # Step 3: Initialize Git
+    if init_git and not data_only:
+        step_num = total_steps
+        print(f"[Step {step_num}/{total_steps}] Initializing Git repository...")
+
+        if is_git_initialized(arena_dir):
+            print(f"  Git repository already initialized at {arena_dir}")
+            lines.append("Git: already initialized")
+        else:
+            success, msg = git_init(arena_dir)
+            print(f"  {msg}")
+            if success:
+                lines.append("Git: initialized")
+            else:
+                lines.append(f"Git: initialization failed - {msg}")
+        print()
+
+    # Build final summary
+    summary_lines = [
+        "=== Summary ===",
+    ]
+
+    if not arena_only:
+        summary_lines.append(f"Data directory: {data_dir}")
+    if not data_only:
+        summary_lines.append(f"Arena directory: {arena_dir}")
+
+    if subsets:
+        summary_lines.append(f"Subsets: {', '.join(subsets)}")
+    elif benchmark_stats.get("subsets"):
+        summary_lines.append(f"Subsets: {', '.join(benchmark_stats['subsets'].keys())}")
+
+    for line in lines:
+        summary_lines.append(f"  {line}")
+
+    # Add next steps
+    summary_lines.append("")
+    summary_lines.append("Next steps:")
+
+    if not data_only:
+        summary_lines.append(f"  # View current status")
+        summary_lines.append(f"  genarena status --arena_dir {arena_dir} --data_dir {data_dir}")
+        summary_lines.append("")
+        summary_lines.append(f"  # Run evaluation battles")
+        example_subset = subsets[0] if subsets else "basic"
+        summary_lines.append(
+            f"  genarena run --arena_dir {arena_dir} --data_dir {data_dir} --subset {example_subset}"
+        )
+        summary_lines.append("")
+        summary_lines.append(f"  # View leaderboard")
+        summary_lines.append(f"  genarena leaderboard --arena_dir {arena_dir} --subset {example_subset}")
+    else:
+        summary_lines.append(f"  # Initialize arena directory")
+        summary_lines.append(f"  genarena init --arena_dir <path> --arena-only")
+
+    return all_success, "\n".join(summary_lines)
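
For context, a minimal usage sketch of how the new init_ops module might be driven from Python. This is not taken from the package itself; it only assumes the public functions shown in the diff above, and the local paths and the "basic" subset name are illustrative:

# Hedged usage sketch of genarena.sync.init_ops; paths and subset are illustrative.
from genarena.sync.init_ops import download_benchmark_data, init_arena

# One-click setup: pull benchmark parquet data plus official arena data.
# Passing subsets=None would instead download every subset discovered in the repos.
ok, summary = init_arena(
    arena_dir="./arena",
    data_dir="./data",
    subsets=["basic"],
    overwrite=False,   # keep files that already exist locally
    init_git=False,    # set True to also initialize a Git repo in arena_dir
)
print(summary)

# Benchmark data only, mirroring the data_only path of init_arena.
ok, msg, stats = download_benchmark_data(data_dir="./data", subsets=["basic"])
print(msg)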