genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +22 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2430 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.1.dist-info/METADATA +178 -0
- genarena-0.1.1.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
- genarena-0.1.1.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/vlm.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
# Copyright 2026 Ruihang Li.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0.
|
|
3
|
+
# See LICENSE file in the project root for details.
|
|
4
|
+
|
|
5
|
+
"""VLM API calling module with retry and multi-endpoint support."""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Optional, Union
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
|
+
|
|
15
|
+
from openai import OpenAI
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
def _silence_http_client_logs() -> None:
    """Silence noisy HTTP client logs (e.g., httpx INFO request lines)."""
    try:
        # Both httpx and httpcore emit an INFO line per request; raise their
        # thresholds so only warnings and above reach the handlers.
        for noisy_logger in ("httpx", "httpcore"):
            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    except Exception:
        # Best-effort: never let log configuration break the caller.
        pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _short_endpoint(base_url: str) -> str:
    """Convert base_url to a short host:port label for progress display."""
    if not base_url:
        return "default"
    try:
        # Accessing .port stays inside the try: it raises ValueError for
        # out-of-range or non-numeric ports, in which case we fall back to
        # echoing the raw URL, matching the parse-failure path.
        parsed = urlparse(base_url)
        label = parsed.hostname or base_url
        if parsed.port:
            label = f"{label}:{parsed.port}"
        return label
    except Exception:
        return base_url
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _progress_put(progress: Any, item: Any) -> None:
    """Best-effort put to a queue-like progress sink."""
    if progress is None:
        return
    put = getattr(progress, "put", None)
    if put is None:
        # Sink does not expose a put() method; silently ignore.
        return
    try:
        put(item)
    except Exception:
        # Progress reporting must never interfere with the caller.
        pass
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class EndpointConfig:
    """Configuration for an API endpoint."""

    # OpenAI-compatible API base URL; "" means the OpenAI client default.
    base_url: str
    # API keys rotated round-robin by MultiEndpointManager for this endpoint.
    api_keys: list[str]
    # Per-request timeout in seconds passed to the OpenAI client.
    timeout: int = 120
    # Retry budget associated with this endpoint's configuration.
    max_retries: int = 3
    # NOTE(review): weight is never consulted by MultiEndpointManager in this
    # module — confirm whether weighted selection is implemented elsewhere.
    weight: float = 1.0
    # Initial disabled state; the manager may re-enable after a cooldown.
    disabled: bool = False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class MultiEndpointManager:
    """
    Manages multiple API endpoints with automatic failover and load balancing.

    Supports:
    - Round-robin endpoint selection
    - Per-endpoint API key rotation
    - Automatic endpoint disable/re-enable on errors
    - Thread-safe operations

    Disabled endpoints are re-enabled after a 5-minute cooldown (see
    ``get_client``) or immediately on a recorded success.
    """

    def __init__(self, endpoint_configs: list[EndpointConfig], timeout: int = 120):
        """
        Initialize multi-endpoint manager.

        Args:
            endpoint_configs: List of endpoint configurations
            timeout: Default timeout for API calls

        Raises:
            ValueError: If no config has at least one API key
        """
        self.timeout = timeout
        # Mutable per-endpoint state; every access is guarded by self.lock.
        self.endpoints: list[dict[str, Any]] = []

        for config in endpoint_configs:
            # An endpoint without keys can never authenticate; drop it up front.
            if not config.api_keys:
                continue

            self.endpoints.append({
                "config": config,
                "key_index": 0,  # Current API key index for round-robin
                "error_count": 0,  # Incremented on failure, decremented on success
                "last_error_time": 0.0,  # time.time() of the most recent failure
                "disabled": config.disabled,
            })

        if not self.endpoints:
            raise ValueError("At least one valid endpoint configuration is required")

        # Round-robin cursor into self.endpoints.
        self.current_endpoint_index = 0
        self.lock = threading.Lock()

    def get_client(self) -> tuple[OpenAI, EndpointConfig, str]:
        """
        Get a client from available endpoints (thread-safe round-robin).

        Returns:
            Tuple of (OpenAI client, EndpointConfig, api_key used)

        Raises:
            RuntimeError: If all endpoints are disabled
        """
        with self.lock:
            # Check if any endpoints are available
            if not any(not ep["disabled"] for ep in self.endpoints):
                # Try to re-enable endpoints that have been disabled for > 5 minutes
                for ep in self.endpoints:
                    if ep["disabled"] and time.time() - ep["last_error_time"] > 300:
                        ep["disabled"] = False
                        ep["error_count"] = 0

                if not any(not ep["disabled"] for ep in self.endpoints):
                    raise RuntimeError("All endpoints are temporarily disabled")

            # Scan at most one full cycle of endpoints for a usable one.
            attempts = 0
            while attempts < len(self.endpoints):
                endpoint = self.endpoints[self.current_endpoint_index]
                self.current_endpoint_index = (self.current_endpoint_index + 1) % len(self.endpoints)

                # Skip disabled endpoints
                if endpoint["disabled"]:
                    # Check if we should re-enable (same 5-minute cooldown);
                    # if re-enabled, fall through and use this endpoint now.
                    if time.time() - endpoint["last_error_time"] > 300:
                        endpoint["disabled"] = False
                        endpoint["error_count"] = 0
                    else:
                        attempts += 1
                        continue

                config = endpoint["config"]

                # Get API key with round-robin
                api_key = config.api_keys[endpoint["key_index"]]
                endpoint["key_index"] = (endpoint["key_index"] + 1) % len(config.api_keys)

                try:
                    # NOTE: self.timeout only applies when config.timeout is
                    # falsy (e.g. 0); EndpointConfig defaults timeout to 120.
                    client = OpenAI(
                        api_key=api_key,
                        base_url=config.base_url,
                        timeout=config.timeout or self.timeout,
                    )
                    return client, config, api_key
                except Exception as e:
                    logger.warning(f"Failed to create client for {config.base_url}: {e}")
                    attempts += 1

            raise RuntimeError("No available endpoints")

    def record_success(self, config: EndpointConfig) -> None:
        """
        Record a successful API call for an endpoint.

        Args:
            config: The endpoint config that succeeded
        """
        with self.lock:
            # Endpoints are matched by base_url, not identity.
            for endpoint in self.endpoints:
                if endpoint["config"].base_url == config.base_url:
                    # Reduce error count on success
                    if endpoint["error_count"] > 0:
                        endpoint["error_count"] = max(0, endpoint["error_count"] - 1)
                    # Re-enable if disabled
                    if endpoint["disabled"]:
                        endpoint["disabled"] = False
                    break

    def record_failure(self, config: EndpointConfig, error_type: str = "generic") -> None:
        """
        Record a failed API call for an endpoint.

        Args:
            config: The endpoint config that failed
            error_type: Type of error ('auth', 'rate_limit', 'timeout', 'generic')
        """
        # NOTE(review): error_type is accepted but not used to vary behavior
        # here — all error kinds count equally toward the disable threshold.
        with self.lock:
            for endpoint in self.endpoints:
                if endpoint["config"].base_url == config.base_url:
                    endpoint["error_count"] += 1
                    endpoint["last_error_time"] = time.time()

                    # Disable endpoint after 3 consecutive errors
                    if endpoint["error_count"] >= 3:
                        endpoint["disabled"] = True
                        logger.warning(
                            f"Endpoint {config.base_url} disabled after {endpoint['error_count']} errors"
                        )
                    break

    def get_stats(self) -> dict[str, Any]:
        """Get statistics about endpoint usage."""
        with self.lock:
            # Snapshot taken under the lock so counts are mutually consistent.
            stats = {
                "total_endpoints": len(self.endpoints),
                "enabled_endpoints": sum(1 for ep in self.endpoints if not ep["disabled"]),
                "disabled_endpoints": sum(1 for ep in self.endpoints if ep["disabled"]),
                "endpoints": [],
            }

            for endpoint in self.endpoints:
                stats["endpoints"].append({
                    "base_url": endpoint["config"].base_url,
                    "enabled": not endpoint["disabled"],
                    "error_count": endpoint["error_count"],
                    "num_keys": len(endpoint["config"].api_keys),
                })

            return stats
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class VLMJudge:
    """
    VLM Judge class for calling vision-language models via OpenAI-compatible API.

    Supports:
    - Greedy mode (temperature=0) for reproducible results
    - Multi-endpoint with automatic failover
    - Per-endpoint API key rotation
    - Exponential backoff retry mechanism
    """

    DEFAULT_MODEL = "Qwen/Qwen3-VL-32B-Instruct-FP8"

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        temperature: float = 0.0,
        timeout: int = 120,
        max_retries: int = 3,
        base_url: Optional[str] = None,
        base_urls: Optional[Union[str, list[str]]] = None,
        api_key: Optional[str] = None,
        api_keys: Optional[Union[str, list[str]]] = None,
        progress: Any = None,
    ):
        """
        Initialize the VLM Judge.

        Args:
            model: Model name to use for evaluation
            temperature: Sampling temperature (0 for greedy/deterministic)
            timeout: API call timeout in seconds
            max_retries: Maximum number of retry attempts
            base_url: Single OpenAI API base URL (legacy, use base_urls for multi-endpoint)
            base_urls: Multiple base URLs (comma-separated string or list)
            api_key: Single API key (legacy, use api_keys for multiple)
            api_keys: Multiple API keys (comma-separated string or list)
            progress: Optional queue-like sink for progress events

        Raises:
            ValueError: If no API key can be resolved from parameters or environment
        """
        self.model = model
        self.temperature = temperature
        self.timeout = timeout
        self.max_retries = max_retries
        self._progress = progress

        # Avoid printing per-request httpx INFO lines to stdout.
        # (Idempotent; a second conditional call when progress is set is not needed.)
        _silence_http_client_logs()

        # Parse base URLs (priority: base_urls > base_url > env vars).
        self._base_urls = self._parse_urls(base_urls, base_url, "OPENAI_BASE_URLS", "OPENAI_BASE_URL")

        # Parse API keys. Check OPENAI_API_KEYS (documented multi-key env var)
        # before falling back to OPENAI_API_KEY — previously only the latter
        # was read, so the documented OPENAI_API_KEYS was silently ignored.
        self._api_keys = (
            self._parse_keys(api_keys, api_key, "OPENAI_API_KEYS")
            or self._parse_keys(api_keys, api_key, "OPENAI_API_KEY")
        )

        if not self._api_keys:
            raise ValueError(
                "API key must be provided via api_key/api_keys parameter or "
                "OPENAI_API_KEY environment variable"
            )

        # Build endpoint configs
        endpoint_configs = self._build_endpoint_configs()

        # Initialize multi-endpoint manager
        self.endpoint_manager = MultiEndpointManager(endpoint_configs, timeout=timeout)

        # For backward compatibility and logging
        self.base_url = self._base_urls[0] if self._base_urls else None
        self.api_key = self._api_keys[0] if self._api_keys else None

    def set_progress(self, progress: Any) -> None:
        """Attach a progress sink (queue-like) for emitting request events."""
        self._progress = progress
        if self._progress is not None:
            _silence_http_client_logs()

    def _progress_event(self, msg: str) -> None:
        """Emit a short request event for progress UI."""
        _progress_put(self._progress, ("log", msg))

    def _parse_urls(
        self,
        urls: Optional[Union[str, list[str]]],
        single_url: Optional[str],
        env_multi: str,
        env_single: str,
    ) -> list[str]:
        """
        Parse base URLs from various sources.

        Priority: urls param > single_url param > env_multi / env_single vars.
        Comma-separated strings are split, stripped, and blanks dropped.
        """
        if urls:
            if isinstance(urls, str):
                return [u.strip() for u in urls.split(",") if u.strip()]
            return list(urls)

        if single_url:
            return [single_url]

        # Try environment variables
        env_urls = os.environ.get(env_multi) or os.environ.get(env_single)
        if env_urls:
            return [u.strip() for u in env_urls.split(",") if u.strip()]

        return []

    def _parse_keys(
        self,
        keys: Optional[Union[str, list[str]]],
        single_key: Optional[str],
        env_name: str,
    ) -> list[str]:
        """
        Parse API keys from various sources.

        Priority: keys param > single_key param > env var named ``env_name``.
        Comma-separated strings are split, stripped, and blanks dropped.
        """
        if keys:
            if isinstance(keys, str):
                return [k.strip() for k in keys.split(",") if k.strip()]
            return list(keys)

        if single_key:
            return [single_key]

        # Try environment variable
        env_keys = os.environ.get(env_name)
        if env_keys:
            return [k.strip() for k in env_keys.split(",") if k.strip()]

        return []

    def _build_endpoint_configs(self) -> list[EndpointConfig]:
        """
        Build endpoint configurations from parsed URLs and keys.

        With no base URL, a single default endpoint holds all keys. With
        multiple URLs, keys are partitioned as evenly as possible across
        endpoints (earlier endpoints absorb the remainder); an endpoint that
        would receive no keys shares the full key list instead.
        """
        configs = []

        if not self._base_urls:
            # No base URL specified, create single endpoint with all keys
            configs.append(EndpointConfig(
                base_url="",  # Will use OpenAI default
                api_keys=self._api_keys,
                timeout=self.timeout,
                max_retries=self.max_retries,
            ))
        else:
            # Distribute API keys among endpoints
            num_urls = len(self._base_urls)
            num_keys = len(self._api_keys)
            keys_per_endpoint = max(1, num_keys // num_urls)
            remainder = num_keys % num_urls

            key_index = 0
            for i, url in enumerate(self._base_urls):
                # Earlier endpoints take one extra key each until the
                # remainder is exhausted.
                num_keys_for_endpoint = keys_per_endpoint
                if i < remainder:
                    num_keys_for_endpoint += 1

                endpoint_keys = self._api_keys[key_index:key_index + num_keys_for_endpoint]
                if not endpoint_keys:
                    # More URLs than keys: share all keys with this endpoint.
                    endpoint_keys = self._api_keys
                key_index += num_keys_for_endpoint

                configs.append(EndpointConfig(
                    base_url=url,
                    api_keys=endpoint_keys,
                    timeout=self.timeout,
                    max_retries=self.max_retries,
                ))

        return configs

    def call(self, messages: list[dict[str, Any]]) -> str:
        """
        Call the VLM API and return the response text.

        Uses multi-endpoint failover and exponential backoff for retries.

        Args:
            messages: List of message dicts in OpenAI Chat Completion format

        Returns:
            Raw response text from the VLM

        Raises:
            Exception: If all retry attempts fail (chained to the last error)
        """
        last_exception: Optional[Exception] = None

        for attempt in range(self.max_retries):
            # Track the endpoint used by this attempt explicitly so the
            # except block can attribute the failure. (Replaces the fragile
            # `"config" in dir()` local-variable probe.)
            config: Optional[EndpointConfig] = None
            endpoint_label = "unknown"
            try:
                # Get client from endpoint manager
                client, config, _ = self.endpoint_manager.get_client()
                endpoint_label = _short_endpoint(getattr(config, "base_url", "") or "")

                response = client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=self.temperature,
                    max_tokens=4096,
                )

                # Extract text from response
                if response.choices and len(response.choices) > 0:
                    content = response.choices[0].message.content
                    if content:
                        # Record success
                        self.endpoint_manager.record_success(config)
                        self._progress_event(f"OK {endpoint_label}")
                        return content
                    else:
                        raise ValueError("Empty response content from VLM")
                else:
                    raise ValueError("No choices in VLM response")

            except Exception as e:
                last_exception = e

                # Record failure for the endpoint, if one was acquired.
                if config is not None:
                    error_type = self._classify_error(e)
                    self.endpoint_manager.record_failure(config, error_type)
                    self._progress_event(f"ERR {endpoint_label} {error_type}")
                else:
                    self._progress_event(f"ERR {endpoint_label} generic")

                logger.warning(
                    f"VLM call attempt {attempt + 1}/{self.max_retries} failed: {e}"
                )

                # Exponential backoff: 1s, 2s, 4s, ...
                if attempt < self.max_retries - 1:
                    wait_time = 2 ** attempt
                    logger.info(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)

        # All retries failed; chain the last error for full tracebacks.
        raise Exception(
            f"VLM call failed after {self.max_retries} attempts. "
            f"Last error: {last_exception}"
        ) from last_exception

    def _classify_error(self, error: Exception) -> str:
        """Classify an error for endpoint management.

        Returns one of 'auth', 'rate_limit', 'timeout', or 'generic' based on
        a case-insensitive scan of the error message.
        """
        error_str = str(error).lower()

        if "401" in error_str or "unauthorized" in error_str or "invalid api key" in error_str:
            return "auth"
        elif "429" in error_str or "rate limit" in error_str:
            return "rate_limit"
        elif "timeout" in error_str:
            return "timeout"
        else:
            return "generic"

    def call_with_raw(
        self,
        messages: list[dict[str, Any]]
    ) -> tuple[str, Optional[Exception]]:
        """
        Call the VLM API and return both response and any error.

        This variant returns the error instead of raising it,
        useful for audit logging.

        Args:
            messages: List of message dicts in OpenAI Chat Completion format

        Returns:
            Tuple of (response_text, error) where error is None on success
        """
        try:
            response = self.call(messages)
            return response, None
        except Exception as e:
            return "", e

    def get_endpoint_stats(self) -> dict[str, Any]:
        """Get statistics about endpoint usage."""
        return self.endpoint_manager.get_stats()

    @property
    def config(self) -> dict[str, Any]:
        """
        Get the judge configuration for logging/persistence.

        Returns:
            Dict with model, temperature, timeout, max_retries, and endpoint info
        """
        return {
            "model": self.model,
            "temperature": self.temperature,
            "timeout": self.timeout,
            "max_retries": self.max_retries,
            "base_urls": self._base_urls,
            "num_api_keys": len(self._api_keys),
            "endpoint_stats": self.get_endpoint_stats(),
        }
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genarena
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: GenArena Arena Evaluation - VLM-based pairwise image generation evaluation
|
|
5
|
+
Author: GenArena Team
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: datasets>=2.0.0
|
|
17
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
18
|
+
Requires-Dist: json-repair>=0.25.0
|
|
19
|
+
Requires-Dist: openai>=1.0.0
|
|
20
|
+
Requires-Dist: pandas>=2.0.0
|
|
21
|
+
Requires-Dist: pillow>=9.0.0
|
|
22
|
+
Requires-Dist: pyarrow>=12.0.0
|
|
23
|
+
Requires-Dist: tqdm>=4.65.0
|
|
24
|
+
Provides-Extra: web
|
|
25
|
+
Requires-Dist: flask>=2.0.0; extra == 'web'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# GenArena
|
|
29
|
+
|
|
30
|
+
A unified evaluation framework for visual generation tasks using VLM-based pairwise comparison and Elo ranking.
|
|
31
|
+
|
|
32
|
+
[](https://arxiv.org/abs/2602.XXXXX)
|
|
33
|
+
[](https://genarena.github.io)
|
|
34
|
+
[](https://huggingface.co/spaces/genarena/leaderboard)
|
|
35
|
+
[](https://huggingface.co/datasets/rhli/genarena)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
## Abstract
|
|
39
|
+
|
|
40
|
+
The rapid advancement of visual generation models has outpaced traditional evaluation approaches, necessitating the adoption of Vision-Language Models as surrogate judges. In this work, we systematically investigate the reliability of the prevailing absolute pointwise scoring standard, across a wide spectrum of visual generation tasks. Our analysis reveals that this paradigm is limited due to stochastic inconsistency and poor alignment with human perception. To resolve these limitations, we introduce **GenArena**, a unified evaluation framework that leverages a *pairwise comparison* paradigm to ensure stable and human-aligned evaluation. Crucially, our experiments uncover a transformative finding that simply adopting this pairwise protocol enables off-the-shelf open-source models to outperform top-tier proprietary models. Notably, our method boosts evaluation accuracy by over 20% and achieves a Spearman correlation of 0.86 with the authoritative LMArena leaderboard, drastically surpassing the 0.36 correlation of pointwise methods. Based on GenArena, we benchmark state-of-the-art visual generation models across diverse tasks, providing the community with a rigorous and automated evaluation standard for visual generation.
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
### Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install genarena
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or install from source:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
git clone https://github.com/ruihanglix/genarena.git
|
|
54
|
+
cd genarena
|
|
55
|
+
pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Initialize Arena
|
|
59
|
+
|
|
60
|
+
Download benchmark data and official arena data with one command:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
genarena init --arena_dir ./arena --data_dir ./data
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
This downloads:
|
|
67
|
+
- Benchmark Parquet data from `rhli/genarena` (HuggingFace)
|
|
68
|
+
- Official arena data (model outputs + battle logs) from `rhli/genarena-battlefield`
|
|
69
|
+
|
|
70
|
+
### Environment Setup
|
|
71
|
+
|
|
72
|
+
Set your VLM API credentials:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
export OPENAI_API_KEY="your-api-key"
|
|
76
|
+
export OPENAI_BASE_URL="https://api.example.com/v1"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For multi-endpoint support (load balancing and failover), use comma-separated values:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export OPENAI_BASE_URLS="https://api1.example.com/v1,https://api2.example.com/v1"
|
|
83
|
+
export OPENAI_API_KEYS="key1,key2,key3"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Run Evaluation
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
genarena run --arena_dir ./arena --data_dir ./data
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### View Leaderboard
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
genarena leaderboard --arena_dir ./arena --subset basic
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Check Status
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
genarena status --arena_dir ./arena --data_dir ./data
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Running Your Own Experiments
|
|
105
|
+
|
|
106
|
+
### Directory Structure
|
|
107
|
+
|
|
108
|
+
To add your own model for evaluation, organize outputs in the following structure:
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
arena_dir/
|
|
112
|
+
└── <subset>/
|
|
113
|
+
└── models/
|
|
114
|
+
└── <GithubID>_<modelName>_<yyyymmdd>/
|
|
115
|
+
└── <model_name>/
|
|
116
|
+
├── 000000.png
|
|
117
|
+
├── 000001.png
|
|
118
|
+
└── ...
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
For example:
|
|
122
|
+
```
|
|
123
|
+
arena/basic/models/johndoe_MyNewModel_20260205/MyNewModel/
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Generate Images with Diffgentor
|
|
127
|
+
|
|
128
|
+
Use [Diffgentor](https://github.com/ruihanglix/diffgentor) to batch generate images for evaluation:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# Download benchmark data
|
|
132
|
+
hf download rhli/genarena --repo-type dataset --local-dir ./data
|
|
133
|
+
|
|
134
|
+
# Generate images with your model
|
|
135
|
+
diffgentor edit --backend diffusers \
|
|
136
|
+
--model_name YourModel \
|
|
137
|
+
--input ./data/basic/ \
|
|
138
|
+
--output_dir ./arena/basic/models/yourname_YourModel_20260205/YourModel/
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Run Battles for New Models
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
genarena run --arena_dir ./arena --data_dir ./data \
|
|
145
|
+
--subset basic \
|
|
146
|
+
--exp_name yourname_YourModel_20260205
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
GenArena automatically detects new models and schedules battles against existing models.
|
|
150
|
+
|
|
151
|
+
## Submit to Official Leaderboard
|
|
152
|
+
|
|
153
|
+
> **Coming Soon**: The `genarena submit` command will allow you to submit your evaluation results to the official GenArena leaderboard via GitHub PR.
|
|
154
|
+
|
|
155
|
+
The workflow will be:
|
|
156
|
+
1. Run evaluation locally with `genarena run`
|
|
157
|
+
2. Upload results to your HuggingFace repository
|
|
158
|
+
3. Submit via `genarena submit` which creates a PR for review
|
|
159
|
+
|
|
160
|
+
## Documentation
|
|
161
|
+
|
|
162
|
+
| Document | Description |
|
|
163
|
+
|----------|-------------|
|
|
164
|
+
| [Quick Start](./docs/quickstart.md) | Installation and basic usage guide |
|
|
165
|
+
| [Architecture](./docs/architecture.md) | System design and key concepts |
|
|
166
|
+
| [CLI Reference](./docs/cli-reference.md) | Complete command-line interface documentation |
|
|
167
|
+
| [Experiment Management](./docs/experiments.md) | How to organize and manage experiments |
|
|
168
|
+
| [FAQ](./docs/faq.md) | Frequently asked questions |
|
|
169
|
+
|
|
170
|
+
## Citation
|
|
171
|
+
|
|
172
|
+
```bibtex
|
|
173
|
+
TBD
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
Apache License 2.0
|