genarena-0.0.1-py3-none-any.whl → genarena-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/vlm.py ADDED
@@ -0,0 +1,519 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""VLM API calling module with retry and multi-endpoint support."""
+
+import logging
+import os
+import threading
+import time
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+from urllib.parse import urlparse
+
+from openai import OpenAI
+
+
+logger = logging.getLogger(__name__)
+
+
+def _silence_http_client_logs() -> None:
+    """Silence noisy HTTP client logs (e.g., httpx INFO request lines)."""
+    try:
+        logging.getLogger("httpx").setLevel(logging.WARNING)
+        logging.getLogger("httpcore").setLevel(logging.WARNING)
+    except Exception:
+        pass
+
+
+def _short_endpoint(base_url: str) -> str:
+    """Convert base_url to a short host:port label for progress display."""
+    if not base_url:
+        return "default"
+    try:
+        u = urlparse(base_url)
+        host = u.hostname or base_url
+        port = f":{u.port}" if u.port else ""
+        return f"{host}{port}"
+    except Exception:
+        return base_url
+
+
+def _progress_put(progress: Any, item: Any) -> None:
+    """Best-effort put to a queue-like progress sink."""
+    if progress is None:
+        return
+    if hasattr(progress, "put"):
+        try:
+            progress.put(item)
+        except Exception:
+            pass
+
+
+@dataclass
+class EndpointConfig:
+    """Configuration for an API endpoint."""
+
+    base_url: str
+    api_keys: list[str]
+    timeout: int = 120
+    max_retries: int = 3
+    weight: float = 1.0
+    disabled: bool = False
+
+
+class MultiEndpointManager:
+    """
+    Manages multiple API endpoints with automatic failover and load balancing.
+
+    Supports:
+    - Round-robin endpoint selection
+    - Per-endpoint API key rotation
+    - Automatic endpoint disable/re-enable on errors
+    - Thread-safe operations
+    """
+
+    def __init__(self, endpoint_configs: list[EndpointConfig], timeout: int = 120):
+        """
+        Initialize multi-endpoint manager.
+
+        Args:
+            endpoint_configs: List of endpoint configurations
+            timeout: Default timeout for API calls
+        """
+        self.timeout = timeout
+        self.endpoints: list[dict[str, Any]] = []
+
+        for config in endpoint_configs:
+            if not config.api_keys:
+                continue
+
+            self.endpoints.append({
+                "config": config,
+                "key_index": 0,  # Current API key index for round-robin
+                "error_count": 0,
+                "last_error_time": 0.0,
+                "disabled": config.disabled,
+            })
+
+        if not self.endpoints:
+            raise ValueError("At least one valid endpoint configuration is required")
+
+        self.current_endpoint_index = 0
+        self.lock = threading.Lock()
+
+    def get_client(self) -> tuple[OpenAI, EndpointConfig, str]:
+        """
+        Get a client from available endpoints (thread-safe round-robin).
+
+        Returns:
+            Tuple of (OpenAI client, EndpointConfig, api_key used)
+
+        Raises:
+            RuntimeError: If all endpoints are disabled
+        """
+        with self.lock:
+            # Check if any endpoints are available
+            if not any(not ep["disabled"] for ep in self.endpoints):
+                # Try to re-enable endpoints that have been disabled for > 5 minutes
+                for ep in self.endpoints:
+                    if ep["disabled"] and time.time() - ep["last_error_time"] > 300:
+                        ep["disabled"] = False
+                        ep["error_count"] = 0
+
+                if not any(not ep["disabled"] for ep in self.endpoints):
+                    raise RuntimeError("All endpoints are temporarily disabled")
+
+            attempts = 0
+            while attempts < len(self.endpoints):
+                endpoint = self.endpoints[self.current_endpoint_index]
+                self.current_endpoint_index = (self.current_endpoint_index + 1) % len(self.endpoints)
+
+                # Skip disabled endpoints
+                if endpoint["disabled"]:
+                    # Check if we should re-enable
+                    if time.time() - endpoint["last_error_time"] > 300:
+                        endpoint["disabled"] = False
+                        endpoint["error_count"] = 0
+                    else:
+                        attempts += 1
+                        continue
+
+                config = endpoint["config"]
+
+                # Get API key with round-robin
+                api_key = config.api_keys[endpoint["key_index"]]
+                endpoint["key_index"] = (endpoint["key_index"] + 1) % len(config.api_keys)
+
+                try:
+                    client = OpenAI(
+                        api_key=api_key,
+                        base_url=config.base_url,
+                        timeout=config.timeout or self.timeout,
+                    )
+                    return client, config, api_key
+                except Exception as e:
+                    logger.warning(f"Failed to create client for {config.base_url}: {e}")
+                    attempts += 1
+
+            raise RuntimeError("No available endpoints")
+
+    def record_success(self, config: EndpointConfig) -> None:
+        """
+        Record a successful API call for an endpoint.
+
+        Args:
+            config: The endpoint config that succeeded
+        """
+        with self.lock:
+            for endpoint in self.endpoints:
+                if endpoint["config"].base_url == config.base_url:
+                    # Reduce error count on success
+                    if endpoint["error_count"] > 0:
+                        endpoint["error_count"] = max(0, endpoint["error_count"] - 1)
+                    # Re-enable if disabled
+                    if endpoint["disabled"]:
+                        endpoint["disabled"] = False
+                    break
+
+    def record_failure(self, config: EndpointConfig, error_type: str = "generic") -> None:
+        """
+        Record a failed API call for an endpoint.
+
+        Args:
+            config: The endpoint config that failed
+            error_type: Type of error ('auth', 'rate_limit', 'timeout', 'generic')
+        """
+        with self.lock:
+            for endpoint in self.endpoints:
+                if endpoint["config"].base_url == config.base_url:
+                    endpoint["error_count"] += 1
+                    endpoint["last_error_time"] = time.time()
+
+                    # Disable endpoint after 3 consecutive errors
+                    if endpoint["error_count"] >= 3:
+                        endpoint["disabled"] = True
+                        logger.warning(
+                            f"Endpoint {config.base_url} disabled after {endpoint['error_count']} errors"
+                        )
+                    break
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get statistics about endpoint usage."""
+        with self.lock:
+            stats = {
+                "total_endpoints": len(self.endpoints),
+                "enabled_endpoints": sum(1 for ep in self.endpoints if not ep["disabled"]),
+                "disabled_endpoints": sum(1 for ep in self.endpoints if ep["disabled"]),
+                "endpoints": [],
+            }
+
+            for endpoint in self.endpoints:
+                stats["endpoints"].append({
+                    "base_url": endpoint["config"].base_url,
+                    "enabled": not endpoint["disabled"],
+                    "error_count": endpoint["error_count"],
+                    "num_keys": len(endpoint["config"].api_keys),
+                })
+
+            return stats
+
+
+class VLMJudge:
+    """
+    VLM Judge class for calling vision-language models via OpenAI-compatible API.
+
+    Supports:
+    - Greedy mode (temperature=0) for reproducible results
+    - Multi-endpoint with automatic failover
+    - Per-endpoint API key rotation
+    - Exponential backoff retry mechanism
+    """
+
+    DEFAULT_MODEL = "Qwen/Qwen3-VL-32B-Instruct-FP8"
+
+    def __init__(
+        self,
+        model: str = DEFAULT_MODEL,
+        temperature: float = 0.0,
+        timeout: int = 120,
+        max_retries: int = 3,
+        base_url: Optional[str] = None,
+        base_urls: Optional[Union[str, list[str]]] = None,
+        api_key: Optional[str] = None,
+        api_keys: Optional[Union[str, list[str]]] = None,
+        progress: Any = None,
+    ):
+        """
+        Initialize the VLM Judge.
+
+        Args:
+            model: Model name to use for evaluation
+            temperature: Sampling temperature (0 for greedy/deterministic)
+            timeout: API call timeout in seconds
+            max_retries: Maximum number of retry attempts
+            base_url: Single OpenAI API base URL (legacy, use base_urls for multi-endpoint)
+            base_urls: Multiple base URLs (comma-separated string or list)
+            api_key: Single API key (legacy, use api_keys for multiple)
+            api_keys: Multiple API keys (comma-separated string or list)
+            progress: Optional queue-like sink for progress events
+        """
+        self.model = model
+        self.temperature = temperature
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self._progress = progress
+
+        # Avoid printing per-request httpx INFO lines to stdout.
+        _silence_http_client_logs()
+
+        # Parse base URLs
+        self._base_urls = self._parse_urls(base_urls, base_url, "OPENAI_BASE_URLS", "OPENAI_BASE_URL")
+
+        # Parse API keys
+        self._api_keys = self._parse_keys(api_keys, api_key, "OPENAI_API_KEY")
+
+        if not self._api_keys:
+            raise ValueError(
+                "API key must be provided via api_key/api_keys parameter or "
+                "OPENAI_API_KEY environment variable"
+            )
+
+        # If progress is enabled, silence noisy per-request httpx logs.
+        if self._progress is not None:
+            _silence_http_client_logs()
+
+        # Build endpoint configs
+        endpoint_configs = self._build_endpoint_configs()
+
+        # Initialize multi-endpoint manager
+        self.endpoint_manager = MultiEndpointManager(endpoint_configs, timeout=timeout)
+
+        # For backward compatibility and logging
+        self.base_url = self._base_urls[0] if self._base_urls else None
+        self.api_key = self._api_keys[0] if self._api_keys else None
+
+    def set_progress(self, progress: Any) -> None:
+        """Attach a progress sink (queue-like) for emitting request events."""
+        self._progress = progress
+        if self._progress is not None:
+            _silence_http_client_logs()
+
+    def _progress_event(self, msg: str) -> None:
+        """Emit a short request event for progress UI."""
+        _progress_put(self._progress, ("log", msg))
+
+    def _parse_urls(
+        self,
+        urls: Optional[Union[str, list[str]]],
+        single_url: Optional[str],
+        env_multi: str,
+        env_single: str,
+    ) -> list[str]:
+        """Parse base URLs from various sources."""
+        # Priority: urls param > single_url param > env vars
+        if urls:
+            if isinstance(urls, str):
+                return [u.strip() for u in urls.split(",") if u.strip()]
+            return list(urls)
+
+        if single_url:
+            return [single_url]
+
+        # Try environment variables
+        env_urls = os.environ.get(env_multi) or os.environ.get(env_single)
+        if env_urls:
+            return [u.strip() for u in env_urls.split(",") if u.strip()]
+
+        return []
+
+    def _parse_keys(
+        self,
+        keys: Optional[Union[str, list[str]]],
+        single_key: Optional[str],
+        env_name: str,
+    ) -> list[str]:
+        """Parse API keys from various sources."""
+        # Priority: keys param > single_key param > env var
+        if keys:
+            if isinstance(keys, str):
+                return [k.strip() for k in keys.split(",") if k.strip()]
+            return list(keys)
+
+        if single_key:
+            return [single_key]
+
+        # Try environment variable
+        env_keys = os.environ.get(env_name)
+        if env_keys:
+            return [k.strip() for k in env_keys.split(",") if k.strip()]
+
+        return []
+
+    def _build_endpoint_configs(self) -> list[EndpointConfig]:
+        """Build endpoint configurations from URLs and keys."""
+        configs = []
+
+        if not self._base_urls:
+            # No base URL specified, create single endpoint with all keys
+            configs.append(EndpointConfig(
+                base_url="",  # Will use OpenAI default
+                api_keys=self._api_keys,
+                timeout=self.timeout,
+                max_retries=self.max_retries,
+            ))
+        else:
+            # Distribute API keys among endpoints
+            num_urls = len(self._base_urls)
+            num_keys = len(self._api_keys)
+            keys_per_endpoint = max(1, num_keys // num_urls)
+            remainder = num_keys % num_urls
+
+            key_index = 0
+            for i, url in enumerate(self._base_urls):
+                # Calculate keys for this endpoint
+                num_keys_for_endpoint = keys_per_endpoint
+                if i < remainder:
+                    num_keys_for_endpoint += 1
+
+                endpoint_keys = self._api_keys[key_index:key_index + num_keys_for_endpoint]
+                if not endpoint_keys:
+                    # If no keys left, use all keys
+                    endpoint_keys = self._api_keys
+                key_index += num_keys_for_endpoint
+
+                configs.append(EndpointConfig(
+                    base_url=url,
+                    api_keys=endpoint_keys,
+                    timeout=self.timeout,
+                    max_retries=self.max_retries,
+                ))
+
+        return configs
+
+    def call(self, messages: list[dict[str, Any]]) -> str:
+        """
+        Call the VLM API and return the response text.
+
+        Uses multi-endpoint failover and exponential backoff for retries.
+
+        Args:
+            messages: List of message dicts in OpenAI Chat Completion format
+
+        Returns:
+            Raw response text from the VLM
+
+        Raises:
+            Exception: If all retry attempts fail
+        """
+        last_exception = None
+        config: Optional[EndpointConfig] = None
+
+        for attempt in range(self.max_retries):
+            try:
+                # Get client from endpoint manager
+                client, config, _ = self.endpoint_manager.get_client()
+                endpoint_label = _short_endpoint(getattr(config, "base_url", "") or "")
+
+                response = client.chat.completions.create(
+                    model=self.model,
+                    messages=messages,
+                    temperature=self.temperature,
+                    max_tokens=4096,
+                )
+
+                # Extract text from response
+                if response.choices and len(response.choices) > 0:
+                    content = response.choices[0].message.content
+                    if content:
+                        # Record success
+                        self.endpoint_manager.record_success(config)
+                        self._progress_event(f"OK {endpoint_label}")
+                        return content
+                    else:
+                        raise ValueError("Empty response content from VLM")
+                else:
+                    raise ValueError("No choices in VLM response")
+
+            except Exception as e:
+                last_exception = e
+
+                # Record the failure against the endpoint, if one was selected
+                if config is not None:
+                    endpoint_label = _short_endpoint(getattr(config, "base_url", "") or "")
+                    error_type = self._classify_error(e)
+                    self.endpoint_manager.record_failure(config, error_type)
+                    self._progress_event(f"ERR {endpoint_label} {error_type}")
+                else:
+                    self._progress_event("ERR unknown generic")
+
+                logger.warning(
+                    f"VLM call attempt {attempt + 1}/{self.max_retries} failed: {e}"
+                )
+
+                # Exponential backoff: 1s, 2s, 4s, ...
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.info(f"Retrying in {wait_time} seconds...")
+                    time.sleep(wait_time)
+
+        # All retries failed
+        raise Exception(
+            f"VLM call failed after {self.max_retries} attempts. "
+            f"Last error: {last_exception}"
+        )
+
+    def _classify_error(self, error: Exception) -> str:
+        """Classify an error for endpoint management."""
+        error_str = str(error).lower()
+
+        if "401" in error_str or "unauthorized" in error_str or "invalid api key" in error_str:
+            return "auth"
+        elif "429" in error_str or "rate limit" in error_str:
+            return "rate_limit"
+        elif "timeout" in error_str:
+            return "timeout"
+        else:
+            return "generic"
+
+    def call_with_raw(
+        self,
+        messages: list[dict[str, Any]],
+    ) -> tuple[str, Optional[Exception]]:
+        """
+        Call the VLM API and return both response and any error.
+
+        This variant returns the error instead of raising it,
+        useful for audit logging.
+
+        Args:
+            messages: List of message dicts in OpenAI Chat Completion format
+
+        Returns:
+            Tuple of (response_text, error) where error is None on success
+        """
+        try:
+            response = self.call(messages)
+            return response, None
+        except Exception as e:
+            return "", e
+
+    def get_endpoint_stats(self) -> dict[str, Any]:
+        """Get statistics about endpoint usage."""
+        return self.endpoint_manager.get_stats()
+
+    @property
+    def config(self) -> dict[str, Any]:
+        """
+        Get the judge configuration for logging/persistence.
+
+        Returns:
+            Dict with model, temperature, timeout, max_retries, and endpoint info
+        """
+        return {
+            "model": self.model,
+            "temperature": self.temperature,
+            "timeout": self.timeout,
+            "max_retries": self.max_retries,
+            "base_urls": self._base_urls,
+            "num_api_keys": len(self._api_keys),
+            "endpoint_stats": self.get_endpoint_stats(),
+        }
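For orientation, here is a minimal usage sketch of the `VLMJudge` API added above (illustrative, not part of the package); the endpoints, keys, prompt text, and image URLs are placeholders:

```python
from genarena.vlm import VLMJudge

# Endpoints and keys can also come from the environment
# (OPENAI_BASE_URLS / OPENAI_BASE_URL and comma-separated OPENAI_API_KEY).
judge = VLMJudge(
    base_urls=["https://api1.example.com/v1", "https://api2.example.com/v1"],
    api_keys=["key1", "key2", "key3"],
    temperature=0.0,  # greedy decoding for reproducible judgments
)

# Standard OpenAI chat format with image_url content parts.
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Which image better follows the prompt? Answer A or B."},
        {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
        {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
    ],
}]

text, err = judge.call_with_raw(messages)  # non-raising variant of judge.call()
print(text if err is None else f"failed after retries: {err}")
print(judge.get_endpoint_stats())  # endpoint health and key-rotation stats
```

An endpoint that fails three consecutive calls is disabled for five minutes and then retried, so a long evaluation degrades gracefully rather than aborting on one bad endpoint.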
genarena-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,178 @@
+Metadata-Version: 2.4
+Name: genarena
+Version: 0.1.0
+Summary: GenArena Arena Evaluation - VLM-based pairwise image generation evaluation
+Author: GenArena Team
+License: Apache-2.0
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Requires-Dist: datasets>=2.0.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: json-repair>=0.25.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: pillow>=9.0.0
+Requires-Dist: pyarrow>=12.0.0
+Requires-Dist: tqdm>=4.65.0
+Provides-Extra: web
+Requires-Dist: flask>=2.0.0; extra == 'web'
+Description-Content-Type: text/markdown
+
+# GenArena
+
+A unified evaluation framework for visual generation tasks using VLM-based pairwise comparison and Elo ranking.
+
+[![arXiv](https://img.shields.io/badge/arXiv-2602.XXXXX-b31b1b.svg)](https://arxiv.org/abs/2602.XXXXX)
+[![Project Page](https://img.shields.io/badge/Project-Website-orange)](https://genarena.github.io)
+[![Leaderboard](https://img.shields.io/badge/%F0%9F%8F%86%20Leaderboard-Live-brightgreen)](https://huggingface.co/spaces/genarena/leaderboard)
+[![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-GenArena-yellow)](https://huggingface.co/datasets/rhli/genarena)
+
+
+## Abstract
+
+The rapid advancement of visual generation models has outpaced traditional evaluation approaches, necessitating the adoption of Vision-Language Models (VLMs) as surrogate judges. In this work, we systematically investigate the reliability of the prevailing absolute pointwise scoring standard across a wide spectrum of visual generation tasks. Our analysis reveals that this paradigm is limited by stochastic inconsistency and poor alignment with human perception. To resolve these limitations, we introduce **GenArena**, a unified evaluation framework that leverages a *pairwise comparison* paradigm to ensure stable and human-aligned evaluation. Crucially, our experiments uncover a transformative finding: simply adopting this pairwise protocol enables off-the-shelf open-source models to outperform top-tier proprietary models. Notably, our method boosts evaluation accuracy by over 20% and achieves a Spearman correlation of 0.86 with the authoritative LMArena leaderboard, drastically surpassing the 0.36 correlation of pointwise methods. Based on GenArena, we benchmark state-of-the-art visual generation models across diverse tasks, providing the community with a rigorous and automated evaluation standard for visual generation.
+
+## Quick Start
+
+### Installation
+
+```bash
+pip install genarena
+```
+
+Or install from source:
+
+```bash
+git clone https://github.com/ruihanglix/genarena.git
+cd genarena
+pip install -e .
+```
+
+### Initialize Arena
+
+Download benchmark data and official arena data with one command:
+
+```bash
+genarena init --arena_dir ./arena --data_dir ./data
+```
+
+This downloads:
+- Benchmark Parquet data from `rhli/genarena` (HuggingFace)
+- Official arena data (model outputs + battle logs) from `rhli/genarena-battlefield`
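Both sources live on the Hugging Face Hub, so the fetch step is roughly equivalent to the following sketch (an assumption about what `genarena init` does internally; `repo_type="dataset"` is confirmed for `rhli/genarena` but assumed for `rhli/genarena-battlefield`, and the target directories are illustrative):

```python
from huggingface_hub import snapshot_download

# Benchmark Parquet data -> ./data (mirrors `genarena init --data_dir ./data`)
snapshot_download(repo_id="rhli/genarena", repo_type="dataset", local_dir="./data")

# Official arena data (model outputs + battle logs) -> ./arena
snapshot_download(repo_id="rhli/genarena-battlefield", repo_type="dataset", local_dir="./arena")
```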
+
+### Environment Setup
+
+Set your VLM API credentials:
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+export OPENAI_BASE_URL="https://api.example.com/v1"
+```
+
+For multi-endpoint support (load balancing and failover), use comma-separated values:
+
+```bash
+export OPENAI_BASE_URLS="https://api1.example.com/v1,https://api2.example.com/v1"
+export OPENAI_API_KEYS="key1,key2,key3"
+```
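With the two URLs and three keys above, the judge in `genarena/vlm.py` splits the keys across endpoints before rotating them per endpoint; here is a standalone sketch of the distribution rule from `VLMJudge._build_endpoint_configs`:

```python
# Keys are divided evenly; the remainder goes to the first endpoints.
urls = ["https://api1.example.com/v1", "https://api2.example.com/v1"]
keys = ["key1", "key2", "key3"]

keys_per_endpoint = max(1, len(keys) // len(urls))
remainder = len(keys) % len(urls)

key_index = 0
for i, url in enumerate(urls):
    n = keys_per_endpoint + (1 if i < remainder else 0)
    print(url, keys[key_index:key_index + n])
    key_index += n
# https://api1.example.com/v1 ['key1', 'key2']
# https://api2.example.com/v1 ['key3']
```

Each endpoint then round-robins over its own keys on successive requests.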
+
+### Run Evaluation
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data
+```
+
+### View Leaderboard
+
+```bash
+genarena leaderboard --arena_dir ./arena --subset basic
+```
+
+### Check Status
+
+```bash
+genarena status --arena_dir ./arena --data_dir ./data
+```
+
+## Running Your Own Experiments
+
+### Directory Structure
+
+To add your own model for evaluation, organize outputs in the following structure:
+
+```
+arena_dir/
+└── <subset>/
+    └── models/
+        └── <GithubID>_<modelName>_<yyyymmdd>/
+            └── <model_name>/
+                ├── 000000.png
+                ├── 000001.png
+                └── ...
+```
+
+For example:
+```
+arena/basic/models/johndoe_MyNewModel_20260205/MyNewModel/
+```
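The layout is easy to script; a minimal sketch, where `model_output_dir` is a hypothetical helper (not a genarena API) that mirrors the structure above:

```python
from pathlib import Path

def model_output_dir(arena_dir: str, subset: str, github_id: str,
                     model_name: str, date: str) -> Path:
    """Create <arena_dir>/<subset>/models/<GithubID>_<modelName>_<yyyymmdd>/<model_name>/."""
    exp_name = f"{github_id}_{model_name}_{date}"
    path = Path(arena_dir) / subset / "models" / exp_name / model_name
    path.mkdir(parents=True, exist_ok=True)
    return path

out = model_output_dir("arena", "basic", "johndoe", "MyNewModel", "20260205")
# Per the examples above, images are zero-padded six-digit PNGs:
print(out / f"{0:06d}.png")  # arena/basic/models/johndoe_MyNewModel_20260205/MyNewModel/000000.png
```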
+
+### Generate Images with Diffgentor
+
+Use [Diffgentor](https://github.com/ruihanglix/diffgentor) to batch-generate images for evaluation:
+
+```bash
+# Download benchmark data
+hf download rhli/genarena --repo-type dataset --local-dir ./data
+
+# Generate images with your model
+diffgentor edit --backend diffusers \
+    --model_name YourModel \
+    --input ./data/basic/ \
+    --output_dir ./arena/basic/models/yourname_YourModel_20260205/YourModel/
+```
+
+### Run Battles for New Models
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data \
+    --subset basic \
+    --exp_name yourname_YourModel_20260205
+```
+
+GenArena automatically detects new models and schedules battles against existing models.
+
+## Submit to Official Leaderboard
+
+> **Coming Soon**: The `genarena submit` command will allow you to submit your evaluation results to the official GenArena leaderboard via GitHub PR.
+
+The workflow will be:
+1. Run evaluation locally with `genarena run`
+2. Upload results to your HuggingFace repository
+3. Submit via `genarena submit`, which creates a PR for review
+
+## Documentation
+
+| Document | Description |
+|----------|-------------|
+| [Quick Start](./docs/quickstart.md) | Installation and basic usage guide |
+| [Architecture](./docs/architecture.md) | System design and key concepts |
+| [CLI Reference](./docs/cli-reference.md) | Complete command-line interface documentation |
+| [Experiment Management](./docs/experiments.md) | How to organize and manage experiments |
+| [FAQ](./docs/faq.md) | Frequently asked questions |
+
+## Citation
+
+```bibtex
+TBD
+```
+
+## License
+
+Apache License 2.0