genarena 0.0.1__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena-0.1.1/.github/workflows/publish.yml +72 -0
- genarena-0.1.1/.gitignore +207 -0
- genarena-0.1.1/PKG-INFO +178 -0
- genarena-0.1.1/README.md +151 -0
- genarena-0.1.1/docs/README.md +37 -0
- genarena-0.1.1/docs/architecture.md +380 -0
- genarena-0.1.1/docs/cli-reference.md +471 -0
- genarena-0.1.1/docs/experiments.md +367 -0
- genarena-0.1.1/docs/faq.md +234 -0
- genarena-0.1.1/docs/maintainer-guide/README.md +464 -0
- genarena-0.1.1/docs/maintainer-guide/deploy.md +312 -0
- genarena-0.1.1/docs/quickstart.md +206 -0
- genarena-0.1.1/genarena/__init__.py +50 -0
- genarena-0.1.1/genarena/__main__.py +10 -0
- genarena-0.1.1/genarena/arena.py +1685 -0
- genarena-0.1.1/genarena/battle.py +337 -0
- genarena-0.1.1/genarena/bt_elo.py +507 -0
- genarena-0.1.1/genarena/cli.py +1581 -0
- genarena-0.1.1/genarena/data.py +476 -0
- genarena-0.1.1/genarena/deploy/Dockerfile +22 -0
- genarena-0.1.1/genarena/deploy/README.md +55 -0
- genarena-0.1.1/genarena/deploy/__init__.py +5 -0
- genarena-0.1.1/genarena/deploy/app.py +84 -0
- genarena-0.1.1/genarena/experiments.py +121 -0
- genarena-0.1.1/genarena/leaderboard.py +270 -0
- genarena-0.1.1/genarena/logs.py +409 -0
- genarena-0.1.1/genarena/models.py +412 -0
- genarena-0.1.1/genarena/prompts/__init__.py +127 -0
- genarena-0.1.1/genarena/prompts/mmrb2.py +373 -0
- genarena-0.1.1/genarena/sampling.py +336 -0
- genarena-0.1.1/genarena/state.py +656 -0
- genarena-0.1.1/genarena/sync/__init__.py +105 -0
- genarena-0.1.1/genarena/sync/auto_commit.py +118 -0
- genarena-0.1.1/genarena/sync/deploy_ops.py +543 -0
- genarena-0.1.1/genarena/sync/git_ops.py +422 -0
- genarena-0.1.1/genarena/sync/hf_ops.py +891 -0
- genarena-0.1.1/genarena/sync/init_ops.py +431 -0
- genarena-0.1.1/genarena/sync/packer.py +587 -0
- genarena-0.1.1/genarena/sync/submit.py +837 -0
- genarena-0.1.1/genarena/utils.py +103 -0
- genarena-0.1.1/genarena/validation/__init__.py +19 -0
- genarena-0.1.1/genarena/validation/schema.py +327 -0
- genarena-0.1.1/genarena/validation/validator.py +329 -0
- genarena-0.1.1/genarena/visualize/README.md +148 -0
- genarena-0.1.1/genarena/visualize/__init__.py +14 -0
- genarena-0.1.1/genarena/visualize/app.py +938 -0
- genarena-0.1.1/genarena/visualize/data_loader.py +2430 -0
- genarena-0.1.1/genarena/visualize/static/app.js +3762 -0
- genarena-0.1.1/genarena/visualize/static/model_aliases.json +86 -0
- genarena-0.1.1/genarena/visualize/static/style.css +4104 -0
- genarena-0.1.1/genarena/visualize/templates/index.html +413 -0
- genarena-0.1.1/genarena/vlm.py +519 -0
- genarena-0.1.1/pyproject.toml +48 -0
- genarena-0.1.1/requirements.txt +12 -0
- genarena-0.1.1/setup.py +44 -0
- genarena-0.0.1/PKG-INFO +0 -26
- genarena-0.0.1/README.md +0 -13
- genarena-0.0.1/pyproject.toml +0 -22
- genarena-0.0.1/setup.cfg +0 -4
- genarena-0.0.1/src/genarena/__init__.py +0 -3
- genarena-0.0.1/src/genarena.egg-info/PKG-INFO +0 -26
- genarena-0.0.1/src/genarena.egg-info/SOURCES.txt +0 -7
- genarena-0.0.1/src/genarena.egg-info/dependency_links.txt +0 -1
- genarena-0.0.1/src/genarena.egg-info/top_level.txt +0 -1
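Between the two releases the package moves from a near-empty `src/` layout (a three-line `__init__.py` plus egg-info metadata) to a full flat-layout distribution: CLI, battle orchestration, Bradley-Terry Elo scoring, sync, validation, and visualization modules, along with documentation and a PyPI publishing workflow.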
genarena-0.1.1/.github/workflows/publish.yml ADDED
@@ -0,0 +1,72 @@
+name: Publish to PyPI
+
+on:
+  push:
+    tags:
+      - "v*"  # Trigger on tags like v0.1.0, v1.0.0, etc.
+  workflow_dispatch:
+
+jobs:
+  build:
+    name: Build distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+
+      - name: Build package
+        run: python -m build
+
+      - name: Upload distribution artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  create-release:
+    name: Create GitHub Release
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write  # Required for creating releases
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
+          generate_release_notes: true
+
+  publish-to-pypi:
+    name: Publish to PyPI
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/genarena
+    permissions:
+      id-token: write  # Required for trusted publishing
+
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
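The workflow above publishes on any tag matching `v*`; a minimal sketch of the release flow, assuming the default `origin` remote and that PyPI trusted publishing is already configured for the repository's `pypi` environment:

```bash
# Bump the version in pyproject.toml / setup.py and commit, then:
git tag v0.1.1            # any tag matching "v*" fires the workflow
git push origin v0.1.1    # tag push runs build, GitHub release, and PyPI publish
```

The `workflow_dispatch` trigger additionally allows a manual run from the Actions tab without cutting a tag.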
genarena-0.1.1/.gitignore ADDED
@@ -0,0 +1,207 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
genarena-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,178 @@
+Metadata-Version: 2.4
+Name: genarena
+Version: 0.1.1
+Summary: GenArena Arena Evaluation - VLM-based pairwise image generation evaluation
+Author: GenArena Team
+License: Apache-2.0
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Requires-Dist: datasets>=2.0.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: json-repair>=0.25.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: pillow>=9.0.0
+Requires-Dist: pyarrow>=12.0.0
+Requires-Dist: tqdm>=4.65.0
+Provides-Extra: web
+Requires-Dist: flask>=2.0.0; extra == 'web'
+Description-Content-Type: text/markdown
+
+# GenArena
+
+A unified evaluation framework for visual generation tasks using VLM-based pairwise comparison and Elo ranking.
+
+[](https://arxiv.org/abs/2602.XXXXX)
+[](https://genarena.github.io)
+[](https://huggingface.co/spaces/genarena/leaderboard)
+[](https://huggingface.co/datasets/rhli/genarena)
+
+
+## Abstract
+
+The rapid advancement of visual generation models has outpaced traditional evaluation approaches, necessitating the adoption of Vision-Language Models as surrogate judges. In this work, we systematically investigate the reliability of the prevailing absolute pointwise scoring standard across a wide spectrum of visual generation tasks. Our analysis reveals that this paradigm is limited by stochastic inconsistency and poor alignment with human perception. To resolve these limitations, we introduce **GenArena**, a unified evaluation framework that leverages a *pairwise comparison* paradigm to ensure stable and human-aligned evaluation. Crucially, our experiments uncover a transformative finding: simply adopting this pairwise protocol enables off-the-shelf open-source models to outperform top-tier proprietary models. Notably, our method boosts evaluation accuracy by over 20% and achieves a Spearman correlation of 0.86 with the authoritative LMArena leaderboard, far surpassing the 0.36 correlation of pointwise methods. Based on GenArena, we benchmark state-of-the-art visual generation models across diverse tasks, providing the community with a rigorous and automated evaluation standard for visual generation.
+
+## Quick Start
+
+### Installation
+
+```bash
+pip install genarena
+```
+
+Or install from source:
+
+```bash
+git clone https://github.com/ruihanglix/genarena.git
+cd genarena
+pip install -e .
+```
+
+### Initialize Arena
+
+Download benchmark data and official arena data with one command:
+
+```bash
+genarena init --arena_dir ./arena --data_dir ./data
+```
+
+This downloads:
+- Benchmark Parquet data from `rhli/genarena` (HuggingFace)
+- Official arena data (model outputs + battle logs) from `rhli/genarena-battlefield`
+
+### Environment Setup
+
+Set your VLM API credentials:
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+export OPENAI_BASE_URL="https://api.example.com/v1"
+```
+
+For multi-endpoint support (load balancing and failover), use comma-separated values:
+
+```bash
+export OPENAI_BASE_URLS="https://api1.example.com/v1,https://api2.example.com/v1"
+export OPENAI_API_KEYS="key1,key2,key3"
+```
+
+### Run Evaluation
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data
+```
+
+### View Leaderboard
+
+```bash
+genarena leaderboard --arena_dir ./arena --subset basic
+```
+
+### Check Status
+
+```bash
+genarena status --arena_dir ./arena --data_dir ./data
+```
+
+## Running Your Own Experiments
+
+### Directory Structure
+
+To add your own model for evaluation, organize outputs in the following structure:
+
+```
+arena_dir/
+└── <subset>/
+    └── models/
+        └── <GithubID>_<modelName>_<yyyymmdd>/
+            └── <model_name>/
+                ├── 000000.png
+                ├── 000001.png
+                └── ...
+```
+
+For example:
+```
+arena/basic/models/johndoe_MyNewModel_20260205/MyNewModel/
+```
+
+### Generate Images with Diffgentor
+
+Use [Diffgentor](https://github.com/ruihanglix/diffgentor) to batch generate images for evaluation:
+
+```bash
+# Download benchmark data
+hf download rhli/genarena --repo-type dataset --local-dir ./data
+
+# Generate images with your model
+diffgentor edit --backend diffusers \
+    --model_name YourModel \
+    --input ./data/basic/ \
+    --output_dir ./arena/basic/models/yourname_YourModel_20260205/YourModel/
+```
+
+### Run Battles for New Models
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data \
+    --subset basic \
+    --exp_name yourname_YourModel_20260205
+```
+
+GenArena automatically detects new models and schedules battles against existing models.
+
+## Submit to Official Leaderboard
+
+> **Coming Soon**: The `genarena submit` command will allow you to submit your evaluation results to the official GenArena leaderboard via GitHub PR.
+
+The workflow will be:
+1. Run evaluation locally with `genarena run`
+2. Upload results to your HuggingFace repository
+3. Submit via `genarena submit` which creates a PR for review
+
+## Documentation
+
+| Document | Description |
+|----------|-------------|
+| [Quick Start](./docs/quickstart.md) | Installation and basic usage guide |
+| [Architecture](./docs/architecture.md) | System design and key concepts |
+| [CLI Reference](./docs/cli-reference.md) | Complete command-line interface documentation |
+| [Experiment Management](./docs/experiments.md) | How to organize and manage experiments |
+| [FAQ](./docs/faq.md) | Frequently asked questions |
+
+## Citation
+
+```bibtex
+TBD
+```
+
+## License
+
+Apache License 2.0
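PKG-INFO gates `flask` behind the `web` extra, so the web components are opt-in. With standard pip extras syntax (nothing genarena-specific assumed):

```bash
pip install genarena         # core dependencies only (datasets, openai, pandas, ...)
pip install "genarena[web]"  # also pulls in flask>=2.0.0 for the web components
```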
genarena-0.1.1/README.md ADDED
@@ -0,0 +1,151 @@
+# GenArena
+
+A unified evaluation framework for visual generation tasks using VLM-based pairwise comparison and Elo ranking.
+
+[](https://arxiv.org/abs/2602.XXXXX)
+[](https://genarena.github.io)
+[](https://huggingface.co/spaces/genarena/leaderboard)
+[](https://huggingface.co/datasets/rhli/genarena)
+
+
+## Abstract
+
+The rapid advancement of visual generation models has outpaced traditional evaluation approaches, necessitating the adoption of Vision-Language Models as surrogate judges. In this work, we systematically investigate the reliability of the prevailing absolute pointwise scoring standard across a wide spectrum of visual generation tasks. Our analysis reveals that this paradigm is limited by stochastic inconsistency and poor alignment with human perception. To resolve these limitations, we introduce **GenArena**, a unified evaluation framework that leverages a *pairwise comparison* paradigm to ensure stable and human-aligned evaluation. Crucially, our experiments uncover a transformative finding: simply adopting this pairwise protocol enables off-the-shelf open-source models to outperform top-tier proprietary models. Notably, our method boosts evaluation accuracy by over 20% and achieves a Spearman correlation of 0.86 with the authoritative LMArena leaderboard, far surpassing the 0.36 correlation of pointwise methods. Based on GenArena, we benchmark state-of-the-art visual generation models across diverse tasks, providing the community with a rigorous and automated evaluation standard for visual generation.
+
+## Quick Start
+
+### Installation
+
+```bash
+pip install genarena
+```
+
+Or install from source:
+
+```bash
+git clone https://github.com/ruihanglix/genarena.git
+cd genarena
+pip install -e .
+```
+
+### Initialize Arena
+
+Download benchmark data and official arena data with one command:
+
+```bash
+genarena init --arena_dir ./arena --data_dir ./data
+```
+
+This downloads:
+- Benchmark Parquet data from `rhli/genarena` (HuggingFace)
+- Official arena data (model outputs + battle logs) from `rhli/genarena-battlefield`
+
+### Environment Setup
+
+Set your VLM API credentials:
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+export OPENAI_BASE_URL="https://api.example.com/v1"
+```
+
+For multi-endpoint support (load balancing and failover), use comma-separated values:
+
+```bash
+export OPENAI_BASE_URLS="https://api1.example.com/v1,https://api2.example.com/v1"
+export OPENAI_API_KEYS="key1,key2,key3"
+```
+
+### Run Evaluation
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data
+```
+
+### View Leaderboard
+
+```bash
+genarena leaderboard --arena_dir ./arena --subset basic
+```
+
+### Check Status
+
+```bash
+genarena status --arena_dir ./arena --data_dir ./data
+```
+
+## Running Your Own Experiments
+
+### Directory Structure
+
+To add your own model for evaluation, organize outputs in the following structure:
+
+```
+arena_dir/
+└── <subset>/
+    └── models/
+        └── <GithubID>_<modelName>_<yyyymmdd>/
+            └── <model_name>/
+                ├── 000000.png
+                ├── 000001.png
+                └── ...
+```
+
+For example:
+```
+arena/basic/models/johndoe_MyNewModel_20260205/MyNewModel/
+```
+
+### Generate Images with Diffgentor
+
+Use [Diffgentor](https://github.com/ruihanglix/diffgentor) to batch generate images for evaluation:
+
+```bash
+# Download benchmark data
+hf download rhli/genarena --repo-type dataset --local-dir ./data
+
+# Generate images with your model
+diffgentor edit --backend diffusers \
+    --model_name YourModel \
+    --input ./data/basic/ \
+    --output_dir ./arena/basic/models/yourname_YourModel_20260205/YourModel/
+```
+
+### Run Battles for New Models
+
+```bash
+genarena run --arena_dir ./arena --data_dir ./data \
+    --subset basic \
+    --exp_name yourname_YourModel_20260205
+```
+
+GenArena automatically detects new models and schedules battles against existing models.
+
+## Submit to Official Leaderboard
+
+> **Coming Soon**: The `genarena submit` command will allow you to submit your evaluation results to the official GenArena leaderboard via GitHub PR.
+
+The workflow will be:
+1. Run evaluation locally with `genarena run`
+2. Upload results to your HuggingFace repository
+3. Submit via `genarena submit` which creates a PR for review
+
+## Documentation
+
+| Document | Description |
+|----------|-------------|
+| [Quick Start](./docs/quickstart.md) | Installation and basic usage guide |
+| [Architecture](./docs/architecture.md) | System design and key concepts |
+| [CLI Reference](./docs/cli-reference.md) | Complete command-line interface documentation |
+| [Experiment Management](./docs/experiments.md) | How to organize and manage experiments |
+| [FAQ](./docs/faq.md) | Frequently asked questions |
+
+## Citation
+
+```bibtex
+TBD
+```
+
+## License
+
+Apache License 2.0
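The multi-endpoint variables in the README above are plain comma-separated lists. A minimal Python sketch of how a round-robin client pool could be built from them, using the `openai>=1.0.0` client declared in PKG-INFO; `pick_client` is a hypothetical helper for illustration, not genarena's actual API:

```python
import itertools
import os

from openai import OpenAI  # openai>=1.0.0, per the declared dependencies

# Split the comma-separated environment variables into parallel lists.
urls = [u.strip() for u in os.environ["OPENAI_BASE_URLS"].split(",")]
keys = [k.strip() for k in os.environ["OPENAI_API_KEYS"].split(",")]

# One client per endpoint; keys pair with URLs by position, and the last
# key is reused if fewer keys than URLs are given (an assumption here).
clients = [
    OpenAI(base_url=url, api_key=keys[min(i, len(keys) - 1)])
    for i, url in enumerate(urls)
]

# Cycling through the pool gives simple load balancing; failover would
# retry the same request on the next client after an error.
pool = itertools.cycle(clients)

def pick_client() -> OpenAI:
    """Return the next client in round-robin order (hypothetical helper)."""
    return next(pool)
```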
genarena-0.1.1/docs/README.md ADDED
@@ -0,0 +1,37 @@
+# GenArena Documentation
+
+GenArena is a VLM-based pairwise evaluation system for image generation models using Bradley-Terry ELO ranking.
+
+## Documentation Structure
+
+| Document | Description |
+|----------|-------------|
+| [Quick Start](./quickstart.md) | Installation and basic usage guide |
+| [Architecture](./architecture.md) | System design and key concepts |
+| [CLI Reference](./cli-reference.md) | Complete command-line interface documentation |
+| [Experiment Management](./experiments.md) | How to organize and manage experiments |
+| [FAQ](./faq.md) | Frequently asked questions |
+
+## Key Features
+
+- **Pairwise Evaluation**: Compare image generation models head-to-head using VLM judges
+- **Position Debiasing**: Double-call swap method to eliminate position bias
+- **Bradley-Terry ELO**: Order-independent batch scoring (not online K-factor)
+- **Incremental Evaluation**: Add new models without re-running historical battles
+- **Milestone Anchoring**: Stable ELO scores across experiments via anchored fitting
+- **Adaptive Sampling**: CI-based sampling to achieve target confidence efficiently
+- **Multi-endpoint Support**: Load balancing across multiple VLM API endpoints
+- **Git/HuggingFace Sync**: Built-in version control and dataset sharing
+
+## Quick Links
+
+- **Installation**: `uv pip install -e .`
+- **Run battles**: `genarena run --arena_dir <path> --data_dir <path>`
+- **View leaderboard**: `genarena leaderboard --arena_dir <path> --subset basic`
+- **Start visualization**: `genarena serve --arena_dir <path> --data_dir <path>`
+
+## System Requirements
+
+- Python 3.10+
+- VLM API access (OpenAI-compatible endpoint)
+- Parquet dataset with prompts and model outputs