genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +25 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2335 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.0.dist-info/METADATA +178 -0
- genarena-0.1.0.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
- genarena-0.1.0.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/battle.py
ADDED
@@ -0,0 +1,337 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""Battle execution module with position debiasing."""
+
+from concurrent.futures import ThreadPoolExecutor
+import logging
+from dataclasses import dataclass, field
+from types import ModuleType
+from typing import Any, Optional, Union
+
+from genarena.data import DataSample
+from genarena.vlm import VLMJudge
+
+
+logger = logging.getLogger(__name__)
+
+
+def _progress_update(progress: Any, n: int) -> None:
+    """
+    Update progress for each API call.
+
+    Supports:
+    - queue-like objects with .put(int) (recommended for multi-thread/process)
+    - tqdm-like objects with .update(int)
+    """
+    if progress is None or n <= 0:
+        return
+    if hasattr(progress, "put"):
+        try:
+            progress.put(n)
+            return
+        except Exception:
+            return
+    if hasattr(progress, "update"):
+        try:
+            progress.update(n)
+        except Exception:
+            return
+
+
+@dataclass
+class CallResult:
+    """Result from a single VLM call."""
+
+    raw_response: str
+    parsed_result: Optional[dict[str, Any]]
+    parse_success: bool
+    parse_error: Optional[str] = None
+    winner: Optional[str] = None  # "A" or "B" (position in that call)
+
+
+@dataclass
+class BattleResult:
+    """Result from a complete battle (two VLM calls with position swap)."""
+
+    # Final result
+    final_winner: str  # "model_a", "model_b", or "tie"
+    is_consistent: bool  # Whether both calls agreed
+
+    # Individual call results
+    original_call: CallResult
+    swapped_call: CallResult
+
+    # Metadata
+    model_a: str = ""
+    model_b: str = ""
+    sample_index: int = 0
+
+    # Converted winners (in terms of model_a/model_b, not A/B position)
+    original_model_winner: Optional[str] = None  # model_a, model_b, or tie
+    swapped_model_winner: Optional[str] = None  # model_a, model_b, or tie
+
+
+def _call_vlm_judge(
+    vlm: VLMJudge,
+    prompt_module: ModuleType,
+    instruction: str,
+    input_images: list[Union[str, bytes]],
+    image_a: Union[str, bytes],
+    image_b: Union[str, bytes]
+) -> CallResult:
+    """
+    Execute a single VLM judge call.
+
+    Args:
+        vlm: VLMJudge instance
+        prompt_module: Prompt module with build_prompt and parse_response
+        instruction: The editing instruction
+        input_images: List of input images
+        image_a: Output image for position A
+        image_b: Output image for position B
+
+    Returns:
+        CallResult with raw response and parsed result
+    """
+    # Build prompt
+    messages = prompt_module.build_prompt(
+        instruction=instruction,
+        input_images=input_images,
+        output_image_a=image_a,
+        output_image_b=image_b
+    )
+
+    # Call VLM
+    try:
+        raw_response = vlm.call(messages)
+    except Exception as e:
+        return CallResult(
+            raw_response="",
+            parsed_result=None,
+            parse_success=False,
+            parse_error=f"VLM call failed: {e}",
+            winner=None
+        )
+
+    # Parse response
+    try:
+        parsed_result = prompt_module.parse_response(raw_response)
+        winner = parsed_result.get("winner")
+        return CallResult(
+            raw_response=raw_response,
+            parsed_result=parsed_result,
+            parse_success=True,
+            parse_error=None,
+            winner=winner
+        )
+    except Exception as e:
+        return CallResult(
+            raw_response=raw_response,
+            parsed_result=None,
+            parse_success=False,
+            parse_error=f"Parse failed: {e}",
+            winner=None
+        )
+
+
+def _convert_position_winner_to_model(
+    position_winner: Optional[str],
+    is_swapped: bool
+) -> Optional[str]:
+    """
+    Convert position-based winner (A/B) to model-based winner (model_a/model_b).
+
+    In original order: A -> model_a, B -> model_b
+    In swapped order: A -> model_b, B -> model_a
+
+    Args:
+        position_winner: "A", "B", "tie", or None
+        is_swapped: Whether this was from the swapped call
+
+    Returns:
+        "model_a", "model_b", "tie", or None
+    """
+    if position_winner is None:
+        return None
+
+    winner_upper = position_winner.upper()
+
+    if winner_upper == "TIE":
+        return "tie"
+
+    if not is_swapped:
+        # Original order: A = model_a, B = model_b
+        if winner_upper == "A":
+            return "model_a"
+        elif winner_upper == "B":
+            return "model_b"
+    else:
+        # Swapped order: A = model_b, B = model_a
+        if winner_upper == "A":
+            return "model_b"
+        elif winner_upper == "B":
+            return "model_a"
+
+    return None
+
+
+def _combine_votes(
+    original_model_winner: Optional[str],
+    swapped_model_winner: Optional[str],
+    allow_tie: bool
+) -> tuple[str, bool]:
+    """
+    Combine two voting results to determine final winner.
+
+    Position debiasing logic:
+    - If both calls agree on a winner -> that model wins (consistent)
+    - If calls disagree or either is tie/None -> tie (inconsistent)
+
+    Args:
+        original_model_winner: Winner from original call ("model_a", "model_b", "tie", None)
+        swapped_model_winner: Winner from swapped call ("model_a", "model_b", "tie", None)
+        allow_tie: Whether single-round ties are allowed by the prompt
+
+    Returns:
+        Tuple of (final_winner, is_consistent)
+        - final_winner: "model_a", "model_b", or "tie"
+        - is_consistent: True if both calls agreed
+    """
+    # Handle None cases (parse failures)
+    if original_model_winner is None or swapped_model_winner is None:
+        return "tie", False
+
+    # Both are valid results
+    if original_model_winner == swapped_model_winner:
+        # Both agree
+        if original_model_winner in ("model_a", "model_b"):
+            return original_model_winner, True
+        else:
+            # Both returned tie (only possible if ALLOW_TIE=True)
+            return "tie", True
+    else:
+        # Disagreement -> tie
+        return "tie", False
+
+
+def execute_battle(
+    vlm: VLMJudge,
+    prompt_module: ModuleType,
+    sample: DataSample,
+    model_a_output: Union[str, bytes],
+    model_b_output: Union[str, bytes],
+    model_a: str = "",
+    model_b: str = "",
+    parallel_swap_calls: bool = False,
+    progress: Any = None,
+) -> BattleResult:
+    """
+    Execute a complete battle with position debiasing.
+
+    Makes two VLM calls:
+    1. Original order: position A = model_a output, position B = model_b output
+    2. Swapped order: position A = model_b output, position B = model_a output
+
+    Then combines results according to ALLOW_TIE setting in prompt module.
+
+    Args:
+        vlm: VLMJudge instance
+        prompt_module: Prompt module with build_prompt, parse_response, ALLOW_TIE
+        sample: DataSample with instruction and input_images
+        model_a_output: Output image path/bytes from model A
+        model_b_output: Output image path/bytes from model B
+        model_a: Model A name (for logging)
+        model_b: Model B name (for logging)
+
+    Returns:
+        BattleResult with final winner and both call details
+    """
+    allow_tie = getattr(prompt_module, "ALLOW_TIE", False)
+
+    if not parallel_swap_calls:
+        # Call 1: Original order (A = model_a, B = model_b)
+        logger.debug(f"Battle {model_a} vs {model_b}: executing original call")
+        _progress_update(progress, 1)
+        original_call = _call_vlm_judge(
+            vlm=vlm,
+            prompt_module=prompt_module,
+            instruction=sample.instruction,
+            input_images=sample.input_images,
+            image_a=model_a_output,
+            image_b=model_b_output
+        )
+
+        # Call 2: Swapped order (A = model_b, B = model_a)
+        logger.debug(f"Battle {model_a} vs {model_b}: executing swapped call")
+        _progress_update(progress, 1)
+        swapped_call = _call_vlm_judge(
+            vlm=vlm,
+            prompt_module=prompt_module,
+            instruction=sample.instruction,
+            input_images=sample.input_images,
+            image_a=model_b_output,
+            image_b=model_a_output
+        )
+    else:
+        # Execute original + swapped calls in parallel to reduce per-battle latency.
+        # Note: This doubles the instantaneous request concurrency per battle.
+        logger.debug(f"Battle {model_a} vs {model_b}: executing original+swapped calls in parallel")
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            _progress_update(progress, 1)
+            fut_original = executor.submit(
+                _call_vlm_judge,
+                vlm,
+                prompt_module,
+                sample.instruction,
+                sample.input_images,
+                model_a_output,
+                model_b_output,
+            )
+            _progress_update(progress, 1)
+            fut_swapped = executor.submit(
+                _call_vlm_judge,
+                vlm,
+                prompt_module,
+                sample.instruction,
+                sample.input_images,
+                model_b_output,
+                model_a_output,
+            )
+            # Preserve error behavior: propagate exceptions if any occur.
+            original_call = fut_original.result()
+            swapped_call = fut_swapped.result()
+
+    # Convert position winners to model winners
+    original_model_winner = _convert_position_winner_to_model(
+        original_call.winner, is_swapped=False
+    )
+    swapped_model_winner = _convert_position_winner_to_model(
+        swapped_call.winner, is_swapped=True
+    )
+
+    # Combine votes
+    final_winner, is_consistent = _combine_votes(
+        original_model_winner,
+        swapped_model_winner,
+        allow_tie
+    )
+
+    logger.debug(
+        f"Battle {model_a} vs {model_b}: "
+        f"original={original_call.winner}->{original_model_winner}, "
+        f"swapped={swapped_call.winner}->{swapped_model_winner}, "
+        f"final={final_winner}, consistent={is_consistent}"
+    )
+
+    return BattleResult(
+        final_winner=final_winner,
+        is_consistent=is_consistent,
+        original_call=original_call,
+        swapped_call=swapped_call,
+        model_a=model_a,
+        model_b=model_b,
+        sample_index=sample.index,
+        original_model_winner=original_model_winner,
+        swapped_model_winner=swapped_model_winner
+    )
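
For readers skimming the diff, the position-debiasing rule in _combine_votes reduces to a small truth table: an agreed winner stands, while any disagreement, parse failure, or unmatched tie collapses to a tie. The snippet below is an illustration only, not part of the package; it imports the private helper from the file above purely to show the expected outcomes.

# Illustration only -- not shipped with genarena.
from genarena.battle import _combine_votes

print(_combine_votes("model_a", "model_a", allow_tie=False))  # ('model_a', True)  both calls agree
print(_combine_votes("model_a", "model_b", allow_tie=False))  # ('tie', False)     calls disagree
print(_combine_votes("tie", "tie", allow_tie=True))           # ('tie', True)      agreed tie
print(_combine_votes("model_a", None, allow_tie=False))       # ('tie', False)     one call failed to parse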
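
As a quick way to exercise the new module end to end without any API access, the sketch below drives execute_battle with duck-typed stand-ins: a stub judge that always prefers position A (only a .call(messages) method is used), a SimpleNamespace acting as the prompt module (execute_battle only reads build_prompt, parse_response, and ALLOW_TIE), and a SimpleNamespace carrying the three DataSample attributes the function touches (instruction, input_images, index). The class, instruction text, and file paths are made up for illustration; they are not part of genarena.

# Hypothetical smoke test, not part of genarena. A judge biased toward
# position A makes the original and swapped calls disagree, so the
# debiasing logic should report a tie and mark the battle inconsistent.
import json
from types import SimpleNamespace

from genarena.battle import execute_battle


class AlwaysPrefersPositionA:
    """Duck-typed stand-in for VLMJudge; only .call(messages) is used."""

    def call(self, messages):
        return '{"winner": "A"}'


def build_prompt(instruction, input_images, output_image_a, output_image_b):
    # Real prompt modules build multimodal chat messages; a placeholder is
    # enough here because the stub judge ignores its input.
    return [{"role": "user", "content": instruction}]


# Stand-in prompt module: the judge's raw response is plain JSON, so
# json.loads works as parse_response for this stub.
prompt_module = SimpleNamespace(
    build_prompt=build_prompt,
    parse_response=json.loads,
    ALLOW_TIE=False,
)

# Minimal stand-in for DataSample: only these attributes are read.
sample = SimpleNamespace(
    index=0,
    instruction="Make the sky purple",
    input_images=["inputs/000.png"],
)

result = execute_battle(
    vlm=AlwaysPrefersPositionA(),
    prompt_module=prompt_module,
    sample=sample,
    model_a_output="outputs/model_a/000.png",
    model_b_output="outputs/model_b/000.png",
    model_a="model-a",
    model_b="model-b",
)
print(result.final_winner, result.is_consistent)  # tie False

Because the stub judge ignores its input entirely, no image files are opened; the same skeleton with a real VLMJudge and a prompt module such as genarena.prompts.mmrb2 would perform the two judged calls described in the docstring above.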