genarena-0.0.1-py3-none-any.whl → genarena-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/battle.py ADDED
@@ -0,0 +1,337 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""Battle execution module with position debiasing."""
+
+from concurrent.futures import ThreadPoolExecutor
+import logging
+from dataclasses import dataclass, field
+from types import ModuleType
+from typing import Any, Optional, Union
+
+from genarena.data import DataSample
+from genarena.vlm import VLMJudge
+
+
+logger = logging.getLogger(__name__)
+
+def _progress_update(progress: Any, n: int) -> None:
+    """
+    Update progress for each API call.
+
+    Supports:
+    - queue-like objects with .put(int) (recommended for multi-thread/process)
+    - tqdm-like objects with .update(int)
+    """
+    if progress is None or n <= 0:
+        return
+    if hasattr(progress, "put"):
+        try:
+            progress.put(n)
+            return
+        except Exception:
+            return
+    if hasattr(progress, "update"):
+        try:
+            progress.update(n)
+        except Exception:
+            return
+
+
+@dataclass
+class CallResult:
+    """Result from a single VLM call."""
+
+    raw_response: str
+    parsed_result: Optional[dict[str, Any]]
+    parse_success: bool
+    parse_error: Optional[str] = None
+    winner: Optional[str] = None  # "A" or "B" (position in that call)
+
+
+@dataclass
+class BattleResult:
+    """Result from a complete battle (two VLM calls with position swap)."""
+
+    # Final result
+    final_winner: str  # "model_a", "model_b", or "tie"
+    is_consistent: bool  # Whether both calls agreed
+
+    # Individual call results
+    original_call: CallResult
+    swapped_call: CallResult
+
+    # Metadata
+    model_a: str = ""
+    model_b: str = ""
+    sample_index: int = 0
+
+    # Converted winners (in terms of model_a/model_b, not A/B position)
+    original_model_winner: Optional[str] = None  # model_a, model_b, or tie
+    swapped_model_winner: Optional[str] = None  # model_a, model_b, or tie
+
+
+def _call_vlm_judge(
+    vlm: VLMJudge,
+    prompt_module: ModuleType,
+    instruction: str,
+    input_images: list[Union[str, bytes]],
+    image_a: Union[str, bytes],
+    image_b: Union[str, bytes]
+) -> CallResult:
+    """
+    Execute a single VLM judge call.
+
+    Args:
+        vlm: VLMJudge instance
+        prompt_module: Prompt module with build_prompt and parse_response
+        instruction: The editing instruction
+        input_images: List of input images
+        image_a: Output image for position A
+        image_b: Output image for position B
+
+    Returns:
+        CallResult with raw response and parsed result
+    """
+    # Build prompt
+    messages = prompt_module.build_prompt(
+        instruction=instruction,
+        input_images=input_images,
+        output_image_a=image_a,
+        output_image_b=image_b
+    )
+
+    # Call VLM
+    try:
+        raw_response = vlm.call(messages)
+    except Exception as e:
+        return CallResult(
+            raw_response="",
+            parsed_result=None,
+            parse_success=False,
+            parse_error=f"VLM call failed: {e}",
+            winner=None
+        )
+
+    # Parse response
+    try:
+        parsed_result = prompt_module.parse_response(raw_response)
+        winner = parsed_result.get("winner")
+        return CallResult(
+            raw_response=raw_response,
+            parsed_result=parsed_result,
+            parse_success=True,
+            parse_error=None,
+            winner=winner
+        )
+    except Exception as e:
+        return CallResult(
+            raw_response=raw_response,
+            parsed_result=None,
+            parse_success=False,
+            parse_error=f"Parse failed: {e}",
+            winner=None
+        )
+
+
+def _convert_position_winner_to_model(
+    position_winner: Optional[str],
+    is_swapped: bool
+) -> Optional[str]:
+    """
+    Convert position-based winner (A/B) to model-based winner (model_a/model_b).
+
+    In original order: A -> model_a, B -> model_b
+    In swapped order: A -> model_b, B -> model_a
+
+    Args:
+        position_winner: "A", "B", "tie", or None
+        is_swapped: Whether this was from the swapped call
+
+    Returns:
+        "model_a", "model_b", "tie", or None
+    """
+    if position_winner is None:
+        return None
+
+    winner_upper = position_winner.upper()
+
+    if winner_upper == "TIE":
+        return "tie"
+
+    if not is_swapped:
+        # Original order: A = model_a, B = model_b
+        if winner_upper == "A":
+            return "model_a"
+        elif winner_upper == "B":
+            return "model_b"
+    else:
+        # Swapped order: A = model_b, B = model_a
+        if winner_upper == "A":
+            return "model_b"
+        elif winner_upper == "B":
+            return "model_a"
+
+    return None
+
+
+def _combine_votes(
+    original_model_winner: Optional[str],
+    swapped_model_winner: Optional[str],
+    allow_tie: bool
+) -> tuple[str, bool]:
+    """
+    Combine two voting results to determine final winner.
+
+    Position debiasing logic:
+    - If both calls agree on a winner -> that model wins (consistent)
+    - If calls disagree or either is tie/None -> tie (inconsistent)
+
+    Args:
+        original_model_winner: Winner from original call ("model_a", "model_b", "tie", None)
+        swapped_model_winner: Winner from swapped call ("model_a", "model_b", "tie", None)
+        allow_tie: Whether single-round ties are allowed by the prompt
+
+    Returns:
+        Tuple of (final_winner, is_consistent)
+        - final_winner: "model_a", "model_b", or "tie"
+        - is_consistent: True if both calls agreed
+    """
+    # Handle None cases (parse failures)
+    if original_model_winner is None or swapped_model_winner is None:
+        return "tie", False
+
+    # Both are valid results
+    if original_model_winner == swapped_model_winner:
+        # Both agree
+        if original_model_winner in ("model_a", "model_b"):
+            return original_model_winner, True
+        else:
+            # Both returned tie (only possible if ALLOW_TIE=True)
+            return "tie", True
+    else:
+        # Disagreement -> tie
+        return "tie", False
+
+
+def execute_battle(
+    vlm: VLMJudge,
+    prompt_module: ModuleType,
+    sample: DataSample,
+    model_a_output: Union[str, bytes],
+    model_b_output: Union[str, bytes],
+    model_a: str = "",
+    model_b: str = "",
+    parallel_swap_calls: bool = False,
+    progress: Any = None,
+) -> BattleResult:
+    """
+    Execute a complete battle with position debiasing.
+
+    Makes two VLM calls:
+    1. Original order: position A = model_a output, position B = model_b output
+    2. Swapped order: position A = model_b output, position B = model_a output
+
+    Then combines results according to ALLOW_TIE setting in prompt module.
+
+    Args:
+        vlm: VLMJudge instance
+        prompt_module: Prompt module with build_prompt, parse_response, ALLOW_TIE
+        sample: DataSample with instruction and input_images
+        model_a_output: Output image path/bytes from model A
+        model_b_output: Output image path/bytes from model B
+        model_a: Model A name (for logging)
+        model_b: Model B name (for logging)
+
+    Returns:
+        BattleResult with final winner and both call details
+    """
+    allow_tie = getattr(prompt_module, "ALLOW_TIE", False)
+
+    if not parallel_swap_calls:
+        # Call 1: Original order (A = model_a, B = model_b)
+        logger.debug(f"Battle {model_a} vs {model_b}: executing original call")
+        _progress_update(progress, 1)
+        original_call = _call_vlm_judge(
+            vlm=vlm,
+            prompt_module=prompt_module,
+            instruction=sample.instruction,
+            input_images=sample.input_images,
+            image_a=model_a_output,
+            image_b=model_b_output
+        )
+
+        # Call 2: Swapped order (A = model_b, B = model_a)
+        logger.debug(f"Battle {model_a} vs {model_b}: executing swapped call")
+        _progress_update(progress, 1)
+        swapped_call = _call_vlm_judge(
+            vlm=vlm,
+            prompt_module=prompt_module,
+            instruction=sample.instruction,
+            input_images=sample.input_images,
+            image_a=model_b_output,
+            image_b=model_a_output
+        )
+    else:
+        # Execute original + swapped calls in parallel to reduce per-battle latency.
+        # Note: This doubles the instantaneous request concurrency per battle.
+        logger.debug(f"Battle {model_a} vs {model_b}: executing original+swapped calls in parallel")
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            _progress_update(progress, 1)
+            fut_original = executor.submit(
+                _call_vlm_judge,
+                vlm,
+                prompt_module,
+                sample.instruction,
+                sample.input_images,
+                model_a_output,
+                model_b_output,
+            )
+            _progress_update(progress, 1)
+            fut_swapped = executor.submit(
+                _call_vlm_judge,
+                vlm,
+                prompt_module,
+                sample.instruction,
+                sample.input_images,
+                model_b_output,
+                model_a_output,
+            )
+            # Preserve error behavior: propagate exceptions if any occur.
+            original_call = fut_original.result()
+            swapped_call = fut_swapped.result()
+
+    # Convert position winners to model winners
+    original_model_winner = _convert_position_winner_to_model(
+        original_call.winner, is_swapped=False
+    )
+    swapped_model_winner = _convert_position_winner_to_model(
+        swapped_call.winner, is_swapped=True
+    )
+
+    # Combine votes
+    final_winner, is_consistent = _combine_votes(
+        original_model_winner,
+        swapped_model_winner,
+        allow_tie
+    )
+
+    logger.debug(
+        f"Battle {model_a} vs {model_b}: "
+        f"original={original_call.winner}->{original_model_winner}, "
+        f"swapped={swapped_call.winner}->{swapped_model_winner}, "
+        f"final={final_winner}, consistent={is_consistent}"
+    )
+
+    return BattleResult(
+        final_winner=final_winner,
+        is_consistent=is_consistent,
+        original_call=original_call,
+        swapped_call=swapped_call,
+        model_a=model_a,
+        model_b=model_b,
+        sample_index=sample.index,
+        original_model_winner=original_model_winner,
+        swapped_model_winner=swapped_model_winner
+    )
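
The new battle.py is the core of the 0.1.0 pairwise-evaluation flow, so a usage sketch may help when reading the rest of the diff. The snippet below is a hypothetical harness, not code from the package: the judge object and prompt module are duck-typed stand-ins for VLMJudge and a prompts module such as genarena.prompts.mmrb2 (their real constructor arguments and prompt format are not shown in this diff), and the DataSample keyword arguments are assumed from the attributes battle.py reads (instruction, input_images, index).

```python
# Hypothetical harness around execute_battle; only the genarena.battle API is
# taken from the diff above, everything else is a stand-in.
from types import SimpleNamespace

from genarena.battle import execute_battle
from genarena.data import DataSample  # constructor kwargs assumed from attribute usage in battle.py


class PositionBiasedJudge:
    """Duck-typed stand-in for VLMJudge: always prefers whatever sits in position A."""

    def call(self, messages):
        return "winner: A"


# Stand-in for a prompt module exposing the attributes battle.py expects:
# build_prompt, parse_response, and ALLOW_TIE.
fake_prompt_module = SimpleNamespace(
    ALLOW_TIE=False,
    build_prompt=lambda instruction, input_images, output_image_a, output_image_b: [
        {"role": "user", "content": instruction}
    ],
    parse_response=lambda raw: {"winner": raw.split(":")[-1].strip()},
)

# Assumed constructor; the real DataSample definition lives in genarena/data.py (not shown here).
sample = DataSample(index=0, instruction="Make the sky purple", input_images=["input.png"])

result = execute_battle(
    vlm=PositionBiasedJudge(),        # real callers pass a VLMJudge; duck typing suffices here
    prompt_module=fake_prompt_module,
    sample=sample,
    model_a_output="model_a.png",
    model_b_output="model_b.png",
    model_a="model-a",
    model_b="model-b",
)

# A judge that always votes for position A contradicts itself once the outputs are
# swapped, so the position-debiasing combine records an inconsistent battle as a tie.
print(result.final_winner)   # "tie"
print(result.is_consistent)  # False
```

Because execute_battle's second call swaps the two output images, any pure position preference cancels out: only verdicts that survive the swap count as wins, and disagreements or parse failures fall back to a tie.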