genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +22 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2430 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.1.dist-info/METADATA +178 -0
- genarena-0.1.1.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
- genarena-0.1.1.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,373 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""MMRB2 prompt implementation for image editing evaluation.
+
+This module implements the MMRB2 evaluation prompt for pairwise comparison
+of image editing results. It uses a 1-6 scoring scale and does not allow
+ties in single rounds.
+
+Reference: MMRB2 evaluation framework
+"""
+
+import base64
+import io
+import re
+from typing import Any, Union
+
+import json_repair
+from PIL import Image as PILImage
+
+
+# Whether single-round ties are allowed (mmrb2 requires a winner)
+ALLOW_TIE = False
+
+
+# The full evaluation prompt text from get_image_edit_prompt()
+PROMPT_TEXT = """You are an expert in image editing quality analysis and AI evaluation. Your role is to act as an objective judge for comparing two AI-generated image editing responses to the same prompt. You will evaluate which response is better based on a comprehensive rubric specifically designed for image editing tasks.
+
+**Important Guidelines:**
+- Be completely impartial and avoid any position biases
+- Ensure that the order in which the responses were presented does not influence your decision
+- Do not allow the length of the responses to influence your evaluation
+- Do not favor certain model names or types
+- Be as objective as possible in your assessment
+- Focus on image editing specific factors: faithfulness to editing instructions, preservation of input image elements, and overall editing quality
+
+**Understanding the Content Structure:**
+- **[ORIGINAL PROMPT TO MODEL:]**: This is the image editing instruction given to both AI models
+- **[INPUT IMAGE FROM PROMPT:]**: This is the source image provided to both models for editing
+- **[RESPONSE A:]**: The first model's edited image response
+- **[RESPONSE B:]**: The second model's edited image response
+
+Your evaluation must be based on a fine-grained rubric that covers the following criteria. For each criterion, you must provide detailed step-by-step reasoning comparing both responses. You will use a 1-6 scoring scale.
+
+**Evaluation Criteria:**
+1. **text_faithfulness:** Which response better adheres to the text editing instruction? Consider how well each response follows the specific editing instructions (e.g., adding objects, changing colors, modifying scenes).
+
+2. **image_faithfulness:** Which response better respects and incorporates the key elements of the input image? Consider how well each response preserves important aspects of the original image (composition, lighting, style, background elements) while making the requested changes.
+
+3. **overall_image_quality:** Which response has better general technical and aesthetic quality, with fewer visual artifacts, distortions, or inconsistencies introduced during the editing process?
+
+4. **text_rendering:** If either response contains rendered text, which one has better text quality (spelling, legibility, integration with the image)? If no text is rendered, state "Not Applicable."
+
+**Scoring Rubric:**
+- Score 6 (A is significantly better): Response A is significantly superior across most criteria
+- Score 5 (A is marginally better): Response A is noticeably better across several criteria
+- Score 4 (Unsure or A is negligibly better): Response A is slightly better or roughly equivalent
+- Score 3 (Unsure or B is negligibly better): Response B is slightly better or roughly equivalent
+- Score 2 (B is marginally better): Response B is noticeably better across several criteria
+- Score 1 (B is significantly better): Response B is significantly superior across most criteria
+
+**Confidence Assessment:**
+After your evaluation, assess your confidence in this judgment on a scale of 0.0 to 1.0:
+
+**CRITICAL**: Be EXTREMELY conservative with confidence scores. Most comparisons should be in the 0.2-0.5 range.
+
+- **Very High Confidence (0.8-1.0)**: ONLY for absolutely obvious cases where one response is dramatically better across ALL criteria with zero ambiguity. Use this extremely rarely (less than 10% of cases).
+- **High Confidence (0.6-0.7)**: Clear differences but some uncertainty remains. Use sparingly (less than 20% of cases).
+- **Medium Confidence (0.4-0.5)**: Noticeable differences but significant uncertainty. This should be your DEFAULT range.
+- **Low Confidence (0.2-0.3)**: Very close comparison, difficult to distinguish. Responses are roughly equivalent or have conflicting strengths.
+- **Very Low Confidence (0.0-0.1)**: Essentially indistinguishable responses or major conflicting strengths.
+
+**IMPORTANT GUIDELINES**:
+- DEFAULT to 0.3-0.5 range for most comparisons
+- Only use 0.6+ when you are absolutely certain
+- Consider: Could reasonable people disagree on this comparison?
+- Consider: Are there any strengths in the "worse" response?
+- Consider: How obvious would this be to a human evaluator?
+- Remember: Quality assessment is inherently subjective
+
+After your reasoning, you will provide a final numerical score, indicate which response is better, and assess your confidence. You must always output your response in the following structured JSON format:
+
+{
+    "reasoning": {
+        "text_faithfulness": "YOUR REASONING HERE",
+        "image_faithfulness": "YOUR REASONING HERE",
+        "overall_image_quality": "YOUR REASONING HERE",
+        "text_rendering": "YOUR REASONING HERE",
+        "comparison_summary": "YOUR OVERALL COMPARISON SUMMARY HERE"
+    },
+    "score": <int 1-6>,
+    "better_response": "A" or "B",
+    "confidence": <float 0.0-1.0>,
+    "confidence_rationale": "YOUR CONFIDENCE ASSESSMENT REASONING HERE"
+}"""
+
+
+def _encode_image_to_base64(image_source: Union[str, bytes, PILImage.Image, io.BytesIO, dict[str, Any]]) -> str:
+    """
+    Encode an image to base64.
+
+    Args:
+        image_source: Either a file path (str), raw bytes, PIL.Image object, or BytesIO
+
+    Returns:
+        Base64 encoded string
+
+    Raises:
+        TypeError: If image_source type is not supported
+        ValueError: If image_source cannot be converted to bytes
+    """
+    image_bytes: bytes
+
+    if isinstance(image_source, str):
+        # It's a file path
+        with open(image_source, "rb") as f:
+            image_bytes = f.read()
+    elif isinstance(image_source, io.BytesIO):
+        # It's a BytesIO object
+        image_source.seek(0)
+        image_bytes = image_source.read()
+    elif isinstance(image_source, PILImage.Image):
+        # It's a PIL Image object (e.g., from HuggingFace datasets)
+        buffer = io.BytesIO()
+        image_source.save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+    elif isinstance(image_source, dict):
+        # It's a dict (e.g., from HuggingFace datasets Image() type)
+        if "bytes" in image_source:
+            raw = image_source["bytes"]
+            if isinstance(raw, bytes):
+                image_bytes = raw
+            elif isinstance(raw, io.BytesIO):
+                raw.seek(0)
+                image_bytes = raw.read()
+            else:
+                # Recurse to handle nested types
+                return _encode_image_to_base64(raw)
+        elif "path" in image_source and image_source["path"]:
+            with open(image_source["path"], "rb") as f:
+                image_bytes = f.read()
+        else:
+            raise ValueError(f"Cannot extract image from dict: {image_source.keys()}")
+    elif isinstance(image_source, bytes):
+        # It's already bytes - MUST check after more specific types
+        image_bytes = image_source
+    else:
+        # Unknown type - raise error with helpful message
+        raise TypeError(
+            f"Unsupported image type: {type(image_source).__name__}. "
+            f"Expected str (path), bytes, PIL.Image, io.BytesIO, or dict. "
+            f"Got: {repr(image_source)[:200]}"
+        )
+
+    # Verify we have valid bytes before encoding
+    if not isinstance(image_bytes, bytes):
+        raise ValueError(
+            f"Failed to convert image to bytes. "
+            f"Got {type(image_bytes).__name__} instead. "
+            f"Original input was {type(image_source).__name__}"
+        )
+
+    return base64.b64encode(image_bytes).decode("utf-8")
+
+
+def _get_image_media_type(image_source: Union[str, bytes, PILImage.Image]) -> str:
+    """
+    Determine the media type of an image.
+
+    Args:
+        image_source: Either a file path (str), raw bytes, or PIL.Image object
+
+    Returns:
+        Media type string (e.g., 'image/png')
+    """
+    if isinstance(image_source, str):
+        ext = image_source.lower().split('.')[-1]
+        media_types = {
+            'png': 'image/png',
+            'jpg': 'image/jpeg',
+            'jpeg': 'image/jpeg',
+            'webp': 'image/webp',
+            'gif': 'image/gif',
+        }
+        return media_types.get(ext, 'image/png')
+    elif isinstance(image_source, PILImage.Image):
+        # For PIL.Image, we convert to PNG
+        return 'image/png'
+    else:
+        # Try to detect from bytes magic
+        if image_source[:8] == b'\x89PNG\r\n\x1a\n':
+            return 'image/png'
+        elif image_source[:2] == b'\xff\xd8':
+            return 'image/jpeg'
+        elif image_source[:4] == b'RIFF' and image_source[8:12] == b'WEBP':
+            return 'image/webp'
+        else:
+            return 'image/png'
+
+
+def _create_image_content(image_source: Union[str, bytes]) -> dict[str, Any]:
+    """
+    Create an image content block for OpenAI API.
+
+    Args:
+        image_source: Either a file path (str) or raw bytes
+
+    Returns:
+        Image content dict for OpenAI API
+    """
+    base64_data = _encode_image_to_base64(image_source)
+    media_type = _get_image_media_type(image_source)
+
+    return {
+        "type": "image_url",
+        "image_url": {
+            "url": f"data:{media_type};base64,{base64_data}"
+        }
+    }
+
+
+def build_prompt(
+    instruction: str,
+    input_images: list[Union[str, bytes]],
+    output_image_a: Union[str, bytes],
+    output_image_b: Union[str, bytes]
+) -> list[dict[str, Any]]:
+    """
+    Build the VLM prompt messages for pairwise evaluation.
+
+    Constructs messages in the format:
+        [EVALUATION PROMPT TEXT]
+        [ORIGINAL PROMPT TO MODEL:]
+        {instruction and input_images}
+        [RESPONSE A:]
+        {output_image_a}
+        [RESPONSE B:]
+        {output_image_b}
+
+    Args:
+        instruction: The editing instruction given to models
+        input_images: List of input images (file paths or bytes)
+        output_image_a: Output from model A (file path or bytes)
+        output_image_b: Output from model B (file path or bytes)
+
+    Returns:
+        List of message dicts for OpenAI Chat Completion API
+    """
+    # Build content list
+    content = []
+
+    # 1. Evaluation prompt
+    content.append({
+        "type": "text",
+        "text": PROMPT_TEXT
+    })
+
+    # 2. Original prompt to model section
+    content.append({
+        "type": "text",
+        "text": "[ORIGINAL PROMPT TO MODEL:]"
+    })
+
+    # Add instruction text
+    content.append({
+        "type": "text",
+        "text": instruction
+    })
+
+    # Add input images if any
+    if input_images:
+        content.append({
+            "type": "text",
+            "text": "[INPUT IMAGE FROM PROMPT:]"
+        })
+        for img in input_images:
+            content.append(_create_image_content(img))
+
+    # 3. Response A
+    content.append({
+        "type": "text",
+        "text": "[RESPONSE A:]"
+    })
+    content.append(_create_image_content(output_image_a))
+
+    # 4. Response B
+    content.append({
+        "type": "text",
+        "text": "[RESPONSE B:]"
+    })
+    content.append(_create_image_content(output_image_b))
+
+    # Return as OpenAI API format
+    return [
+        {
+            "role": "user",
+            "content": content
+        }
+    ]
+
+
+def parse_response(response: str) -> dict[str, Any]:
+    """
+    Parse the VLM judge response.
+
+    Extracts structured information from VLM's JSON response,
+    handling markdown code blocks and minor JSON errors.
+
+    Args:
+        response: Raw response text from VLM
+
+    Returns:
+        Dict containing:
+            - winner: "A" or "B" (from better_response field)
+            - score: int 1-6
+            - confidence: float 0.0-1.0
+            - reasoning: dict with evaluation criteria
+            - raw_response: the original parsed JSON
+
+    Raises:
+        ValueError: If response cannot be parsed
+    """
+    # Remove markdown code block formatting
+    text = response.strip()
+    text = re.sub(r"^```(?:json)?\s*\n?", "", text)
+    text = re.sub(r"\n?```\s*$", "", text)
+
+    # Try to parse JSON with json_repair for fault tolerance
+    try:
+        parsed = json_repair.loads(text)
+    except Exception as e:
+        raise ValueError(f"Failed to parse JSON response: {e}\nResponse was:\n{response}")
+
+    # Extract fields
+    better_response = parsed.get("better_response", "")
+
+    # Normalize winner to uppercase
+    if isinstance(better_response, str):
+        winner = better_response.upper().strip()
+        if winner not in ("A", "B"):
+            # Try to extract from text
+            if "A" in winner:
+                winner = "A"
+            elif "B" in winner:
+                winner = "B"
+            else:
+                raise ValueError(f"Invalid better_response value: {better_response}")
+    else:
+        raise ValueError(f"better_response must be a string, got: {type(better_response)}")
+
+    # Extract score (1-6)
+    score = parsed.get("score", 4)
+    if isinstance(score, str):
+        score = int(score)
+    score = max(1, min(6, score))
+
+    # Extract confidence (0.0-1.0)
+    confidence = parsed.get("confidence", 0.5)
+    if isinstance(confidence, str):
+        confidence = float(confidence)
+    confidence = max(0.0, min(1.0, confidence))
+
+    # Extract reasoning
+    reasoning = parsed.get("reasoning", {})
+
+    return {
+        "winner": winner,
+        "score": score,
+        "confidence": confidence,
+        "reasoning": reasoning,
+        "raw_response": parsed
+    }
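
For orientation, the sketch below shows how the two public entry points in this file, build_prompt() and parse_response(), might be wired to an OpenAI-compatible vision judge. It is not taken from the package: the client configuration, judge model name, editing instruction, and file paths are placeholders, and the actual orchestration in genarena (arena.py, vlm.py) may differ.

# Illustrative sketch only -- not part of genarena. Assumes the `openai` client
# package is installed and an OpenAI-compatible endpoint/API key is configured.
from openai import OpenAI

from genarena.prompts.mmrb2 import build_prompt, parse_response

client = OpenAI()  # placeholder client setup

# Hypothetical instruction and image paths, for demonstration only.
messages = build_prompt(
    instruction="Replace the red car with a blue bicycle.",
    input_images=["input.png"],
    output_image_a="model_a_output.png",
    output_image_b="model_b_output.png",
)

completion = client.chat.completions.create(
    model="gpt-4o",  # placeholder: any vision-capable judge model
    messages=messages,
)

verdict = parse_response(completion.choices[0].message.content)
print(verdict["winner"], verdict["score"], verdict["confidence"])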
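
One detail worth noting when consuming parse_response() output: under the rubric embedded in PROMPT_TEXT, scores 4-6 point toward Response A and scores 1-3 toward Response B, so the numeric score and the better_response field encode the same direction. A caller could cross-check the two; the helper below is illustrative and not part of the module.

# Illustrative cross-check, not part of genarena: under the 1-6 rubric,
# scores 4-6 favor Response A and scores 1-3 favor Response B.
def score_implies_winner(score: int) -> str:
    return "A" if score >= 4 else "B"

def is_consistent(verdict: dict) -> bool:
    # verdict is the dict returned by parse_response()
    return score_implies_winner(verdict["score"]) == verdict["winner"]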