genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/prompts/mmrb2.py
@@ -0,0 +1,373 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """MMRB2 prompt implementation for image editing evaluation.
+
+ This module implements the MMRB2 evaluation prompt for pairwise comparison
+ of image editing results. It uses a 1-6 scoring scale and does not allow
+ ties in single rounds.
+
+ Reference: MMRB2 evaluation framework
+ """
+
+ import base64
+ import io
+ import re
+ from typing import Any, Union
+
+ import json_repair
+ from PIL import Image as PILImage
+
+
+ # Whether single-round ties are allowed (mmrb2 requires a winner)
+ ALLOW_TIE = False
+
+
+ # The full evaluation prompt text from get_image_edit_prompt()
+ PROMPT_TEXT = """You are an expert in image editing quality analysis and AI evaluation. Your role is to act as an objective judge for comparing two AI-generated image editing responses to the same prompt. You will evaluate which response is better based on a comprehensive rubric specifically designed for image editing tasks.
+
+ **Important Guidelines:**
+ - Be completely impartial and avoid any position biases
+ - Ensure that the order in which the responses were presented does not influence your decision
+ - Do not allow the length of the responses to influence your evaluation
+ - Do not favor certain model names or types
+ - Be as objective as possible in your assessment
+ - Focus on image editing specific factors: faithfulness to editing instructions, preservation of input image elements, and overall editing quality
+
+ **Understanding the Content Structure:**
+ - **[ORIGINAL PROMPT TO MODEL:]**: This is the image editing instruction given to both AI models
+ - **[INPUT IMAGE FROM PROMPT:]**: This is the source image provided to both models for editing
+ - **[RESPONSE A:]**: The first model's edited image response
+ - **[RESPONSE B:]**: The second model's edited image response
+
+ Your evaluation must be based on a fine-grained rubric that covers the following criteria. For each criterion, you must provide detailed step-by-step reasoning comparing both responses. You will use a 1-6 scoring scale.
+
+ **Evaluation Criteria:**
+ 1. **text_faithfulness:** Which response better adheres to the text editing instruction? Consider how well each response follows the specific editing instructions (e.g., adding objects, changing colors, modifying scenes).
+
+ 2. **image_faithfulness:** Which response better respects and incorporates the key elements of the input image? Consider how well each response preserves important aspects of the original image (composition, lighting, style, background elements) while making the requested changes.
+
+ 3. **overall_image_quality:** Which response has better general technical and aesthetic quality, with fewer visual artifacts, distortions, or inconsistencies introduced during the editing process?
+
+ 4. **text_rendering:** If either response contains rendered text, which one has better text quality (spelling, legibility, integration with the image)? If no text is rendered, state "Not Applicable."
+
+ **Scoring Rubric:**
+ - Score 6 (A is significantly better): Response A is significantly superior across most criteria
+ - Score 5 (A is marginally better): Response A is noticeably better across several criteria
+ - Score 4 (Unsure or A is negligibly better): Response A is slightly better or roughly equivalent
+ - Score 3 (Unsure or B is negligibly better): Response B is slightly better or roughly equivalent
+ - Score 2 (B is marginally better): Response B is noticeably better across several criteria
+ - Score 1 (B is significantly better): Response B is significantly superior across most criteria
+
+ **Confidence Assessment:**
+ After your evaluation, assess your confidence in this judgment on a scale of 0.0 to 1.0:
+
+ **CRITICAL**: Be EXTREMELY conservative with confidence scores. Most comparisons should be in the 0.2-0.5 range.
+
+ - **Very High Confidence (0.8-1.0)**: ONLY for absolutely obvious cases where one response is dramatically better across ALL criteria with zero ambiguity. Use this extremely rarely (less than 10% of cases).
+ - **High Confidence (0.6-0.7)**: Clear differences but some uncertainty remains. Use sparingly (less than 20% of cases).
+ - **Medium Confidence (0.4-0.5)**: Noticeable differences but significant uncertainty. This should be your DEFAULT range.
+ - **Low Confidence (0.2-0.3)**: Very close comparison, difficult to distinguish. Responses are roughly equivalent or have conflicting strengths.
+ - **Very Low Confidence (0.0-0.1)**: Essentially indistinguishable responses or major conflicting strengths.
+
+ **IMPORTANT GUIDELINES**:
+ - DEFAULT to 0.3-0.5 range for most comparisons
+ - Only use 0.6+ when you are absolutely certain
+ - Consider: Could reasonable people disagree on this comparison?
+ - Consider: Are there any strengths in the "worse" response?
+ - Consider: How obvious would this be to a human evaluator?
+ - Remember: Quality assessment is inherently subjective
+
+ After your reasoning, you will provide a final numerical score, indicate which response is better, and assess your confidence. You must always output your response in the following structured JSON format:
+
+ {
+ "reasoning": {
+ "text_faithfulness": "YOUR REASONING HERE",
+ "image_faithfulness": "YOUR REASONING HERE",
+ "overall_image_quality": "YOUR REASONING HERE",
+ "text_rendering": "YOUR REASONING HERE",
+ "comparison_summary": "YOUR OVERALL COMPARISON SUMMARY HERE"
+ },
+ "score": <int 1-6>,
+ "better_response": "A" or "B",
+ "confidence": <float 0.0-1.0>,
+ "confidence_rationale": "YOUR CONFIDENCE ASSESSMENT REASONING HERE"
+ }"""
+
+
+ def _encode_image_to_base64(image_source: Union[str, bytes, PILImage.Image, io.BytesIO, dict[str, Any]]) -> str:
+     """
+     Encode an image to base64.
+
+     Args:
+         image_source: Either a file path (str), raw bytes, PIL.Image object, or BytesIO
+
+     Returns:
+         Base64 encoded string
+
+     Raises:
+         TypeError: If image_source type is not supported
+         ValueError: If image_source cannot be converted to bytes
+     """
+     image_bytes: bytes
+
+     if isinstance(image_source, str):
+         # It's a file path
+         with open(image_source, "rb") as f:
+             image_bytes = f.read()
+     elif isinstance(image_source, io.BytesIO):
+         # It's a BytesIO object
+         image_source.seek(0)
+         image_bytes = image_source.read()
+     elif isinstance(image_source, PILImage.Image):
+         # It's a PIL Image object (e.g., from HuggingFace datasets)
+         buffer = io.BytesIO()
+         image_source.save(buffer, format="PNG")
+         image_bytes = buffer.getvalue()
+     elif isinstance(image_source, dict):
+         # It's a dict (e.g., from HuggingFace datasets Image() type)
+         if "bytes" in image_source:
+             raw = image_source["bytes"]
+             if isinstance(raw, bytes):
+                 image_bytes = raw
+             elif isinstance(raw, io.BytesIO):
+                 raw.seek(0)
+                 image_bytes = raw.read()
+             else:
+                 # Recurse to handle nested types
+                 return _encode_image_to_base64(raw)
+         elif "path" in image_source and image_source["path"]:
+             with open(image_source["path"], "rb") as f:
+                 image_bytes = f.read()
+         else:
+             raise ValueError(f"Cannot extract image from dict: {image_source.keys()}")
+     elif isinstance(image_source, bytes):
+         # It's already bytes - MUST check after more specific types
+         image_bytes = image_source
+     else:
+         # Unknown type - raise error with helpful message
+         raise TypeError(
+             f"Unsupported image type: {type(image_source).__name__}. "
+             f"Expected str (path), bytes, PIL.Image, io.BytesIO, or dict. "
+             f"Got: {repr(image_source)[:200]}"
+         )
+
+     # Verify we have valid bytes before encoding
+     if not isinstance(image_bytes, bytes):
+         raise ValueError(
+             f"Failed to convert image to bytes. "
+             f"Got {type(image_bytes).__name__} instead. "
+             f"Original input was {type(image_source).__name__}"
+         )
+
+     return base64.b64encode(image_bytes).decode("utf-8")
+
+
+ def _get_image_media_type(image_source: Union[str, bytes, PILImage.Image]) -> str:
+     """
+     Determine the media type of an image.
+
+     Args:
+         image_source: Either a file path (str), raw bytes, or PIL.Image object
+
+     Returns:
+         Media type string (e.g., 'image/png')
+     """
+     if isinstance(image_source, str):
+         ext = image_source.lower().split('.')[-1]
+         media_types = {
+             'png': 'image/png',
+             'jpg': 'image/jpeg',
+             'jpeg': 'image/jpeg',
+             'webp': 'image/webp',
+             'gif': 'image/gif',
+         }
+         return media_types.get(ext, 'image/png')
+     elif isinstance(image_source, PILImage.Image):
+         # For PIL.Image, we convert to PNG
+         return 'image/png'
+     else:
+         # Try to detect from bytes magic
+         if image_source[:8] == b'\x89PNG\r\n\x1a\n':
+             return 'image/png'
+         elif image_source[:2] == b'\xff\xd8':
+             return 'image/jpeg'
+         elif image_source[:4] == b'RIFF' and image_source[8:12] == b'WEBP':
+             return 'image/webp'
+         else:
+             return 'image/png'
+
+
+ def _create_image_content(image_source: Union[str, bytes]) -> dict[str, Any]:
+     """
+     Create an image content block for OpenAI API.
+
+     Args:
+         image_source: Either a file path (str) or raw bytes
+
+     Returns:
+         Image content dict for OpenAI API
+     """
+     base64_data = _encode_image_to_base64(image_source)
+     media_type = _get_image_media_type(image_source)
+
+     return {
+         "type": "image_url",
+         "image_url": {
+             "url": f"data:{media_type};base64,{base64_data}"
+         }
+     }
+
+
+ def build_prompt(
+     instruction: str,
+     input_images: list[Union[str, bytes]],
+     output_image_a: Union[str, bytes],
+     output_image_b: Union[str, bytes]
+ ) -> list[dict[str, Any]]:
+     """
+     Build the VLM prompt messages for pairwise evaluation.
+
+     Constructs messages in the format:
+         [EVALUATION PROMPT TEXT]
+         [ORIGINAL PROMPT TO MODEL:]
+         {instruction and input_images}
+         [RESPONSE A:]
+         {output_image_a}
+         [RESPONSE B:]
+         {output_image_b}
+
+     Args:
+         instruction: The editing instruction given to models
+         input_images: List of input images (file paths or bytes)
+         output_image_a: Output from model A (file path or bytes)
+         output_image_b: Output from model B (file path or bytes)
+
+     Returns:
+         List of message dicts for OpenAI Chat Completion API
+     """
+     # Build content list
+     content = []
+
+     # 1. Evaluation prompt
+     content.append({
+         "type": "text",
+         "text": PROMPT_TEXT
+     })
+
+     # 2. Original prompt to model section
+     content.append({
+         "type": "text",
+         "text": "[ORIGINAL PROMPT TO MODEL:]"
+     })
+
+     # Add instruction text
+     content.append({
+         "type": "text",
+         "text": instruction
+     })
+
+     # Add input images if any
+     if input_images:
+         content.append({
+             "type": "text",
+             "text": "[INPUT IMAGE FROM PROMPT:]"
+         })
+         for img in input_images:
+             content.append(_create_image_content(img))
+
+     # 3. Response A
+     content.append({
+         "type": "text",
+         "text": "[RESPONSE A:]"
+     })
+     content.append(_create_image_content(output_image_a))
+
+     # 4. Response B
+     content.append({
+         "type": "text",
+         "text": "[RESPONSE B:]"
+     })
+     content.append(_create_image_content(output_image_b))
+
+     # Return as OpenAI API format
+     return [
+         {
+             "role": "user",
+             "content": content
+         }
+     ]
+
+
+ def parse_response(response: str) -> dict[str, Any]:
+     """
+     Parse the VLM judge response.
+
+     Extracts structured information from VLM's JSON response,
+     handling markdown code blocks and minor JSON errors.
+
+     Args:
+         response: Raw response text from VLM
+
+     Returns:
+         Dict containing:
+         - winner: "A" or "B" (from better_response field)
+         - score: int 1-6
+         - confidence: float 0.0-1.0
+         - reasoning: dict with evaluation criteria
+         - raw_response: the original parsed JSON
+
+     Raises:
+         ValueError: If response cannot be parsed
+     """
+     # Remove markdown code block formatting
+     text = response.strip()
+     text = re.sub(r"^```(?:json)?\s*\n?", "", text)
+     text = re.sub(r"\n?```\s*$", "", text)
+
+     # Try to parse JSON with json_repair for fault tolerance
+     try:
+         parsed = json_repair.loads(text)
+     except Exception as e:
+         raise ValueError(f"Failed to parse JSON response: {e}\nResponse was:\n{response}")
+
+     # Extract fields
+     better_response = parsed.get("better_response", "")
+
+     # Normalize winner to uppercase
+     if isinstance(better_response, str):
+         winner = better_response.upper().strip()
+         if winner not in ("A", "B"):
+             # Try to extract from text
+             if "A" in winner:
+                 winner = "A"
+             elif "B" in winner:
+                 winner = "B"
+             else:
+                 raise ValueError(f"Invalid better_response value: {better_response}")
+     else:
+         raise ValueError(f"better_response must be a string, got: {type(better_response)}")
+
+     # Extract score (1-6)
+     score = parsed.get("score", 4)
+     if isinstance(score, str):
+         score = int(score)
+     score = max(1, min(6, score))
+
+     # Extract confidence (0.0-1.0)
+     confidence = parsed.get("confidence", 0.5)
+     if isinstance(confidence, str):
+         confidence = float(confidence)
+     confidence = max(0.0, min(1.0, confidence))
+
+     # Extract reasoning
+     reasoning = parsed.get("reasoning", {})
+
+     return {
+         "winner": winner,
+         "score": score,
+         "confidence": confidence,
+         "reasoning": reasoning,
+         "raw_response": parsed
+     }
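
Usage note: the new genarena/prompts/mmrb2.py module exposes two public helpers, build_prompt() and parse_response(), whose signatures appear in the hunk above. The sketch below shows one way they could be wired to an OpenAI-compatible chat-completions client; the client setup, the judge model name, and the image file paths are placeholders chosen for illustration and are not part of the package.

# Minimal usage sketch (assumptions: an OpenAI-compatible endpoint, a
# placeholder judge model name, and local image files supplied by the caller).
from openai import OpenAI

from genarena.prompts.mmrb2 import build_prompt, parse_response

client = OpenAI()  # credentials/base URL taken from the environment

# Build the pairwise-judging messages for one editing instruction.
messages = build_prompt(
    instruction="Replace the sky with a sunset.",   # example instruction
    input_images=["input.png"],                     # file paths or raw bytes
    output_image_a="model_a_output.png",
    output_image_b="model_b_output.png",
)

# Send to a vision-capable judge model (placeholder name) and parse the
# structured JSON verdict into winner/score/confidence/reasoning.
completion = client.chat.completions.create(
    model="judge-vlm-placeholder",
    messages=messages,
)
verdict = parse_response(completion.choices[0].message.content)
print(verdict["winner"], verdict["score"], verdict["confidence"])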