genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/validation/validator.py
@@ -0,0 +1,329 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """
+ Validator for GenArena submissions.
+
+ This module provides functions to validate submission files,
+ including downloading and verifying data from HuggingFace.
+ Used by the GitHub Actions bot for automated validation.
+ """
+
+ import hashlib
+ import json
+ import logging
+ import os
+ import tempfile
+ import zipfile
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+ from genarena.validation.schema import validate_submission_schema
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ValidationCheck:
+     """Single validation check result."""
+
+     name: str
+     passed: bool
+     error: Optional[str] = None
+
+
+ @dataclass
+ class ValidationReport:
+     """Complete validation report for a submission."""
+
+     status: str  # "success" or "failed"
+     submission_id: str = ""
+     exp_name: str = ""
+     subset: str = ""
+     models: list[str] = field(default_factory=list)
+     new_models: list[str] = field(default_factory=list)
+     total_battles: int = 0
+     checks: list[ValidationCheck] = field(default_factory=list)
+     elo_comparison: dict[str, dict[str, float]] = field(default_factory=dict)
+     errors: list[str] = field(default_factory=list)
+
+     def add_check(self, name: str, passed: bool, error: Optional[str] = None) -> None:
+         """Add a validation check result."""
+         self.checks.append(ValidationCheck(name=name, passed=passed, error=error))
+         if not passed:
+             self.status = "failed"
+             if error:
+                 self.errors.append(f"{name}: {error}")
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "status": self.status,
+             "submission_id": self.submission_id,
+             "exp_name": self.exp_name,
+             "subset": self.subset,
+             "models": self.models,
+             "new_models": self.new_models,
+             "total_battles": self.total_battles,
+             "checks": [
+                 {"name": c.name, "passed": c.passed, "error": c.error}
+                 for c in self.checks
+             ],
+             "elo_comparison": self.elo_comparison,
+             "errors": self.errors,
+         }
+
+
+ def validate_submission_file(
+     submission_path: str,
+     official_models_path: Optional[str] = None,
+     download_data: bool = True,
+ ) -> ValidationReport:
+     """
+     Validate a submission JSON file.
+
+     This is the main entry point for validating submissions,
+     used by the GitHub Actions bot.
+
+     Args:
+         submission_path: Path to submission JSON file
+         official_models_path: Path to official_models.json (optional)
+         download_data: Whether to download and verify data from HF
+
+     Returns:
+         ValidationReport with all check results
+     """
+     report = ValidationReport(status="success")
+
+     # 1. Load and parse JSON
+     try:
+         with open(submission_path, "r", encoding="utf-8") as f:
+             submission = json.load(f)
+         report.add_check("JSON parse", True)
+     except json.JSONDecodeError as e:
+         report.add_check("JSON parse", False, str(e))
+         return report
+     except IOError as e:
+         report.add_check("File read", False, str(e))
+         return report
+
+     # 2. Schema validation
+     is_valid, schema_errors = validate_submission_schema(submission)
+     if is_valid:
+         report.add_check("Schema validation", True)
+     else:
+         for err in schema_errors:
+             report.add_check("Schema validation", False, err)
+         return report
+
+     # Extract basic info
+     report.submission_id = submission.get("submission_id", "")
+     exp = submission.get("experiment", {})
+     report.exp_name = exp.get("exp_name", "")
+     report.subset = exp.get("subset", "")
+     report.models = exp.get("models", [])
+     report.new_models = exp.get("new_models", [])
+     report.total_battles = exp.get("total_battles", 0)
+
+     # 3. Check new models against official list
+     if official_models_path and os.path.isfile(official_models_path):
+         try:
+             with open(official_models_path, "r", encoding="utf-8") as f:
+                 official_data = json.load(f)
+             official_models = set(
+                 official_data.get("subsets", {})
+                 .get(report.subset, {})
+                 .get("models", [])
+             )
+
+             # Verify new_models are actually new
+             for model in report.new_models:
+                 if model in official_models:
+                     report.add_check(
+                         f"Model '{model}' is new",
+                         False,
+                         "Model already exists in official leaderboard",
+                     )
+                 else:
+                     report.add_check(f"Model '{model}' is new", True)
+
+         except Exception as e:
+             report.add_check(
+                 "Check official models", False, f"Failed to load official models: {e}"
+             )
+     else:
+         report.add_check(
+             "Check official models",
+             True,
+             "Skipped (no official_models.json provided)",
+         )
+
+     # 4. Download and verify data from HuggingFace
+     if download_data:
+         data_report = validate_submission_data(submission)
+         for check in data_report.checks:
+             report.checks.append(check)
+             if not check.passed:
+                 report.status = "failed"
+                 if check.error:
+                     report.errors.append(f"{check.name}: {check.error}")
+         report.elo_comparison = data_report.elo_comparison
+     else:
+         report.add_check("Data verification", True, "Skipped (download_data=False)")
+
+     return report
+
+
+ def validate_submission_data(submission: dict[str, Any]) -> ValidationReport:
+     """
+     Download and validate submission data from HuggingFace.
+
+     Downloads the pk_logs ZIP, verifies checksum, extracts battles,
+     and recalculates ELO for comparison.
+
+     Args:
+         submission: Submission metadata dictionary
+
+     Returns:
+         ValidationReport with data validation results
+     """
+     report = ValidationReport(status="success")
+
+     data_loc = submission.get("data_location", {})
+     hf_repo = data_loc.get("hf_repo_id", "")
+     hf_revision = data_loc.get("hf_revision", "main")
+     files = data_loc.get("files", {})
+     pk_logs_info = files.get("pk_logs_zip", {})
+
+     if not hf_repo or not pk_logs_info:
+         report.add_check("Data location", False, "Missing HF repo or file info")
+         return report
+
+     try:
+         from huggingface_hub import hf_hub_download
+     except ImportError:
+         report.add_check(
+             "HuggingFace Hub",
+             False,
+             "huggingface_hub not installed",
+         )
+         return report
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # Download pk_logs ZIP
+         try:
+             pk_logs_path = hf_hub_download(
+                 repo_id=hf_repo,
+                 filename=pk_logs_info["path"],
+                 repo_type="dataset",
+                 revision=hf_revision,
+                 local_dir=tmpdir,
+             )
+             report.add_check("Download pk_logs", True)
+         except Exception as e:
+             report.add_check("Download pk_logs", False, str(e))
+             return report
+
+         # Verify SHA256
+         expected_sha = pk_logs_info.get("sha256", "")
+         try:
+             with open(pk_logs_path, "rb") as f:
+                 actual_sha = hashlib.sha256(f.read()).hexdigest()
+
+             if actual_sha == expected_sha:
+                 report.add_check("SHA256 checksum", True)
+             else:
+                 report.add_check(
+                     "SHA256 checksum",
+                     False,
+                     f"Expected {expected_sha[:16]}..., got {actual_sha[:16]}...",
+                 )
+                 return report
+         except Exception as e:
+             report.add_check("SHA256 checksum", False, str(e))
+             return report
+
+         # Extract ZIP
+         extract_dir = os.path.join(tmpdir, "extracted")
+         try:
+             with zipfile.ZipFile(pk_logs_path, "r") as zf:
+                 zf.extractall(extract_dir)
+             report.add_check("Extract ZIP", True)
+         except Exception as e:
+             report.add_check("Extract ZIP", False, str(e))
+             return report
+
+         # Find battle log files
+         # The ZIP structure is: <exp_name>/*.jsonl
+         battle_records = []
+         try:
+             for root, dirs, filenames in os.walk(extract_dir):
+                 for filename in filenames:
+                     if filename.endswith(".jsonl") and "raw_outputs" not in root:
+                         filepath = os.path.join(root, filename)
+                         with open(filepath, "r", encoding="utf-8") as f:
+                             for line in f:
+                                 line = line.strip()
+                                 if line:
+                                     try:
+                                         record = json.loads(line)
+                                         battle_records.append(record)
+                                     except json.JSONDecodeError:
+                                         continue
+             report.add_check("Parse battle logs", True)
+         except Exception as e:
+             report.add_check("Parse battle logs", False, str(e))
+             return report
+
+         # Verify battle count
+         expected_battles = submission.get("experiment", {}).get("total_battles", 0)
+         if len(battle_records) == expected_battles:
+             report.add_check("Battle count", True)
+         else:
+             report.add_check(
+                 "Battle count",
+                 False,
+                 f"Expected {expected_battles}, got {len(battle_records)}",
+             )
+
+         # Recalculate ELO
+         try:
+             from genarena.bt_elo import compute_bt_elo_ratings
+
+             battles = [
+                 (r["model_a"], r["model_b"], r["final_winner"])
+                 for r in battle_records
+                 if r.get("model_a") and r.get("model_b") and r.get("final_winner")
+             ]
+
+             if battles:
+                 recalc_elo = compute_bt_elo_ratings(battles)
+                 submitted_elo = submission.get("elo_preview", {}).get("ratings", {})
+
+                 all_match = True
+                 for model, submitted_rating in submitted_elo.items():
+                     recalc_rating = recalc_elo.get(model, 0)
+                     report.elo_comparison[model] = {
+                         "submitted": submitted_rating,
+                         "recalculated": recalc_rating,
+                     }
+
+                     # Allow small floating point differences (±1.0)
+                     diff = abs(submitted_rating - recalc_rating)
+                     if diff > 1.0:
+                         report.add_check(
+                             f"ELO '{model}'",
+                             False,
+                             f"Diff: {diff:.1f} (submitted: {submitted_rating:.1f}, "
+                             f"recalc: {recalc_rating:.1f})",
+                         )
+                         all_match = False
+
+                 if all_match:
+                     report.add_check("ELO verification", True)
+
+         except Exception as e:
+             report.add_check("ELO verification", False, str(e))
+
+     return report
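The validator above is self-contained enough to run outside the GitHub Actions bot. A minimal sketch of driving it locally, assuming the paths are placeholders and skipping the HuggingFace download step:

```python
import json

from genarena.validation.validator import validate_submission_file

# Placeholder paths; point these at a real submission and official model list.
report = validate_submission_file(
    "submission.json",
    official_models_path="official_models.json",
    download_data=False,  # set True to also run validate_submission_data()
)

# The report serializes to plain JSON, which is how the bot would post it.
print(json.dumps(report.to_dict(), indent=2))
for check in report.checks:
    print("PASS" if check.passed else "FAIL", "-", check.name, check.error or "")
```

With `download_data=True`, the same call also exercises the SHA256, ZIP-extraction, battle-count, and ELO checks from `validate_submission_data`.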
genarena/visualize/README.md
@@ -0,0 +1,148 @@
+ # GenArena Arena Visualizer
+
+ A web-based visualization tool for browsing and analyzing battle records from GenArena Arena evaluations.
+
+ ## Features
+
+ - **Multi-subset Support**: Select and switch between different subsets directly in the web interface
+ - **Paginated Browsing**: Efficiently browse large numbers of battle records with pagination
+ - **Flexible Filtering**:
+   - Filter by model (view all battles involving a specific model)
+   - Filter by result (wins/losses/ties for a selected model)
+   - Filter by consistency (consistent vs inconsistent VLM judgments)
+ - **Detailed Battle View**: Click any battle card to see:
+   - Full instruction text
+   - Input image and both model outputs side-by-side
+   - Complete VLM judge reasoning (original and swapped calls)
+ - **Dark Theme**: Modern dark UI designed for extended analysis sessions
+
+ ## Installation
+
+ The visualizer requires Flask:
+
+ ```bash
+ pip install flask
+ ```
+
+ ## Usage
+
+ ### Command Line
+
+ Start the visualization server using the `genarena serve` command:
+
+ ```bash
+ genarena serve \
+     --arena_dir /path/to/arena \
+     --data_dir /path/to/data \
+     --port 8080 \
+     --host 0.0.0.0
+ ```
+
+ ### Arguments
+
+ | Argument | Required | Default | Description |
+ |----------|----------|---------|-------------|
+ | `--arena_dir` | Yes | - | Path to the arena directory containing subset folders with battle logs |
+ | `--data_dir` | Yes | - | Path to the data directory containing parquet files |
+ | `--host` | No | `0.0.0.0` | Host address to bind the server |
+ | `--port` | No | `8080` | Port number to listen on |
+ | `--debug` | No | `False` | Enable Flask debug mode |
+
+ ### Example
+
+ ```bash
+ genarena serve \
+     --arena_dir /projects/genarena/arena \
+     --data_dir /datasets/genarena/data \
+     --port 8080
+ ```
+
+ Then open `http://localhost:8080` in your browser.
+
+ ## Web Interface
+
+ ### Navigation
+
+ 1. **Select Subset**: Use the dropdown in the header to choose a subset
+ 2. **Select Experiment**: Choose an experiment from the dropdown (populated after subset selection)
+ 3. **Browse Battles**: Scroll through the paginated battle cards
+
+ ### Filtering
+
+ Use the sidebar filters to narrow down results:
+
+ - **Model Filter**: Show only battles involving a specific model
+ - **Result Filter**: When a model is selected, filter by wins/losses/ties
+ - **Consistency Filter**: Show only consistent or inconsistent judgments
+
+ ### Battle Cards
+
+ Each card displays:
+ - Model names (winner highlighted in green, loser in red)
+ - Instruction text (truncated)
+ - Thumbnail images: input, model A output, model B output
+ - Result badges (Win/Loss/Tie, Consistent/Inconsistent)
+
+ Click a card to open the detail modal.
+
+ ### Detail Modal
+
+ The detail view shows:
+ - Full instruction text
+ - Large images for comparison
+ - Complete VLM judge outputs for both calls (original order and swapped order)
+ - Parse results and winner determination
+
+ ### Keyboard Shortcuts
+
+ | Key | Action |
+ |-----|--------|
+ | `j` / `↓` | Next page |
+ | `k` / `↑` | Previous page |
+ | `Esc` | Close detail modal |
+
+ ## API Endpoints
+
+ The visualizer exposes a REST API that can be used programmatically:
+
+ | Endpoint | Description |
+ |----------|-------------|
+ | `GET /api/subsets` | List available subsets |
+ | `GET /api/subsets/<subset>/info` | Get subset info (models, experiments) |
+ | `GET /api/subsets/<subset>/experiments/<exp>/battles` | Get paginated battles |
+ | `GET /api/subsets/<subset>/experiments/<exp>/battles/<id>` | Get battle detail |
+ | `GET /api/subsets/<subset>/stats` | Get statistics |
+ | `GET /images/<subset>/<model>/<index>` | Serve model output image |
+ | `GET /images/<subset>/input/<index>` | Serve input image |
+
+ ### Query Parameters for `/battles`
+
+ | Parameter | Type | Description |
+ |-----------|------|-------------|
+ | `page` | int | Page number (1-indexed) |
+ | `page_size` | int | Records per page (default: 20) |
+ | `model` | string | Filter by model name |
+ | `result` | string | Filter by result: `wins`, `losses`, `ties` |
+ | `consistent` | string | Filter by consistency: `true`, `false` |
+
+ ## Directory Structure
+
+ ```
+ visualize/
+ ├── __init__.py          # Package exports
+ ├── app.py               # Flask application and routes
+ ├── data_loader.py       # Data loading and querying logic
+ ├── templates/
+ │   └── index.html       # Main page template
+ └── static/
+     ├── style.css        # Dark theme styles
+     └── app.js           # Frontend JavaScript
+ ```
+
+ ## Requirements
+
+ - Python 3.9+ (the package uses built-in generic annotations such as `list[str]`)
+ - Flask
+ - GenArena arena with battle logs (pk_logs directory)
+ - Parquet dataset with evaluation data
+
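The README documents the REST endpoints but not their response payloads, so a client should treat them as opaque JSON. A quick sketch using only the standard library, assuming a server started as in the Usage section; the subset and experiment names are placeholders:

```python
import json
from urllib.parse import quote, urlencode
from urllib.request import urlopen

BASE = "http://localhost:8080"  # a locally running `genarena serve` instance

def get_json(path: str, **params):
    """Fetch a visualizer API endpoint and decode the JSON body."""
    url = f"{BASE}{path}"
    if params:
        url += "?" + urlencode(params)
    with urlopen(url) as resp:
        return json.load(resp)

# "my_subset" / "my_exp" are placeholders for real names from /api/subsets.
print(get_json("/api/subsets"))
battles = get_json(
    f"/api/subsets/{quote('my_subset')}/experiments/{quote('my_exp')}/battles",
    page=1,
    page_size=20,
    consistent="true",
)
print(json.dumps(battles, indent=2)[:500])
```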
genarena/visualize/__init__.py
@@ -0,0 +1,14 @@
+ """
+ GenArena Arena Visualization Module.
+
+ Provides a web-based interface for browsing and analyzing battle records.
+ """
+
+ from genarena.visualize.app import create_app
+ from genarena.visualize.data_loader import ArenaDataLoader
+
+ __all__ = [
+     "create_app",
+     "ArenaDataLoader",
+ ]
+
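These exports suggest the Flask app can be embedded rather than launched through `genarena serve`. A sketch under the assumption that `create_app` accepts the same directories as the CLI's `--arena_dir`/`--data_dir` flags; its actual signature lives in `genarena/visualize/app.py` and is not shown in this diff:

```python
from genarena.visualize import create_app

# Hypothetical keyword arguments mirroring the CLI flags; verify against
# the real create_app() signature in genarena/visualize/app.py.
app = create_app(
    arena_dir="/projects/genarena/arena",
    data_dir="/datasets/genarena/data",
)
app.run(host="0.0.0.0", port=8080)  # Flask's built-in development server
```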