genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +25 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2335 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.0.dist-info/METADATA +178 -0
- genarena-0.1.0.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
- genarena-0.1.0.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/utils.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Copyright 2026 Ruihang Li.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0.
|
|
3
|
+
# See LICENSE file in the project root for details.
|
|
4
|
+
|
|
5
|
+
"""Utility functions for genarena."""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import threading
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Thread lock for directory creation
|
|
14
|
+
_dir_lock = threading.Lock()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def sanitize_name(name: str) -> str:
    """
    Turn a name into a string that is safe to embed in a file name.

    Each of the characters ``/ \\ : * ? " < > |`` becomes an underscore,
    runs of underscores are collapsed into one, and underscores at either
    end of the result are dropped.

    Args:
        name: Raw name, for example a model path such as "org/model-name"

    Returns:
        The cleaned-up name (may be empty if nothing but separators remain)
    """
    # Map every reserved character to "_" in a single pass
    replaced = name.translate(str.maketrans({ch: "_" for ch in '/\\:*?"<>|'}))
    # Splitting on "_" and discarding the empty pieces both collapses repeated
    # underscores and trims them from the two ends
    return "_".join(piece for piece in replaced.split("_") if piece)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def ensure_dir(path: str) -> None:
    """
    Make sure a directory exists, creating it (and any parents) when absent.

    The existence check and the creation both run while holding the
    module-level ``_dir_lock``, so callers in this process are serialized.

    Args:
        path: Directory path to create
    """
    with _dir_lock:
        if os.path.exists(path):
            # Something already lives at this path; leave it untouched
            return
        os.makedirs(path, exist_ok=True)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def timestamp() -> str:
    """
    Format the current local time as YYYYMMDD_HHMM.

    Returns:
        Timestamp string with minute resolution
    """
    now = datetime.now()
    # A datetime format spec is handed to strftime, so this yields the same
    # text as calling now.strftime(...) directly
    return f"{now:%Y%m%d_%H%M}"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_sorted_model_pair(model_a: str, model_b: str) -> tuple[str, str, bool]:
    """
    Put two model names into ascending string order.

    Callers use the ordered pair so that a match-up maps to the same name
    no matter which model was passed first.

    Args:
        model_a: First model name
        model_b: Second model name

    Returns:
        Tuple of (smaller_name, larger_name, swapped); swapped is True only
        when the inputs had to be exchanged to obtain that order
    """
    swapped = model_a > model_b
    if swapped:
        return model_b, model_a, True
    # Already ordered (or equal): hand the names back unchanged
    return model_a, model_b, False
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_battle_log_filename(model_a: str, model_b: str) -> str:
    """
    Build the log file name shared by both orderings of a model pair.

    The names are ordered with ``get_sorted_model_pair`` first and then each
    one is passed through ``sanitize_name``.

    Args:
        model_a: First model name
        model_b: Second model name

    Returns:
        Filename in format "<sorted_model_a>_vs_<sorted_model_b>.jsonl"
    """
    ordered = get_sorted_model_pair(model_a, model_b)
    left = sanitize_name(ordered[0])
    right = sanitize_name(ordered[1])
    return f"{left}_vs_{right}.jsonl"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def iso_timestamp() -> str:
    """
    Generate an ISO 8601 timestamp string for the current time in UTC.

    Returns:
        ISO format timestamp string (e.g., "2026-01-16T10:30:00Z")
    """
    # Imported here because the module's top-level import only brings in
    # `datetime` itself.
    from datetime import timezone

    # datetime.utcnow() is deprecated (Python 3.12+); an aware "now" in UTC
    # formats to exactly the same text without the deprecation warning.
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Validation module for GenArena submissions."""
|
|
2
|
+
|
|
3
|
+
from genarena.validation.schema import (
|
|
4
|
+
SUBMISSION_SCHEMA,
|
|
5
|
+
validate_submission_schema,
|
|
6
|
+
)
|
|
7
|
+
from genarena.validation.validator import (
|
|
8
|
+
validate_submission_file,
|
|
9
|
+
validate_submission_data,
|
|
10
|
+
ValidationReport,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"SUBMISSION_SCHEMA",
|
|
15
|
+
"validate_submission_schema",
|
|
16
|
+
"validate_submission_file",
|
|
17
|
+
"validate_submission_data",
|
|
18
|
+
"ValidationReport",
|
|
19
|
+
]
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
# Copyright 2026 Ruihang Li.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0.
|
|
3
|
+
# See LICENSE file in the project root for details.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
JSON Schema definition for GenArena submissions.
|
|
7
|
+
|
|
8
|
+
This schema defines the structure of submission metadata files
|
|
9
|
+
that are submitted via GitHub PR to the official leaderboard.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
# JSON Schema for submission metadata
|
|
15
|
+
# Schema dictionary describing a submission metadata document. It declares
# itself as JSON Schema draft 2020-12 via the "$schema" key.
SUBMISSION_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "GenArena Submission",
    "description": "Metadata for a GenArena evaluation submission",
    "type": "object",
    # Top-level keys every submission must provide
    "required": [
        "schema_version",
        "submission_id",
        "created_at",
        "submitter",
        "experiment",
        "data_location",
        "elo_preview",
    ],
    "properties": {
        # --- Identification -------------------------------------------------
        "schema_version": {
            "type": "string",
            "description": "Schema version (e.g., '1.0')",
            "pattern": "^\\d+\\.\\d+$",
        },
        "submission_id": {
            "type": "string",
            "description": "Unique submission identifier",
            # "sub_" + 8 digits + "T" + 6 digits + "_" + 8 lowercase hex chars
            "pattern": "^sub_\\d{8}T\\d{6}_[a-f0-9]{8}$",
        },
        "created_at": {
            "type": "string",
            "description": "ISO 8601 timestamp of submission creation",
            # NOTE(review): whether "format" keywords are enforced depends on
            # how the validator is invoked -- confirm against the caller.
            "format": "date-time",
        },
        # --- Who submitted --------------------------------------------------
        "submitter": {
            "type": "object",
            "required": ["github_username"],
            "properties": {
                "github_username": {
                    "type": "string",
                    "description": "GitHub username of submitter",
                    "minLength": 1,
                },
                "contact": {
                    "type": "string",
                    "description": "Optional contact email",
                    "format": "email",
                },
            },
        },
        # --- What was evaluated ---------------------------------------------
        "experiment": {
            "type": "object",
            "required": [
                "exp_name",
                "subset",
                "models",
                "new_models",
                "total_battles",
            ],
            "properties": {
                "exp_name": {
                    "type": "string",
                    "description": "Experiment name (must end with _yyyymmdd)",
                    "pattern": "^.+_\\d{8}$",
                },
                "subset": {
                    "type": "string",
                    "description": "Subset name (e.g., 'basic')",
                    "minLength": 1,
                },
                "models": {
                    "type": "array",
                    "description": "List of all model names in the experiment",
                    "items": {"type": "string"},
                    "minItems": 2,
                },
                "new_models": {
                    "type": "array",
                    "description": "List of new model names (not in official leaderboard)",
                    "items": {"type": "string"},
                    "minItems": 1,
                },
                "existing_models": {
                    "type": "array",
                    "description": "List of existing model names (already in official)",
                    "items": {"type": "string"},
                },
                "model_pairs": {
                    "type": "array",
                    "description": "List of model pairs evaluated",
                    # Each pair is an array of exactly two strings
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
                "total_battles": {
                    "type": "integer",
                    "description": "Total number of battles",
                    "minimum": 1,
                },
                "battles_per_pair": {
                    "type": "object",
                    "description": "Battle count per model pair",
                    "additionalProperties": {"type": "integer"},
                },
            },
        },
        # --- Where the data lives -------------------------------------------
        "data_location": {
            "type": "object",
            "required": ["hf_repo_id", "files"],
            "properties": {
                "hf_repo_id": {
                    "type": "string",
                    "description": "HuggingFace repository ID",
                    # Two segments of word characters, dots or hyphens,
                    # separated by a single "/"
                    "pattern": "^[\\w.-]+/[\\w.-]+$",
                },
                "hf_revision": {
                    "type": "string",
                    "description": "HuggingFace revision/branch",
                    "default": "main",
                },
                "files": {
                    "type": "object",
                    "required": ["models_zip", "pk_logs_zip"],
                    "properties": {
                        # Both entries share the "file_info" definition
                        # declared under "$defs" at the end of this schema
                        "models_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                        "pk_logs_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                    },
                },
            },
        },
        # --- Rating preview -------------------------------------------------
        "elo_preview": {
            "type": "object",
            "required": ["ratings"],
            "properties": {
                "ratings": {
                    "type": "object",
                    "description": "ELO ratings by model",
                    "additionalProperties": {"type": "number"},
                },
                "ci_95": {
                    "type": "object",
                    "description": "95% confidence intervals by model",
                    # Each interval is an array of exactly two numbers
                    "additionalProperties": {
                        "type": "array",
                        "items": {"type": "number"},
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
            },
        },
        # --- Sections below are not listed in the top-level "required" ------
        "evaluation_config": {
            "type": "object",
            "description": "Evaluation configuration used",
            "properties": {
                "judge_model": {
                    "type": "string",
                    "description": "VLM judge model name",
                },
                "prompt_module": {
                    "type": "string",
                    "description": "Prompt module name",
                },
                "temperature": {
                    "type": "number",
                    "description": "VLM temperature",
                    "minimum": 0,
                },
                "position_debiasing": {
                    "type": "boolean",
                    "description": "Whether position debiasing was used",
                },
            },
        },
        "title": {
            "type": "string",
            "description": "Submission title",
        },
        "description": {
            "type": "string",
            "description": "Submission description",
        },
        "verification": {
            "type": "object",
            "properties": {
                "local_validation_passed": {
                    "type": "boolean",
                    "description": "Whether local validation passed",
                },
                "genarena_version": {
                    "type": "string",
                    "description": "genarena version used for submission",
                },
            },
        },
    },
    # Reusable definitions referenced through "$ref"
    "$defs": {
        "file_info": {
            "type": "object",
            "required": ["path", "sha256", "size_bytes"],
            "properties": {
                "path": {
                    "type": "string",
                    "description": "File path in HF repo",
                },
                "sha256": {
                    "type": "string",
                    "description": "SHA256 checksum",
                    # 64 lowercase hexadecimal characters
                    "pattern": "^[a-f0-9]{64}$",
                },
                "size_bytes": {
                    "type": "integer",
                    "description": "File size in bytes",
                    "minimum": 1,
                },
            },
        },
    },
}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def validate_submission_schema(submission: dict[str, Any]) -> tuple[bool, list[str]]:
    """
    Validate submission against JSON schema.

    When the optional ``jsonschema`` package is importable, the submission is
    checked against ``SUBMISSION_SCHEMA`` and every violation found is
    reported. Otherwise the check falls back to ``_basic_validation``.

    Args:
        submission: Submission metadata dictionary

    Returns:
        Tuple of (is_valid, list of error messages). For each violation the
        list holds a "Schema validation error: ..." entry, followed by an
        "at path: ..." entry when the violation is not at the document root.
    """
    try:
        import jsonschema
    except ImportError:
        # If jsonschema is not available, do basic validation
        return _basic_validation(submission)

    errors: list[str] = []

    # Pick the validator class from the schema's "$schema" key and check the
    # schema itself first; these are the same steps jsonschema.validate() runs.
    try:
        validator_cls = jsonschema.validators.validator_for(SUBMISSION_SCHEMA)
        validator_cls.check_schema(SUBMISSION_SCHEMA)
    except jsonschema.SchemaError as e:
        errors.append(f"Schema error: {e.message}")
        return False, errors

    # jsonschema.validate() raises only one (best-match) error. Iterating
    # reports every violation, which is what the fallback path already does.
    # Sorting by the stringified path keeps the output order deterministic.
    violations = sorted(
        validator_cls(SUBMISSION_SCHEMA).iter_errors(submission),
        key=lambda err: [str(part) for part in err.path],
    )
    for violation in violations:
        errors.append(f"Schema validation error: {violation.message}")
        if violation.path:
            errors.append(f" at path: {'.'.join(str(p) for p in violation.path)}")

    return not errors, errors
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _basic_validation(submission: dict[str, Any]) -> tuple[bool, list[str]]:
    """
    Basic validation without jsonschema library.

    Only the presence of required keys is checked, plus that
    ``experiment.new_models`` is non-empty. Values of the wrong type are
    reported as errors instead of raising, so malformed input always yields
    a ``(False, errors)`` result.

    Args:
        submission: Submission metadata dictionary

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    if not isinstance(submission, dict):
        return False, ["Submission must be a JSON object"]

    errors: list[str] = [
        f"Missing required field: {field}"
        for field in (
            "schema_version",
            "submission_id",
            "created_at",
            "submitter",
            "experiment",
            "data_location",
            "elo_preview",
        )
        if field not in submission
    ]
    # Without the top-level sections there is nothing further to inspect
    if errors:
        return False, errors

    # Check submitter
    submitter = submission["submitter"]
    if not isinstance(submitter, dict):
        # A plain `in` test on a string would be a substring match and on
        # None would raise, so the type is checked first
        errors.append("submitter must be an object")
    elif "github_username" not in submitter:
        errors.append("Missing submitter.github_username")

    # Check experiment
    experiment = submission["experiment"]
    if not isinstance(experiment, dict):
        errors.append("experiment must be an object")
    else:
        errors.extend(
            f"Missing experiment.{field}"
            for field in ("exp_name", "subset", "models", "new_models", "total_battles")
            if field not in experiment
        )
        # Check new_models is not empty (also fires when the key is absent)
        if not experiment.get("new_models"):
            errors.append("experiment.new_models must have at least one model")

    # Check data_location
    data_location = submission["data_location"]
    if not isinstance(data_location, dict):
        errors.append("data_location must be an object")
    else:
        if "hf_repo_id" not in data_location:
            errors.append("Missing data_location.hf_repo_id")
        if "files" not in data_location:
            errors.append("Missing data_location.files")
        elif not isinstance(data_location["files"], dict):
            errors.append("data_location.files must be an object")
        else:
            files = data_location["files"]
            for zip_type in ("models_zip", "pk_logs_zip"):
                if zip_type not in files:
                    errors.append(f"Missing data_location.files.{zip_type}")
                    continue
                file_info = files[zip_type]
                if not isinstance(file_info, dict):
                    errors.append(f"data_location.files.{zip_type} must be an object")
                    continue
                errors.extend(
                    f"Missing data_location.files.{zip_type}.{field}"
                    for field in ("path", "sha256", "size_bytes")
                    if field not in file_info
                )

    # Check elo_preview
    elo_preview = submission["elo_preview"]
    if not isinstance(elo_preview, dict):
        errors.append("elo_preview must be an object")
    elif "ratings" not in elo_preview:
        errors.append("Missing elo_preview.ratings")

    return not errors, errors
|