genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/utils.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """Utility functions for genarena."""
6
+
import os
import re
import threading
from datetime import datetime, timezone
11
+
12
+
13
+ # Thread lock for directory creation
14
+ _dir_lock = threading.Lock()
15
+
16
+
17
+ def sanitize_name(name: str) -> str:
18
+ """
19
+ Sanitize a model name for use in file paths.
20
+
21
+ Replaces special characters (/, \\, :, etc.) with underscores.
22
+
23
+ Args:
24
+ name: The original name (e.g., model path like "org/model-name")
25
+
26
+ Returns:
27
+ Sanitized name safe for file system use
28
+ """
29
+ # Replace common path separators and special characters
30
+ sanitized = re.sub(r'[/\\:*?"<>|]', '_', name)
31
+ # Replace multiple underscores with single one
32
+ sanitized = re.sub(r'_+', '_', sanitized)
33
+ # Strip leading/trailing underscores
34
+ sanitized = sanitized.strip('_')
35
+ return sanitized
36
+
37
+
38
+ def ensure_dir(path: str) -> None:
39
+ """
40
+ Create a directory if it doesn't exist (thread-safe).
41
+
42
+ Args:
43
+ path: Directory path to create
44
+ """
45
+ with _dir_lock:
46
+ if not os.path.exists(path):
47
+ os.makedirs(path, exist_ok=True)
48
+
49
+
50
+ def timestamp() -> str:
51
+ """
52
+ Generate a timestamp string in YYYYMMDD_HHMM format.
53
+
54
+ Returns:
55
+ Timestamp string
56
+ """
57
+ return datetime.now().strftime("%Y%m%d_%H%M")
58
+
59
+
60
+ def get_sorted_model_pair(model_a: str, model_b: str) -> tuple[str, str, bool]:
61
+ """
62
+ Sort a model pair alphabetically to ensure consistent file naming.
63
+
64
+ This ensures that battles between model_a and model_b always use
65
+ the same log file regardless of which model is passed as first argument.
66
+
67
+ Args:
68
+ model_a: First model name
69
+ model_b: Second model name
70
+
71
+ Returns:
72
+ Tuple of (sorted_first, sorted_second, swapped) where swapped is True
73
+ if the order was changed
74
+ """
75
+ if model_a <= model_b:
76
+ return model_a, model_b, False
77
+ else:
78
+ return model_b, model_a, True
79
+
80
+
81
+ def get_battle_log_filename(model_a: str, model_b: str) -> str:
82
+ """
83
+ Generate a consistent log filename for a model pair.
84
+
85
+ Args:
86
+ model_a: First model name
87
+ model_b: Second model name
88
+
89
+ Returns:
90
+ Filename in format "<sorted_model_a>_vs_<sorted_model_b>.jsonl"
91
+ """
92
+ first, second, _ = get_sorted_model_pair(model_a, model_b)
93
+ return f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
94
+
95
+
96
+ def iso_timestamp() -> str:
97
+ """
98
+ Generate an ISO 8601 timestamp string.
99
+
100
+ Returns:
101
+ ISO format timestamp string (e.g., "2026-01-16T10:30:00Z")
102
+ """
103
+ return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -0,0 +1,19 @@
1
+ """Validation module for GenArena submissions."""
2
+
3
+ from genarena.validation.schema import (
4
+ SUBMISSION_SCHEMA,
5
+ validate_submission_schema,
6
+ )
7
+ from genarena.validation.validator import (
8
+ validate_submission_file,
9
+ validate_submission_data,
10
+ ValidationReport,
11
+ )
12
+
13
+ __all__ = [
14
+ "SUBMISSION_SCHEMA",
15
+ "validate_submission_schema",
16
+ "validate_submission_file",
17
+ "validate_submission_data",
18
+ "ValidationReport",
19
+ ]
@@ -0,0 +1,327 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """
6
+ JSON Schema definition for GenArena submissions.
7
+
8
+ This schema defines the structure of submission metadata files
9
+ that are submitted via GitHub PR to the official leaderboard.
10
+ """
11
+
12
+ from typing import Any
13
+
14
+ # JSON Schema for submission metadata
15
+ SUBMISSION_SCHEMA: dict[str, Any] = {
16
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
17
+ "title": "GenArena Submission",
18
+ "description": "Metadata for a GenArena evaluation submission",
19
+ "type": "object",
20
+ "required": [
21
+ "schema_version",
22
+ "submission_id",
23
+ "created_at",
24
+ "submitter",
25
+ "experiment",
26
+ "data_location",
27
+ "elo_preview",
28
+ ],
29
+ "properties": {
30
+ "schema_version": {
31
+ "type": "string",
32
+ "description": "Schema version (e.g., '1.0')",
33
+ "pattern": "^\\d+\\.\\d+$",
34
+ },
35
+ "submission_id": {
36
+ "type": "string",
37
+ "description": "Unique submission identifier",
38
+ "pattern": "^sub_\\d{8}T\\d{6}_[a-f0-9]{8}$",
39
+ },
40
+ "created_at": {
41
+ "type": "string",
42
+ "description": "ISO 8601 timestamp of submission creation",
43
+ "format": "date-time",
44
+ },
45
+ "submitter": {
46
+ "type": "object",
47
+ "required": ["github_username"],
48
+ "properties": {
49
+ "github_username": {
50
+ "type": "string",
51
+ "description": "GitHub username of submitter",
52
+ "minLength": 1,
53
+ },
54
+ "contact": {
55
+ "type": "string",
56
+ "description": "Optional contact email",
57
+ "format": "email",
58
+ },
59
+ },
60
+ },
61
+ "experiment": {
62
+ "type": "object",
63
+ "required": [
64
+ "exp_name",
65
+ "subset",
66
+ "models",
67
+ "new_models",
68
+ "total_battles",
69
+ ],
70
+ "properties": {
71
+ "exp_name": {
72
+ "type": "string",
73
+ "description": "Experiment name (must end with _yyyymmdd)",
74
+ "pattern": "^.+_\\d{8}$",
75
+ },
76
+ "subset": {
77
+ "type": "string",
78
+ "description": "Subset name (e.g., 'basic')",
79
+ "minLength": 1,
80
+ },
81
+ "models": {
82
+ "type": "array",
83
+ "description": "List of all model names in the experiment",
84
+ "items": {"type": "string"},
85
+ "minItems": 2,
86
+ },
87
+ "new_models": {
88
+ "type": "array",
89
+ "description": "List of new model names (not in official leaderboard)",
90
+ "items": {"type": "string"},
91
+ "minItems": 1,
92
+ },
93
+ "existing_models": {
94
+ "type": "array",
95
+ "description": "List of existing model names (already in official)",
96
+ "items": {"type": "string"},
97
+ },
98
+ "model_pairs": {
99
+ "type": "array",
100
+ "description": "List of model pairs evaluated",
101
+ "items": {
102
+ "type": "array",
103
+ "items": {"type": "string"},
104
+ "minItems": 2,
105
+ "maxItems": 2,
106
+ },
107
+ },
108
+ "total_battles": {
109
+ "type": "integer",
110
+ "description": "Total number of battles",
111
+ "minimum": 1,
112
+ },
113
+ "battles_per_pair": {
114
+ "type": "object",
115
+ "description": "Battle count per model pair",
116
+ "additionalProperties": {"type": "integer"},
117
+ },
118
+ },
119
+ },
120
+ "data_location": {
121
+ "type": "object",
122
+ "required": ["hf_repo_id", "files"],
123
+ "properties": {
124
+ "hf_repo_id": {
125
+ "type": "string",
126
+ "description": "HuggingFace repository ID",
127
+ "pattern": "^[\\w.-]+/[\\w.-]+$",
128
+ },
129
+ "hf_revision": {
130
+ "type": "string",
131
+ "description": "HuggingFace revision/branch",
132
+ "default": "main",
133
+ },
134
+ "files": {
135
+ "type": "object",
136
+ "required": ["models_zip", "pk_logs_zip"],
137
+ "properties": {
138
+ "models_zip": {
139
+ "$ref": "#/$defs/file_info",
140
+ },
141
+ "pk_logs_zip": {
142
+ "$ref": "#/$defs/file_info",
143
+ },
144
+ },
145
+ },
146
+ },
147
+ },
148
+ "elo_preview": {
149
+ "type": "object",
150
+ "required": ["ratings"],
151
+ "properties": {
152
+ "ratings": {
153
+ "type": "object",
154
+ "description": "ELO ratings by model",
155
+ "additionalProperties": {"type": "number"},
156
+ },
157
+ "ci_95": {
158
+ "type": "object",
159
+ "description": "95% confidence intervals by model",
160
+ "additionalProperties": {
161
+ "type": "array",
162
+ "items": {"type": "number"},
163
+ "minItems": 2,
164
+ "maxItems": 2,
165
+ },
166
+ },
167
+ },
168
+ },
169
+ "evaluation_config": {
170
+ "type": "object",
171
+ "description": "Evaluation configuration used",
172
+ "properties": {
173
+ "judge_model": {
174
+ "type": "string",
175
+ "description": "VLM judge model name",
176
+ },
177
+ "prompt_module": {
178
+ "type": "string",
179
+ "description": "Prompt module name",
180
+ },
181
+ "temperature": {
182
+ "type": "number",
183
+ "description": "VLM temperature",
184
+ "minimum": 0,
185
+ },
186
+ "position_debiasing": {
187
+ "type": "boolean",
188
+ "description": "Whether position debiasing was used",
189
+ },
190
+ },
191
+ },
192
+ "title": {
193
+ "type": "string",
194
+ "description": "Submission title",
195
+ },
196
+ "description": {
197
+ "type": "string",
198
+ "description": "Submission description",
199
+ },
200
+ "verification": {
201
+ "type": "object",
202
+ "properties": {
203
+ "local_validation_passed": {
204
+ "type": "boolean",
205
+ "description": "Whether local validation passed",
206
+ },
207
+ "genarena_version": {
208
+ "type": "string",
209
+ "description": "genarena version used for submission",
210
+ },
211
+ },
212
+ },
213
+ },
214
+ "$defs": {
215
+ "file_info": {
216
+ "type": "object",
217
+ "required": ["path", "sha256", "size_bytes"],
218
+ "properties": {
219
+ "path": {
220
+ "type": "string",
221
+ "description": "File path in HF repo",
222
+ },
223
+ "sha256": {
224
+ "type": "string",
225
+ "description": "SHA256 checksum",
226
+ "pattern": "^[a-f0-9]{64}$",
227
+ },
228
+ "size_bytes": {
229
+ "type": "integer",
230
+ "description": "File size in bytes",
231
+ "minimum": 1,
232
+ },
233
+ },
234
+ },
235
+ },
236
+ }
237
+
238
+
239
+ def validate_submission_schema(submission: dict[str, Any]) -> tuple[bool, list[str]]:
240
+ """
241
+ Validate submission against JSON schema.
242
+
243
+ Args:
244
+ submission: Submission metadata dictionary
245
+
246
+ Returns:
247
+ Tuple of (is_valid, list of error messages)
248
+ """
249
+ try:
250
+ import jsonschema
251
+ except ImportError:
252
+ # If jsonschema is not available, do basic validation
253
+ return _basic_validation(submission)
254
+
255
+ errors: list[str] = []
256
+
257
+ try:
258
+ jsonschema.validate(instance=submission, schema=SUBMISSION_SCHEMA)
259
+ return True, []
260
+ except jsonschema.ValidationError as e:
261
+ errors.append(f"Schema validation error: {e.message}")
262
+ if e.path:
263
+ errors.append(f" at path: {'.'.join(str(p) for p in e.path)}")
264
+ return False, errors
265
+ except jsonschema.SchemaError as e:
266
+ errors.append(f"Schema error: {e.message}")
267
+ return False, errors
268
+
269
+
270
+ def _basic_validation(submission: dict[str, Any]) -> tuple[bool, list[str]]:
271
+ """Basic validation without jsonschema library."""
272
+ errors: list[str] = []
273
+
274
+ required_fields = [
275
+ "schema_version",
276
+ "submission_id",
277
+ "created_at",
278
+ "submitter",
279
+ "experiment",
280
+ "data_location",
281
+ "elo_preview",
282
+ ]
283
+
284
+ for field in required_fields:
285
+ if field not in submission:
286
+ errors.append(f"Missing required field: {field}")
287
+
288
+ if errors:
289
+ return False, errors
290
+
291
+ # Check submitter
292
+ if "github_username" not in submission.get("submitter", {}):
293
+ errors.append("Missing submitter.github_username")
294
+
295
+ # Check experiment
296
+ exp = submission.get("experiment", {})
297
+ exp_required = ["exp_name", "subset", "models", "new_models", "total_battles"]
298
+ for field in exp_required:
299
+ if field not in exp:
300
+ errors.append(f"Missing experiment.{field}")
301
+
302
+ # Check new_models is not empty
303
+ if not exp.get("new_models"):
304
+ errors.append("experiment.new_models must have at least one model")
305
+
306
+ # Check data_location
307
+ data_loc = submission.get("data_location", {})
308
+ if "hf_repo_id" not in data_loc:
309
+ errors.append("Missing data_location.hf_repo_id")
310
+ if "files" not in data_loc:
311
+ errors.append("Missing data_location.files")
312
+ else:
313
+ files = data_loc.get("files", {})
314
+ for zip_type in ["models_zip", "pk_logs_zip"]:
315
+ if zip_type not in files:
316
+ errors.append(f"Missing data_location.files.{zip_type}")
317
+ else:
318
+ file_info = files[zip_type]
319
+ for field in ["path", "sha256", "size_bytes"]:
320
+ if field not in file_info:
321
+ errors.append(f"Missing data_location.files.{zip_type}.{field}")
322
+
323
+ # Check elo_preview
324
+ if "ratings" not in submission.get("elo_preview", {}):
325
+ errors.append("Missing elo_preview.ratings")
326
+
327
+ return len(errors) == 0, errors