genarena-0.0.1-py3-none-any.whl → genarena-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genarena/__init__.py +49 -2
- genarena/__main__.py +10 -0
- genarena/arena.py +1685 -0
- genarena/battle.py +337 -0
- genarena/bt_elo.py +507 -0
- genarena/cli.py +1581 -0
- genarena/data.py +476 -0
- genarena/deploy/Dockerfile +25 -0
- genarena/deploy/README.md +55 -0
- genarena/deploy/__init__.py +5 -0
- genarena/deploy/app.py +84 -0
- genarena/experiments.py +121 -0
- genarena/leaderboard.py +270 -0
- genarena/logs.py +409 -0
- genarena/models.py +412 -0
- genarena/prompts/__init__.py +127 -0
- genarena/prompts/mmrb2.py +373 -0
- genarena/sampling.py +336 -0
- genarena/state.py +656 -0
- genarena/sync/__init__.py +105 -0
- genarena/sync/auto_commit.py +118 -0
- genarena/sync/deploy_ops.py +543 -0
- genarena/sync/git_ops.py +422 -0
- genarena/sync/hf_ops.py +891 -0
- genarena/sync/init_ops.py +431 -0
- genarena/sync/packer.py +587 -0
- genarena/sync/submit.py +837 -0
- genarena/utils.py +103 -0
- genarena/validation/__init__.py +19 -0
- genarena/validation/schema.py +327 -0
- genarena/validation/validator.py +329 -0
- genarena/visualize/README.md +148 -0
- genarena/visualize/__init__.py +14 -0
- genarena/visualize/app.py +938 -0
- genarena/visualize/data_loader.py +2335 -0
- genarena/visualize/static/app.js +3762 -0
- genarena/visualize/static/model_aliases.json +86 -0
- genarena/visualize/static/style.css +4104 -0
- genarena/visualize/templates/index.html +413 -0
- genarena/vlm.py +519 -0
- genarena-0.1.0.dist-info/METADATA +178 -0
- genarena-0.1.0.dist-info/RECORD +44 -0
- {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
- genarena-0.1.0.dist-info/entry_points.txt +2 -0
- genarena-0.0.1.dist-info/METADATA +0 -26
- genarena-0.0.1.dist-info/RECORD +0 -5
- genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/logs.py
ADDED
@@ -0,0 +1,409 @@
+# Copyright 2026 Ruihang Li.
+# Licensed under the Apache License, Version 2.0.
+# See LICENSE file in the project root for details.
+
+"""Logging module for battle results and audit trails."""
+
+import fcntl
+import json
+import os
+from typing import Any, Optional
+
+from genarena.battle import BattleResult
+from genarena.utils import ensure_dir, get_sorted_model_pair, iso_timestamp, sanitize_name
+
+
+class BattleLogger:
+    """
+    Logger for battle results (slim format for ELO calculation).
+
+    Stores minimal information needed for ELO scoring:
+    - model_a, model_b
+    - sample_index
+    - final_winner
+    - is_consistent
+    - timestamp
+
+    Thread-safe file writes using file locking.
+    """
+
+    def __init__(self, exp_dir: str):
+        """
+        Initialize the battle logger.
+
+        Args:
+            exp_dir: Experiment directory path (e.g., pk_logs/<exp_name>/)
+        """
+        self.exp_dir = exp_dir
+        ensure_dir(exp_dir)
+
+    def _get_log_path(self, model_a: str, model_b: str) -> str:
+        """
+        Get the log file path for a model pair.
+
+        Models are sorted alphabetically to ensure consistent file naming.
+
+        Args:
+            model_a: First model name
+            model_b: Second model name
+
+        Returns:
+            Path to the jsonl log file
+        """
+        first, second, _ = get_sorted_model_pair(model_a, model_b)
+        filename = f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
+        return os.path.join(self.exp_dir, filename)
+
+    def log(
+        self,
+        model_a: str,
+        model_b: str,
+        sample_index: int,
+        final_winner: str,
+        is_consistent: bool
+    ) -> None:
+        """
+        Log a battle result.
+
+        Thread-safe append to the model pair's log file.
+
+        Args:
+            model_a: First model name
+            model_b: Second model name
+            sample_index: Data sample index
+            final_winner: Winner ("model_a", "model_b", or "tie")
+            is_consistent: Whether both VLM calls agreed
+        """
+        log_path = self._get_log_path(model_a, model_b)
+
+        # Ensure winner uses consistent naming based on sorted order
+        first, second, swapped = get_sorted_model_pair(model_a, model_b)
+
+        # Convert winner to sorted model names
+        if final_winner == "model_a":
+            sorted_winner = second if swapped else first
+        elif final_winner == "model_b":
+            sorted_winner = first if swapped else second
+        else:
+            sorted_winner = "tie"
+
+        record = {
+            "model_a": first,
+            "model_b": second,
+            "sample_index": sample_index,
+            "final_winner": sorted_winner,
+            "is_consistent": is_consistent,
+            "timestamp": iso_timestamp()
+        }
+
+        # Thread-safe write with file locking
+        with open(log_path, "a", encoding="utf-8") as f:
+            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+            try:
+                f.write(json.dumps(record) + "\n")
+            finally:
+                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+    def log_battle_result(self, result: BattleResult) -> None:
+        """
+        Log a BattleResult object.
+
+        Args:
+            result: BattleResult from battle execution
+        """
+        self.log(
+            model_a=result.model_a,
+            model_b=result.model_b,
+            sample_index=result.sample_index,
+            final_winner=result.final_winner,
+            is_consistent=result.is_consistent
+        )
+
+
+class AuditLogger:
+    """
+    Audit logger for detailed VLM outputs.
+
+    Stores complete information for debugging and verification:
+    - Raw VLM responses
+    - Parsed results
+    - Parse success/error status
+    - Final outcome
+
+    Thread-safe file writes using file locking.
+    """
+
+    def __init__(self, exp_dir: str):
+        """
+        Initialize the audit logger.
+
+        Args:
+            exp_dir: Experiment directory path (e.g., pk_logs/<exp_name>/)
+        """
+        self.raw_outputs_dir = os.path.join(exp_dir, "raw_outputs")
+        ensure_dir(self.raw_outputs_dir)
+
+    def _get_log_path(self, model_a: str, model_b: str) -> str:
+        """
+        Get the audit log file path for a model pair.
+
+        Args:
+            model_a: First model name
+            model_b: Second model name
+
+        Returns:
+            Path to the jsonl audit log file
+        """
+        first, second, _ = get_sorted_model_pair(model_a, model_b)
+        filename = f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
+        return os.path.join(self.raw_outputs_dir, filename)
+
+    def log(
+        self,
+        model_a: str,
+        model_b: str,
+        sample_index: int,
+        original_call: dict[str, Any],
+        swapped_call: dict[str, Any],
+        final_winner: str,
+        is_consistent: bool
+    ) -> None:
+        """
+        Log detailed audit information.
+
+        Args:
+            model_a: First model name
+            model_b: Second model name
+            sample_index: Data sample index
+            original_call: Dict with raw_response, parsed_result, parse_success, parse_error
+            swapped_call: Dict with raw_response, parsed_result, parse_success, parse_error
+            final_winner: Winner ("model_a", "model_b", or "tie")
+            is_consistent: Whether both VLM calls agreed
+        """
+        log_path = self._get_log_path(model_a, model_b)
+
+        # Use sorted model names for consistency
+        first, second, swapped = get_sorted_model_pair(model_a, model_b)
+
+        # Convert winner to sorted model names
+        if final_winner == "model_a":
+            sorted_winner = second if swapped else first
+        elif final_winner == "model_b":
+            sorted_winner = first if swapped else second
+        else:
+            sorted_winner = "tie"
+
+        record = {
+            "model_a": first,
+            "model_b": second,
+            "sample_index": sample_index,
+            "timestamp": iso_timestamp(),
+            "original_call": original_call,
+            "swapped_call": swapped_call,
+            "final_winner": sorted_winner,
+            "is_consistent": is_consistent
+        }
+
+        # Thread-safe write with file locking
+        with open(log_path, "a", encoding="utf-8") as f:
+            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+            try:
+                f.write(json.dumps(record, ensure_ascii=False) + "\n")
+            finally:
+                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+    def log_battle_result(self, result: BattleResult) -> None:
+        """
+        Log a BattleResult object with full audit details.
+
+        Args:
+            result: BattleResult from battle execution
+        """
+        # Convert CallResult to dict
+        original_call = {
+            "raw_response": result.original_call.raw_response,
+            "parsed_result": result.original_call.parsed_result,
+            "parse_success": result.original_call.parse_success,
+            "parse_error": result.original_call.parse_error
+        }
+
+        swapped_call = {
+            "raw_response": result.swapped_call.raw_response,
+            "parsed_result": result.swapped_call.parsed_result,
+            "parse_success": result.swapped_call.parse_success,
+            "parse_error": result.swapped_call.parse_error
+        }
+
+        self.log(
+            model_a=result.model_a,
+            model_b=result.model_b,
+            sample_index=result.sample_index,
+            original_call=original_call,
+            swapped_call=swapped_call,
+            final_winner=result.final_winner,
+            is_consistent=result.is_consistent
+        )
+
+
+def load_battle_history(pk_logs_dir: str) -> set[tuple[str, str, int]]:
+    """
+    Load completed battle records from all experiment directories.
+
+    Used for checkpoint/resume functionality to skip already-completed battles.
+
+    Args:
+        pk_logs_dir: Path to pk_logs directory
+
+    Returns:
+        Set of (model_a, model_b, sample_index) tuples representing completed battles
+        (model names are in sorted order)
+    """
+    completed: set[tuple[str, str, int]] = set()
+
+    if not os.path.isdir(pk_logs_dir):
+        return completed
+
+    # Iterate over all experiment directories
+    for exp_name in os.listdir(pk_logs_dir):
+        exp_dir = os.path.join(pk_logs_dir, exp_name)
+        if not os.path.isdir(exp_dir):
+            continue
+
+        # Iterate over all jsonl files (excluding raw_outputs subdirectory)
+        for filename in os.listdir(exp_dir):
+            if not filename.endswith(".jsonl"):
+                continue
+
+            filepath = os.path.join(exp_dir, filename)
+            if not os.path.isfile(filepath):
+                continue
+
+            # Read and parse each line
+            try:
+                with open(filepath, "r", encoding="utf-8") as f:
+                    for line in f:
+                        line = line.strip()
+                        if not line:
+                            continue
+                        try:
+                            record = json.loads(line)
+                            model_a = record.get("model_a", "")
+                            model_b = record.get("model_b", "")
+                            sample_index = record.get("sample_index", -1)
+
+                            if model_a and model_b and sample_index >= 0:
+                                # Models should already be sorted in the log
+                                completed.add((model_a, model_b, sample_index))
+                        except json.JSONDecodeError:
+                            continue
+            except Exception:
+                continue
+
+    return completed
+
+
+def load_battle_records(pk_logs_dir: str, exp_name: Optional[str] = None):
+    """
+    Load battle records from log files.
+
+    Args:
+        pk_logs_dir: Path to pk_logs directory
+        exp_name: Specific experiment name (optional, loads all if None)
+
+    Returns:
+        List of battle record dicts
+    """
+    records = []
+
+    if not os.path.isdir(pk_logs_dir):
+        return records
+
+    # Determine which directories to scan
+    if exp_name:
+        exp_dirs = [os.path.join(pk_logs_dir, exp_name)]
+    else:
+        exp_dirs = [
+            os.path.join(pk_logs_dir, name)
+            for name in os.listdir(pk_logs_dir)
+            if os.path.isdir(os.path.join(pk_logs_dir, name))
+        ]
+
+    for exp_dir in exp_dirs:
+        if not os.path.isdir(exp_dir):
+            continue
+
+        for filename in os.listdir(exp_dir):
+            if not filename.endswith(".jsonl"):
+                continue
+
+            filepath = os.path.join(exp_dir, filename)
+            if not os.path.isfile(filepath):
+                continue
+
+            try:
+                with open(filepath, "r", encoding="utf-8") as f:
+                    for line in f:
+                        line = line.strip()
+                        if not line:
+                            continue
+                        try:
+                            record = json.loads(line)
+                            records.append(record)
+                        except json.JSONDecodeError:
+                            continue
+            except Exception:
+                continue
+
+    return records
+
+
+def count_battles_per_pair(pk_logs_dir: str) -> dict[tuple[str, str], int]:
+    """
+    Count the number of completed battles for each model pair.
+
+    Args:
+        pk_logs_dir: Path to pk_logs directory
+
+    Returns:
+        Dict mapping (model_a, model_b) tuples (sorted) to battle count
+    """
+    counts: dict[tuple[str, str], int] = {}
+
+    if not os.path.isdir(pk_logs_dir):
+        return counts
+
+    # Iterate over all experiment directories
+    for exp_name in os.listdir(pk_logs_dir):
+        exp_dir = os.path.join(pk_logs_dir, exp_name)
+        if not os.path.isdir(exp_dir):
+            continue
+
+        for filename in os.listdir(exp_dir):
+            if not filename.endswith(".jsonl"):
+                continue
+
+            filepath = os.path.join(exp_dir, filename)
+            if not os.path.isfile(filepath):
+                continue
+
+            try:
+                with open(filepath, "r", encoding="utf-8") as f:
+                    for line in f:
+                        line = line.strip()
+                        if not line:
+                            continue
+                        try:
+                            record = json.loads(line)
+                            model_a = record.get("model_a", "")
+                            model_b = record.get("model_b", "")
+
+                            if model_a and model_b:
+                                # Ensure sorted order
+                                key = (min(model_a, model_b), max(model_a, model_b))
+                                counts[key] = counts.get(key, 0) + 1
+                        except json.JSONDecodeError:
+                            continue
+            except Exception:
+                continue
+
+    return counts
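Usage note (illustrative only, not part of the packaged file): the sketch below exercises the public surface added in genarena/logs.py using just the signatures shown above. The pk_logs/demo_exp directory, model names, and audit payload values are placeholder assumptions; the loggers sort the model pair and rewrite the winner label themselves, so callers may pass the models in either order.

# Hypothetical usage sketch; paths, model names, and payload values are placeholders.
from genarena.logs import (
    BattleLogger,
    AuditLogger,
    load_battle_history,
    load_battle_records,
    count_battles_per_pair,
)

exp_dir = "pk_logs/demo_exp"          # pk_logs/<exp_name>/, as in the docstrings
battles = BattleLogger(exp_dir)       # slim records used for ELO scoring
audits = AuditLogger(exp_dir)         # full records under pk_logs/demo_exp/raw_outputs/

# One slim battle record; the winner is relative to the arguments as passed,
# and the logger rewrites it in terms of the alphabetically sorted pair.
battles.log(
    model_a="model-y",
    model_b="model-x",
    sample_index=42,
    final_winner="model_a",           # "model_a", "model_b", or "tie"
    is_consistent=True,
)

# Matching audit record with both judge calls (original and swapped order).
call = {"raw_response": "...", "parsed_result": "A", "parse_success": True, "parse_error": None}
audits.log(
    model_a="model-y",
    model_b="model-x",
    sample_index=42,
    original_call=call,
    swapped_call=call,
    final_winner="model_a",
    is_consistent=True,
)

# Checkpoint/resume and reporting helpers scan every experiment under pk_logs/.
done = load_battle_history("pk_logs")           # {(model_a, model_b, sample_index), ...} with sorted names
if ("model-x", "model-y", 42) in done:
    print("battle already logged")

records = load_battle_records("pk_logs", exp_name="demo_exp")   # list of raw record dicts
per_pair = count_battles_per_pair("pk_logs")                    # counts keyed by sorted (model_a, model_b)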