genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/logs.py ADDED
@@ -0,0 +1,409 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """Logging module for battle results and audit trails."""
6
+
7
+ import fcntl
8
+ import json
9
+ import os
10
+ from typing import Any, Optional
11
+
12
+ from genarena.battle import BattleResult
13
+ from genarena.utils import ensure_dir, get_sorted_model_pair, iso_timestamp, sanitize_name
14
+
15
+
16
class BattleLogger:
    """
    Logger for battle results (slim format for ELO calculation).

    Stores minimal information needed for ELO scoring:
    - model_a, model_b
    - sample_index
    - final_winner
    - is_consistent
    - timestamp

    Thread-safe file writes using file locking.
    """

    def __init__(self, exp_dir: str):
        """
        Initialize the battle logger.

        Args:
            exp_dir: Experiment directory path (e.g., pk_logs/<exp_name>/)
        """
        self.exp_dir = exp_dir
        ensure_dir(exp_dir)

    def _get_log_path(self, model_a: str, model_b: str) -> str:
        """
        Get the log file path for a model pair.

        Models are sorted alphabetically to ensure consistent file naming,
        so (A, B) and (B, A) battles land in the same file.

        Args:
            model_a: First model name
            model_b: Second model name

        Returns:
            Path to the jsonl log file
        """
        first, second, _ = get_sorted_model_pair(model_a, model_b)
        filename = f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
        return os.path.join(self.exp_dir, filename)

    def log(
        self,
        model_a: str,
        model_b: str,
        sample_index: int,
        final_winner: str,
        is_consistent: bool
    ) -> None:
        """
        Log a battle result.

        Thread-safe append to the model pair's log file.

        Args:
            model_a: First model name
            model_b: Second model name
            sample_index: Data sample index
            final_winner: Winner ("model_a", "model_b", or "tie")
            is_consistent: Whether both VLM calls agreed
        """
        log_path = self._get_log_path(model_a, model_b)

        # Ensure winner uses consistent naming based on sorted order
        first, second, swapped = get_sorted_model_pair(model_a, model_b)

        # Convert winner to sorted model names; anything other than
        # "model_a"/"model_b" is recorded as a tie.
        if final_winner == "model_a":
            sorted_winner = second if swapped else first
        elif final_winner == "model_b":
            sorted_winner = first if swapped else second
        else:
            sorted_winner = "tie"

        record = {
            "model_a": first,
            "model_b": second,
            "sample_index": sample_index,
            "final_winner": sorted_winner,
            "is_consistent": is_consistent,
            "timestamp": iso_timestamp()
        }

        # Thread-safe write with file locking.
        # FIX: ensure_ascii=False keeps non-ASCII model names readable and is
        # consistent with the serialization used by AuditLogger.log.
        with open(log_path, "a", encoding="utf-8") as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
            try:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
            finally:
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)

    def log_battle_result(self, result: BattleResult) -> None:
        """
        Log a BattleResult object.

        Args:
            result: BattleResult from battle execution
        """
        self.log(
            model_a=result.model_a,
            model_b=result.model_b,
            sample_index=result.sample_index,
            final_winner=result.final_winner,
            is_consistent=result.is_consistent
        )
121
+
122
+
123
class AuditLogger:
    """
    Audit logger for detailed VLM outputs.

    Stores complete information for debugging and verification:
    - Raw VLM responses
    - Parsed results
    - Parse success/error status
    - Final outcome

    Thread-safe file writes using file locking.
    """

    def __init__(self, exp_dir: str):
        """
        Initialize the audit logger.

        Args:
            exp_dir: Experiment directory path (e.g., pk_logs/<exp_name>/)
        """
        self.raw_outputs_dir = os.path.join(exp_dir, "raw_outputs")
        ensure_dir(self.raw_outputs_dir)

    def _get_log_path(self, model_a: str, model_b: str) -> str:
        """
        Get the audit log file path for a model pair.

        Args:
            model_a: First model name
            model_b: Second model name

        Returns:
            Path to the jsonl audit log file
        """
        first, second, _ = get_sorted_model_pair(model_a, model_b)
        stem = f"{sanitize_name(first)}_vs_{sanitize_name(second)}"
        return os.path.join(self.raw_outputs_dir, stem + ".jsonl")

    def log(
        self,
        model_a: str,
        model_b: str,
        sample_index: int,
        original_call: dict[str, Any],
        swapped_call: dict[str, Any],
        final_winner: str,
        is_consistent: bool
    ) -> None:
        """
        Log detailed audit information.

        Args:
            model_a: First model name
            model_b: Second model name
            sample_index: Data sample index
            original_call: Dict with raw_response, parsed_result, parse_success, parse_error
            swapped_call: Dict with raw_response, parsed_result, parse_success, parse_error
            final_winner: Winner ("model_a", "model_b", or "tie")
            is_consistent: Whether both VLM calls agreed
        """
        target = self._get_log_path(model_a, model_b)

        # Canonicalize the pair so the winner is recorded under its sorted name.
        first, second, swapped = get_sorted_model_pair(model_a, model_b)

        # Anything other than "model_a"/"model_b" is recorded as a tie.
        if final_winner == "model_a":
            canonical_winner = first if not swapped else second
        elif final_winner == "model_b":
            canonical_winner = second if not swapped else first
        else:
            canonical_winner = "tie"

        record = {
            "model_a": first,
            "model_b": second,
            "sample_index": sample_index,
            "timestamp": iso_timestamp(),
            "original_call": original_call,
            "swapped_call": swapped_call,
            "final_winner": canonical_winner,
            "is_consistent": is_consistent
        }
        payload = json.dumps(record, ensure_ascii=False) + "\n"

        # Exclusive lock around the append keeps concurrent writers from
        # interleaving lines in the same file.
        with open(target, "a", encoding="utf-8") as handle:
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
            try:
                handle.write(payload)
            finally:
                fcntl.flock(handle.fileno(), fcntl.LOCK_UN)

    def log_battle_result(self, result: BattleResult) -> None:
        """
        Log a BattleResult object with full audit details.

        Args:
            result: BattleResult from battle execution
        """
        def call_as_dict(call) -> dict[str, Any]:
            # Flatten a CallResult into the audit record layout.
            return {
                "raw_response": call.raw_response,
                "parsed_result": call.parsed_result,
                "parse_success": call.parse_success,
                "parse_error": call.parse_error
            }

        self.log(
            model_a=result.model_a,
            model_b=result.model_b,
            sample_index=result.sample_index,
            original_call=call_as_dict(result.original_call),
            swapped_call=call_as_dict(result.swapped_call),
            final_winner=result.final_winner,
            is_consistent=result.is_consistent
        )
246
+
247
+
248
def load_battle_history(pk_logs_dir: str) -> set[tuple[str, str, int]]:
    """
    Load completed battle records from all experiment directories.

    Used for checkpoint/resume functionality to skip already-completed battles.

    Args:
        pk_logs_dir: Path to pk_logs directory

    Returns:
        Set of (model_a, model_b, sample_index) tuples representing completed battles
        (model names are in sorted order)
    """
    completed: set[tuple[str, str, int]] = set()

    if not os.path.isdir(pk_logs_dir):
        return completed

    # Iterate over all experiment directories
    for exp_name in os.listdir(pk_logs_dir):
        exp_dir = os.path.join(pk_logs_dir, exp_name)
        if not os.path.isdir(exp_dir):
            continue

        # Iterate over all jsonl files (the raw_outputs subdirectory is a
        # directory, so the isfile check below skips it)
        for filename in os.listdir(exp_dir):
            if not filename.endswith(".jsonl"):
                continue

            filepath = os.path.join(exp_dir, filename)
            if not os.path.isfile(filepath):
                continue

            # Read and parse each line; a broken file is skipped, a broken
            # line is skipped individually.
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            record = json.loads(line)
                        except json.JSONDecodeError:
                            continue
                        if not isinstance(record, dict):
                            continue

                        model_a = record.get("model_a", "")
                        model_b = record.get("model_b", "")
                        sample_index = record.get("sample_index", -1)

                        # FIX: validate field types per record. Previously a
                        # non-int sample_index raised TypeError on the >= 0
                        # comparison, which escaped the inner except and made
                        # the outer handler drop the REST of the file,
                        # silently losing valid records.
                        if (
                            isinstance(model_a, str) and model_a
                            and isinstance(model_b, str) and model_b
                            and isinstance(sample_index, int)
                            and not isinstance(sample_index, bool)
                            and sample_index >= 0
                        ):
                            # Models should already be sorted in the log
                            completed.add((model_a, model_b, sample_index))
            except OSError:
                # File vanished or is unreadable: best-effort skip.
                continue

    return completed
303
+
304
+
305
def load_battle_records(pk_logs_dir: str, exp_name: Optional[str] = None):
    """
    Load battle records from log files.

    Args:
        pk_logs_dir: Path to pk_logs directory
        exp_name: Specific experiment name (optional, loads all if None)

    Returns:
        List of battle record dicts
    """
    records = []

    if not os.path.isdir(pk_logs_dir):
        return records

    # Build the list of experiment directories to scan.
    if exp_name:
        candidate_dirs = [os.path.join(pk_logs_dir, exp_name)]
    else:
        candidate_dirs = []
        for entry in os.listdir(pk_logs_dir):
            full = os.path.join(pk_logs_dir, entry)
            if os.path.isdir(full):
                candidate_dirs.append(full)

    for directory in candidate_dirs:
        if not os.path.isdir(directory):
            continue

        for entry in os.listdir(directory):
            if not entry.endswith(".jsonl"):
                continue

            log_file = os.path.join(directory, entry)
            if not os.path.isfile(log_file):
                continue

            # Best-effort read: skip unreadable files and malformed lines.
            try:
                with open(log_file, "r", encoding="utf-8") as handle:
                    for raw_line in handle:
                        stripped = raw_line.strip()
                        if not stripped:
                            continue
                        try:
                            records.append(json.loads(stripped))
                        except json.JSONDecodeError:
                            continue
            except Exception:
                continue

    return records
358
+
359
+
360
def count_battles_per_pair(pk_logs_dir: str) -> dict[tuple[str, str], int]:
    """
    Count the number of completed battles for each model pair.

    Args:
        pk_logs_dir: Path to pk_logs directory

    Returns:
        Dict mapping (model_a, model_b) tuples (sorted) to battle count
    """
    tallies: dict[tuple[str, str], int] = {}

    if not os.path.isdir(pk_logs_dir):
        return tallies

    # Walk every experiment directory under pk_logs.
    for entry in os.listdir(pk_logs_dir):
        exp_path = os.path.join(pk_logs_dir, entry)
        if not os.path.isdir(exp_path):
            continue

        for fname in os.listdir(exp_path):
            if not fname.endswith(".jsonl"):
                continue

            fpath = os.path.join(exp_path, fname)
            if not os.path.isfile(fpath):
                continue

            # Best-effort read: an unreadable file is skipped entirely,
            # a malformed line is skipped on its own.
            try:
                with open(fpath, "r", encoding="utf-8") as handle:
                    for raw in handle:
                        text = raw.strip()
                        if not text:
                            continue
                        try:
                            rec = json.loads(text)
                        except json.JSONDecodeError:
                            continue

                        name_a = rec.get("model_a", "")
                        name_b = rec.get("model_b", "")
                        if not (name_a and name_b):
                            continue

                        # Canonical sorted ordering for the pair key.
                        pair = (name_a, name_b) if name_a <= name_b else (name_b, name_a)
                        tallies[pair] = tallies.get(pair, 0) + 1
            except Exception:
                continue

    return tallies