genarena 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +25 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2335 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.0.dist-info/METADATA +178 -0
  42. genarena-0.1.0.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.0.dist-info}/WHEEL +1 -2
  44. genarena-0.1.0.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,938 @@
1
+ # Copyright 2026 Ruihang Li.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See LICENSE file in the project root for details.
4
+
5
+ """Flask application for arena visualization."""
6
+
7
+ import io
8
+ import os
9
+
10
+ from flask import Flask, jsonify, render_template, request, send_file, abort, redirect
11
+
12
+ from genarena.visualize.data_loader import ArenaDataLoader
13
+
14
+
15
# File extension -> Content-Type for model output images served from disk.
_MODEL_IMAGE_MIME_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}


def _split_csv(raw):
    """Split a comma-separated query value into trimmed, non-empty items.

    Returns None when the parameter is missing or empty, so callers can tell
    "no filter" apart from an explicit (possibly empty) list.
    """
    if not raw:
        return None
    return [item.strip() for item in raw.split(",") if item.strip()]


def _parse_bool_filter(raw):
    """Map the literal strings "true"/"false" to booleans; anything else is None."""
    if raw == "true":
        return True
    if raw == "false":
        return False
    return None


def _total_pages(total, page_size):
    """Ceiling division of total by page_size (page_size must be >= 1)."""
    return (total + page_size - 1) // page_size


def _sniff_image_mimetype(data, default="image/png"):
    """Guess an image Content-Type from the leading magic bytes of data."""
    try:
        head = bytes(data[:12])
    except (TypeError, ValueError):
        return default
    if head.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if head.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return "image/webp"
    if head.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    return default


def create_app(arena_dir: str, data_dir: str) -> Flask:
    """
    Create and configure the Flask application.

    Registers the index page, the JSON API under ``/api/...`` and the image
    routes under ``/images/...``. All routes read through a single
    ``ArenaDataLoader`` built from the two directories.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory

    Returns:
        Configured Flask app
    """
    # Templates and static assets live next to this file.
    app_dir = os.path.dirname(os.path.abspath(__file__))

    app = Flask(
        __name__,
        template_folder=os.path.join(app_dir, "templates"),
        static_folder=os.path.join(app_dir, "static"),
    )

    # Store paths in config
    app.config["ARENA_DIR"] = arena_dir
    app.config["DATA_DIR"] = data_dir

    # Create data loader
    data_loader = ArenaDataLoader(arena_dir, data_dir)

    # ========== Request helpers ==========

    def _csv_arg(name: str = "models"):
        """Read a comma-separated query parameter as a list (None if absent)."""
        return _split_csv(request.args.get(name, None, type=str))

    def _pagination(default_page_size: int):
        """Read page/page_size from the query string, clamped to >= 1.

        Clamping keeps a request such as ``page_size=0`` from triggering a
        division by zero when the total page count is computed.
        """
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", default_page_size, type=int)
        return max(page, 1), max(page_size, 1)

    # ========== Page Routes ==========

    @app.route("/")
    def index():
        """Main page."""
        return render_template("index.html")

    # ========== API Routes ==========

    @app.route("/api/subsets")
    def api_subsets():
        """Get list of available subsets."""
        subsets = data_loader.discover_subsets()
        return jsonify({"subsets": subsets})

    @app.route("/api/subsets/<subset>/info")
    def api_subset_info(subset: str):
        """Get information about a subset."""
        info = data_loader.get_subset_info(subset)
        if not info:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify({
            "name": info.name,
            "models": info.models,
            "experiments": info.experiments,
            "total_battles": info.total_battles,
            "min_input_images": info.min_input_images,
            "max_input_images": info.max_input_images,
            "prompt_sources": info.prompt_sources,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
    def api_battles(subset: str, exp_name: str):
        """Get paginated battle records."""
        page, page_size = _pagination(20)
        result_filter = request.args.get("result", None, type=str)
        consistency = request.args.get("consistent", None, type=str)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        records, total = data_loader.get_battles(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            models=_csv_arg("models"),
            result_filter=result_filter,
            consistency_filter=_parse_bool_filter(consistency),
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": _total_pages(total, page_size),
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
    def api_battle_detail(subset: str, exp_name: str, battle_id: str):
        """Get detailed battle record."""
        # battle_id format: model_a_vs_model_b:sample_index
        try:
            model_part, index_part = battle_id.rsplit(":", 1)
            sample_index = int(index_part)
        except ValueError:
            # Raised both when ":" is missing and when the index is not an int.
            return jsonify({"error": "Invalid battle_id format"}), 400

        if "_vs_" not in model_part:
            return jsonify({"error": "Invalid battle_id format"}), 400

        names = model_part.split("_vs_")
        model_a, model_b = names[0], names[1]

        record = data_loader.get_battle_detail(
            subset, exp_name, model_a, model_b, sample_index
        )

        if not record:
            return jsonify({"error": "Battle not found"}), 404

        return jsonify(record.to_detail_dict())

    @app.route("/api/subsets/<subset>/stats")
    def api_stats(subset: str):
        """Get statistics for a subset."""
        exp_name = request.args.get("exp_name", None, type=str)
        stats = data_loader.get_stats(subset, exp_name)

        if not stats:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/leaderboard")
    def api_elo_leaderboard(subset: str):
        """Get ELO leaderboard for a subset."""
        leaderboard = data_loader.get_elo_leaderboard(subset, _csv_arg("models"))
        return jsonify({"leaderboard": leaderboard})

    @app.route("/api/subsets/<subset>/models/<path:model>/stats")
    def api_model_stats(subset: str, model: str):
        """Get detailed statistics for a specific model including win rates against all opponents."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        stats = data_loader.get_model_vs_stats(subset, model, exp_name)

        if not stats:
            return jsonify({"error": "Model not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
    def api_head_to_head(subset: str, exp_name: str):
        """Get head-to-head statistics between two models."""
        model_a = request.args.get("model_a", None, type=str)
        model_b = request.args.get("model_b", None, type=str)

        if not model_a or not model_b:
            return jsonify({"error": "model_a and model_b are required"}), 400

        h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
        return jsonify(h2h)

    @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
    def api_input_image_count(subset: str, sample_index: int):
        """Get the number of input images for a sample."""
        count = data_loader.get_input_image_count(subset, sample_index)
        return jsonify({"count": count})

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
    def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
        """Get all model outputs for a specific sample, sorted by win rate."""
        filter_models = _csv_arg("models")

        # stats_scope: 'filtered' = only count battles between filtered models
        #              'all' = count all battles (but show only filtered models)
        stats_scope = request.args.get("stats_scope", "filtered", type=str)

        result = data_loader.get_sample_all_models(
            subset, exp_name, sample_index, filter_models, stats_scope
        )

        if not result:
            return jsonify({"error": "Sample not found"}), 404

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
    def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
        """Get all battle records for a specific model on a specific sample."""
        result = data_loader.get_model_battles_for_sample(
            subset=subset,
            exp_name=exp_name,
            sample_index=sample_index,
            model=model,
            opponent_models=_csv_arg("opponents"),
        )

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
    def api_prompts(subset: str, exp_name: str):
        """Get paginated list of prompts/samples with all model outputs."""
        page, page_size = _pagination(10)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        prompts, total = data_loader.get_prompts(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
            filter_models=_csv_arg("models"),
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": _total_pages(total, page_size),
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
    def api_search(subset: str, exp_name: str):
        """Search battles by text query (full-text search across instruction, task_type, prompt_source, metadata)."""
        query = request.args.get("q", "", type=str)
        page, page_size = _pagination(20)
        consistency = request.args.get("consistent", None, type=str)

        records, total = data_loader.search_battles(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            models=_csv_arg("models"),
            consistency_filter=_parse_bool_filter(consistency),
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": _total_pages(total, page_size),
            "query": query,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
    def api_search_prompts(subset: str, exp_name: str):
        """Search prompts by text query."""
        query = request.args.get("q", "", type=str)
        page, page_size = _pagination(10)

        prompts, total = data_loader.search_prompts(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            filter_models=_csv_arg("models"),
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": _total_pages(total, page_size),
            "query": query,
        })

    @app.route("/api/subsets/<subset>/matrix")
    def api_win_rate_matrix(subset: str):
        """Get win rate matrix for all model pairs."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        result = data_loader.get_win_rate_matrix(subset, exp_name, _csv_arg("models"))
        return jsonify(result)

    @app.route("/api/subsets/<subset>/leaderboard/by-source")
    def api_elo_by_source(subset: str):
        """Get ELO rankings grouped by prompt source."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        result = data_loader.get_elo_by_source(subset, exp_name)
        return jsonify(result)

    @app.route("/api/subsets/<subset>/elo-history")
    def api_elo_history(subset: str):
        """Get ELO history over time."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        granularity = request.args.get("granularity", "day", type=str)

        result = data_loader.get_elo_history(
            subset, exp_name, granularity, _csv_arg("models")
        )
        return jsonify(result)

    @app.route("/api/overview/leaderboards")
    def api_overview_leaderboards():
        """Get leaderboard data for all subsets (for Overview page)."""
        result = data_loader.get_all_subsets_leaderboards()
        return jsonify(result)

    @app.route("/api/cross-subset/info")
    def api_cross_subset_info():
        """Get information about models across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        # A value made only of separators (e.g. ",,") yields an empty list.
        subsets = _split_csv(subsets_param)
        if not subsets:
            return jsonify({"error": "At least 1 subset required"}), 400

        result = data_loader.get_cross_subset_info(subsets)
        return jsonify(result)

    @app.route("/api/cross-subset/elo")
    def api_cross_subset_elo():
        """Compute ELO rankings across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        # A value made only of separators (e.g. ",,") yields an empty list.
        subsets = _split_csv(subsets_param)
        if not subsets:
            return jsonify({"error": "At least 1 subset required"}), 400

        exp_name = request.args.get("exp_name", "__all__", type=str)
        model_scope = request.args.get("model_scope", "all", type=str)

        result = data_loader.get_cross_subset_elo(subsets, exp_name, model_scope)
        return jsonify(result)

    # ========== Image Routes ==========

    @app.route("/images/<subset>/<model>/<int:sample_index>")
    def serve_model_image(subset: str, model: str, sample_index: int):
        """Serve model output image."""
        image_path = data_loader.get_image_path(subset, model, sample_index)

        if not image_path or not os.path.isfile(image_path):
            abort(404)

        # Pick the Content-Type from the file extension; unknown -> PNG.
        ext = os.path.splitext(image_path)[1].lower()
        mimetype = _MODEL_IMAGE_MIME_TYPES.get(ext, "image/png")

        return send_file(
            image_path,
            mimetype=mimetype,
            max_age=3600,  # Cache for 1 hour
        )

    @app.route("/images/<subset>/input/<int:sample_index>")
    @app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
    def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
        """Serve input image from parquet dataset. Supports multiple images via img_idx."""
        image_bytes = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)

        if not image_bytes:
            abort(404)

        return send_file(
            io.BytesIO(image_bytes),
            # Label the payload by its signature instead of assuming PNG.
            mimetype=_sniff_image_mimetype(image_bytes),
            max_age=3600,
        )

    return app
467
+
468
+
469
def run_server(
    arena_dir: str,
    data_dir: str,
    host: str = "0.0.0.0",
    port: int = 8080,
    debug: bool = False,
):
    """
    Start the visualization server and block until it exits.

    Configures INFO-level logging, prints a start-up banner, builds the
    Flask app via ``create_app`` and then serves it with threading enabled.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory
        host: Host to bind to
        port: Port to listen on
        debug: Enable debug mode
    """
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S"
    )

    # Horizontal rule shared by both banners.
    rule = "=" * 60

    startup_banner = (
        f"\n{rule}",
        " GenArena Arena Visualizer",
        rule,
        f" Arena Dir: {arena_dir}",
        f" Data Dir: {data_dir}",
        rule,
        " Preloading data (this may take a while)...",
        f"{rule}\n",
    )
    for line in startup_banner:
        print(line)

    # Building the app constructs the data loader before the server starts.
    app = create_app(arena_dir, data_dir)

    ready_banner = (
        f"\n{rule}",
        f" Server ready: http://{host}:{port}",
        f"{rule}\n",
    )
    for line in ready_banner:
        print(line)

    app.run(host=host, port=port, debug=debug, threaded=True)
509
+
510
+
511
+ def create_hf_app(
512
+ arena_dir: str,
513
+ data_dir: str,
514
+ hf_repo: str,
515
+ image_files: list[str],
516
+ ) -> Flask:
517
+ """
518
+ Create Flask app for HuggingFace Spaces deployment.
519
+
520
+ This version uses HF CDN URLs for model output images instead of
521
+ serving them from local filesystem.
522
+
523
+ Args:
524
+ arena_dir: Path to arena directory (metadata only, no images)
525
+ data_dir: Path to data directory containing parquet files
526
+ hf_repo: HuggingFace repo ID for image CDN URLs
527
+ image_files: List of image file paths in the HF repo
528
+
529
+ Returns:
530
+ Configured Flask app for HF Spaces
531
+ """
532
+ from genarena.visualize.data_loader import HFArenaDataLoader
533
+
534
+ # Get the directory containing this file for templates/static
535
+ app_dir = os.path.dirname(os.path.abspath(__file__))
536
+
537
+ app = Flask(
538
+ __name__,
539
+ template_folder=os.path.join(app_dir, "templates"),
540
+ static_folder=os.path.join(app_dir, "static"),
541
+ )
542
+
543
+ # Store config
544
+ app.config["ARENA_DIR"] = arena_dir
545
+ app.config["DATA_DIR"] = data_dir
546
+ app.config["USE_HF_CDN"] = True
547
+ app.config["HF_REPO"] = hf_repo
548
+
549
+ # Create HF data loader
550
+ data_loader = HFArenaDataLoader(arena_dir, data_dir, hf_repo, image_files)
551
+
552
+ # ========== Page Routes ==========
553
+
554
+ @app.route("/")
555
+ def index():
556
+ """Main page."""
557
+ return render_template("index.html")
558
+
559
+ # ========== API Routes ==========
560
+ # Copy all API routes from create_app - they work the same way
561
+
562
+ @app.route("/api/subsets")
563
+ def api_subsets():
564
+ """Get list of available subsets."""
565
+ subsets = data_loader.discover_subsets()
566
+ return jsonify({"subsets": subsets})
567
+
568
+ @app.route("/api/subsets/<subset>/info")
569
+ def api_subset_info(subset: str):
570
+ """Get information about a subset."""
571
+ info = data_loader.get_subset_info(subset)
572
+ if not info:
573
+ return jsonify({"error": "Subset not found"}), 404
574
+
575
+ return jsonify({
576
+ "name": info.name,
577
+ "models": info.models,
578
+ "experiments": info.experiments,
579
+ "total_battles": info.total_battles,
580
+ "min_input_images": info.min_input_images,
581
+ "max_input_images": info.max_input_images,
582
+ "prompt_sources": info.prompt_sources,
583
+ })
584
+
585
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
586
+ def api_battles(subset: str, exp_name: str):
587
+ """Get paginated battle records."""
588
+ page = request.args.get("page", 1, type=int)
589
+ page_size = request.args.get("page_size", 20, type=int)
590
+ result_filter = request.args.get("result", None, type=str)
591
+ consistency = request.args.get("consistent", None, type=str)
592
+ min_images = request.args.get("min_images", None, type=int)
593
+ max_images = request.args.get("max_images", None, type=int)
594
+ prompt_source = request.args.get("prompt_source", None, type=str)
595
+
596
+ models_param = request.args.get("models", None, type=str)
597
+ models = None
598
+ if models_param:
599
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
600
+
601
+ consistency_filter = None
602
+ if consistency == "true":
603
+ consistency_filter = True
604
+ elif consistency == "false":
605
+ consistency_filter = False
606
+
607
+ records, total = data_loader.get_battles(
608
+ subset=subset,
609
+ exp_name=exp_name,
610
+ page=page,
611
+ page_size=page_size,
612
+ models=models,
613
+ result_filter=result_filter,
614
+ consistency_filter=consistency_filter,
615
+ min_images=min_images,
616
+ max_images=max_images,
617
+ prompt_source=prompt_source,
618
+ )
619
+
620
+ return jsonify({
621
+ "battles": [r.to_dict() for r in records],
622
+ "total": total,
623
+ "page": page,
624
+ "page_size": page_size,
625
+ "total_pages": (total + page_size - 1) // page_size,
626
+ })
627
+
628
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
629
+ def api_battle_detail(subset: str, exp_name: str, battle_id: str):
630
+ """Get detailed battle record."""
631
+ try:
632
+ parts = battle_id.rsplit(":", 1)
633
+ sample_index = int(parts[1])
634
+ model_part = parts[0]
635
+
636
+ if "_vs_" in model_part:
637
+ models = model_part.split("_vs_")
638
+ model_a, model_b = models[0], models[1]
639
+ else:
640
+ return jsonify({"error": "Invalid battle_id format"}), 400
641
+ except (ValueError, IndexError):
642
+ return jsonify({"error": "Invalid battle_id format"}), 400
643
+
644
+ record = data_loader.get_battle_detail(
645
+ subset, exp_name, model_a, model_b, sample_index
646
+ )
647
+
648
+ if not record:
649
+ return jsonify({"error": "Battle not found"}), 404
650
+
651
+ return jsonify(record.to_detail_dict())
652
+
653
+ @app.route("/api/subsets/<subset>/stats")
654
+ def api_stats(subset: str):
655
+ """Get statistics for a subset."""
656
+ exp_name = request.args.get("exp_name", None, type=str)
657
+ stats = data_loader.get_stats(subset, exp_name)
658
+
659
+ if not stats:
660
+ return jsonify({"error": "Subset not found"}), 404
661
+
662
+ return jsonify(stats)
663
+
664
+ @app.route("/api/subsets/<subset>/leaderboard")
665
+ def api_elo_leaderboard(subset: str):
666
+ """Get ELO leaderboard for a subset."""
667
+ models_param = request.args.get("models", None, type=str)
668
+ filter_models = None
669
+ if models_param:
670
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
671
+
672
+ leaderboard = data_loader.get_elo_leaderboard(subset, filter_models)
673
+ return jsonify({"leaderboard": leaderboard})
674
+
675
+ @app.route("/api/subsets/<subset>/models/<path:model>/stats")
676
+ def api_model_stats(subset: str, model: str):
677
+ """Get detailed statistics for a specific model."""
678
+ exp_name = request.args.get("exp_name", "__all__", type=str)
679
+ stats = data_loader.get_model_vs_stats(subset, model, exp_name)
680
+
681
+ if not stats:
682
+ return jsonify({"error": "Model not found"}), 404
683
+
684
+ return jsonify(stats)
685
+
686
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
687
+ def api_head_to_head(subset: str, exp_name: str):
688
+ """Get head-to-head statistics between two models."""
689
+ model_a = request.args.get("model_a", None, type=str)
690
+ model_b = request.args.get("model_b", None, type=str)
691
+
692
+ if not model_a or not model_b:
693
+ return jsonify({"error": "model_a and model_b are required"}), 400
694
+
695
+ h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
696
+ return jsonify(h2h)
697
+
698
+ @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
699
+ def api_input_image_count(subset: str, sample_index: int):
700
+ """Get the number of input images for a sample."""
701
+ count = data_loader.get_input_image_count(subset, sample_index)
702
+ return jsonify({"count": count})
703
+
704
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
705
+ def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
706
+ """Get all model outputs for a specific sample."""
707
+ models_param = request.args.get("models", None, type=str)
708
+ filter_models = None
709
+ if models_param:
710
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
711
+
712
+ stats_scope = request.args.get("stats_scope", "filtered", type=str)
713
+
714
+ result = data_loader.get_sample_all_models(
715
+ subset, exp_name, sample_index, filter_models, stats_scope
716
+ )
717
+
718
+ if not result:
719
+ return jsonify({"error": "Sample not found"}), 404
720
+
721
+ return jsonify(result)
722
+
723
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
724
+ def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
725
+ """Get all battle records for a specific model on a specific sample."""
726
+ opponents_param = request.args.get("opponents", None, type=str)
727
+ opponent_models = None
728
+ if opponents_param:
729
+ opponent_models = [m.strip() for m in opponents_param.split(",") if m.strip()]
730
+
731
+ result = data_loader.get_model_battles_for_sample(
732
+ subset=subset,
733
+ exp_name=exp_name,
734
+ sample_index=sample_index,
735
+ model=model,
736
+ opponent_models=opponent_models,
737
+ )
738
+
739
+ return jsonify(result)
740
+
741
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
742
+ def api_prompts(subset: str, exp_name: str):
743
+ """Get paginated list of prompts/samples."""
744
+ page = request.args.get("page", 1, type=int)
745
+ page_size = request.args.get("page_size", 10, type=int)
746
+ min_images = request.args.get("min_images", None, type=int)
747
+ max_images = request.args.get("max_images", None, type=int)
748
+ prompt_source = request.args.get("prompt_source", None, type=str)
749
+
750
+ models_param = request.args.get("models", None, type=str)
751
+ filter_models = None
752
+ if models_param:
753
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
754
+
755
+ prompts, total = data_loader.get_prompts(
756
+ subset=subset,
757
+ exp_name=exp_name,
758
+ page=page,
759
+ page_size=page_size,
760
+ min_images=min_images,
761
+ max_images=max_images,
762
+ prompt_source=prompt_source,
763
+ filter_models=filter_models,
764
+ )
765
+
766
+ return jsonify({
767
+ "prompts": prompts,
768
+ "total": total,
769
+ "page": page,
770
+ "page_size": page_size,
771
+ "total_pages": (total + page_size - 1) // page_size,
772
+ })
773
+
774
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
775
+ def api_search(subset: str, exp_name: str):
776
+ """Search battles by text query."""
777
+ query = request.args.get("q", "", type=str)
778
+ page = request.args.get("page", 1, type=int)
779
+ page_size = request.args.get("page_size", 20, type=int)
780
+ consistency = request.args.get("consistent", None, type=str)
781
+
782
+ models_param = request.args.get("models", None, type=str)
783
+ models = None
784
+ if models_param:
785
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
786
+
787
+ consistency_filter = None
788
+ if consistency == "true":
789
+ consistency_filter = True
790
+ elif consistency == "false":
791
+ consistency_filter = False
792
+
793
+ records, total = data_loader.search_battles(
794
+ subset=subset,
795
+ exp_name=exp_name,
796
+ query=query,
797
+ page=page,
798
+ page_size=page_size,
799
+ models=models,
800
+ consistency_filter=consistency_filter,
801
+ )
802
+
803
+ return jsonify({
804
+ "battles": [r.to_dict() for r in records],
805
+ "total": total,
806
+ "page": page,
807
+ "page_size": page_size,
808
+ "total_pages": (total + page_size - 1) // page_size,
809
+ "query": query,
810
+ })
811
+
812
@app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
def api_search_prompts(subset: str, exp_name: str):
    """Search prompts by text query.

    Query parameters:
        q: Text to search for (defaults to an empty string).
        page: Page number; defaults to 1 and is clamped to a minimum of 1.
        page_size: Prompts per page; defaults to 10 and is clamped to a
            minimum of 1.
        models: Comma-separated model names forwarded as ``filter_models``.

    Returns:
        A JSON response with the matching prompts, the pagination metadata
        and the query that was searched.
    """
    query = request.args.get("q", "", type=str)

    # Both values come straight from the query string. A page_size of 0 would
    # raise ZeroDivisionError in the total_pages computation below (HTTP 500),
    # and non-positive values make the pagination metadata meaningless, so
    # clamp them before they reach the data loader.
    page = max(1, request.args.get("page", 1, type=int))
    page_size = max(1, request.args.get("page_size", 10, type=int))

    # An absent or empty "models" parameter means "no model filter" (None).
    models_param = request.args.get("models", None, type=str)
    filter_models = None
    if models_param:
        filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

    prompts, total = data_loader.search_prompts(
        subset=subset,
        exp_name=exp_name,
        query=query,
        page=page,
        page_size=page_size,
        filter_models=filter_models,
    )

    return jsonify({
        "prompts": prompts,
        "total": total,
        "page": page,
        "page_size": page_size,
        # Ceiling division: a partially filled last page still counts.
        "total_pages": (total + page_size - 1) // page_size,
        "query": query,
    })
841
+
842
@app.route("/api/subsets/<subset>/matrix")
def api_win_rate_matrix(subset: str):
    """Return the win rate matrix for the model pairs of a subset.

    Query parameters:
        exp_name: Experiment to read; defaults to ``"__all__"``.
        models: Optional comma-separated model names used as a filter.
    """
    experiment = request.args.get("exp_name", "__all__", type=str)
    raw_models = request.args.get("models", None, type=str)

    # Missing or empty parameter -> None (no filter); otherwise the non-blank,
    # whitespace-trimmed names in their original order.
    if raw_models:
        trimmed = (piece.strip() for piece in raw_models.split(","))
        selected = [name for name in trimmed if name]
    else:
        selected = None

    return jsonify(data_loader.get_win_rate_matrix(subset, experiment, selected))
854
+
855
@app.route("/api/subsets/<subset>/leaderboard/by-source")
def api_elo_by_source(subset: str):
    """Return ELO rankings grouped by prompt source as JSON.

    The ``exp_name`` query parameter selects the experiment and defaults to
    ``"__all__"``.
    """
    experiment = request.args.get("exp_name", "__all__", type=str)
    return jsonify(data_loader.get_elo_by_source(subset, experiment))
861
+
862
@app.route("/api/subsets/<subset>/elo-history")
def api_elo_history(subset: str):
    """Return the ELO history over time for a subset as JSON.

    Query parameters:
        exp_name: Experiment to read; defaults to ``"__all__"``.
        granularity: Time bucket passed to the data loader; defaults to
            ``"day"``.
        models: Optional comma-separated model names used as a filter.
    """
    args = request.args
    experiment = args.get("exp_name", "__all__", type=str)
    bucket = args.get("granularity", "day", type=str)
    raw_models = args.get("models", None, type=str)

    # Missing or empty parameter -> None (no filter); otherwise the non-blank,
    # whitespace-trimmed names in their original order.
    if raw_models:
        trimmed = (piece.strip() for piece in raw_models.split(","))
        selected = [name for name in trimmed if name]
    else:
        selected = None

    history = data_loader.get_elo_history(subset, experiment, bucket, selected)
    return jsonify(history)
875
+
876
@app.route("/api/overview/leaderboards")
def api_overview_leaderboards():
    """Return the leaderboard data of every subset as JSON."""
    return jsonify(data_loader.get_all_subsets_leaderboards())
881
+
882
@app.route("/api/cross-subset/info")
def api_cross_subset_info():
    """Return information about models across multiple subsets.

    The ``subsets`` query parameter is a comma-separated list of subset
    names. A 400 response with an ``error`` message is returned when the
    parameter is missing/empty or contains no non-blank names.
    """
    raw_subsets = request.args.get("subsets", "", type=str)
    if not raw_subsets:
        return jsonify({"error": "subsets parameter is required"}), 400

    trimmed = (piece.strip() for piece in raw_subsets.split(","))
    subsets = [name for name in trimmed if name]
    # A value such as "," is non-empty but yields no usable names.
    if not subsets:
        return jsonify({"error": "At least 1 subset required"}), 400

    return jsonify(data_loader.get_cross_subset_info(subsets))
895
+
896
@app.route("/api/cross-subset/elo")
def api_cross_subset_elo():
    """Return ELO rankings computed across multiple subsets.

    Query parameters:
        subsets: Required comma-separated subset names.
        exp_name: Experiment to read; defaults to ``"__all__"``.
        model_scope: Scope passed to the data loader; defaults to ``"all"``.

    A 400 response with an ``error`` message is returned when ``subsets`` is
    missing/empty or contains no non-blank names.
    """
    args = request.args
    raw_subsets = args.get("subsets", "", type=str)
    if not raw_subsets:
        return jsonify({"error": "subsets parameter is required"}), 400

    trimmed = (piece.strip() for piece in raw_subsets.split(","))
    subsets = [name for name in trimmed if name]
    # A value such as "," is non-empty but yields no usable names.
    if not subsets:
        return jsonify({"error": "At least 1 subset required"}), 400

    experiment = args.get("exp_name", "__all__", type=str)
    scope = args.get("model_scope", "all", type=str)

    rankings = data_loader.get_cross_subset_elo(subsets, experiment, scope)
    return jsonify(rankings)
912
+
913
+ # ========== Image Routes ==========
914
+
915
@app.route("/images/<subset>/<model>/<int:sample_index>")
def serve_model_image(subset: str, model: str, sample_index: int):
    """Redirect to the hosted URL of a model's output image.

    Responds with 404 when the data loader has no URL for the requested
    subset/model/sample combination.
    """
    target = data_loader.get_model_image_url(subset, model, sample_index)
    if not target:
        abort(404)
    return redirect(target)
922
+
923
@app.route("/images/<subset>/input/<int:sample_index>")
@app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
    """Serve one input image of a sample.

    ``img_idx`` selects which input image to return and defaults to 0 when
    the shorter URL form is used. The bytes are sent with an ``image/png``
    mimetype and a one-hour cache lifetime; a 404 is returned when the data
    loader yields no bytes.
    """
    payload = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)

    if payload:
        # Wrap the raw bytes in a file-like object for send_file.
        stream = io.BytesIO(payload)
        return send_file(stream, mimetype="image/png", max_age=3600)

    abort(404)
937
+
938
+ return app