genarena 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,11 +7,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
7
7
  git \
8
8
  && rm -rf /var/lib/apt/lists/*
9
9
 
10
- # Copy project files
11
- COPY . .
12
-
13
10
  # Install Python dependencies
14
- RUN pip install --no-cache-dir -e .[web]
11
+ RUN pip install genarena[web]
15
12
 
16
13
  # Download parquet benchmark data from HuggingFace
17
14
  # This dataset contains the prompt/benchmark data (not arena battle results)
@@ -22,4 +19,4 @@ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(
22
19
  EXPOSE 7860
23
20
 
24
21
  # Start the application
25
- CMD ["python", "genarena/deploy/app.py"]
22
+ CMD ["python", "-m", "genarena.deploy.app"]
@@ -2267,14 +2267,15 @@ class HFArenaDataLoader(ArenaDataLoader):
2267
2267
  preload: If True, preload all data at initialization
2268
2268
  """
2269
2269
  self.hf_repo = hf_repo
2270
- self._image_url_index = self._build_image_index(image_files)
2270
+ # Build both indexes at once
2271
+ self._image_url_index, self._subset_models_index = self._build_image_index(image_files)
2271
2272
  super().__init__(arena_dir, data_dir, preload=preload)
2272
2273
 
2273
2274
  def _build_image_index(
2274
2275
  self, image_files: list[str]
2275
- ) -> dict[tuple[str, str, int], str]:
2276
+ ) -> tuple[dict[tuple[str, str, int], str], dict[str, list[str]]]:
2276
2277
  """
2277
- Build index: (subset, model, sample_index) -> hf_file_path
2278
+ Build indexes from HF image file list.
2278
2279
 
2279
2280
  Expected path format: {subset}/models/{exp_name}/{model}/{index}.png
2280
2281
 
@@ -2282,11 +2283,14 @@ class HFArenaDataLoader(ArenaDataLoader):
2282
2283
  image_files: List of image file paths from HF repo
2283
2284
 
2284
2285
  Returns:
2285
- Dict mapping (subset, model, sample_index) to HF file path
2286
+ Tuple of:
2287
+ - Dict mapping (subset, model, sample_index) to HF file path
2288
+ - Dict mapping subset to sorted list of model names
2286
2289
  """
2287
2290
  from genarena.models import parse_image_index
2288
2291
 
2289
2292
  index: dict[tuple[str, str, int], str] = {}
2293
+ subset_models: dict[str, set[str]] = {}
2290
2294
 
2291
2295
  for path in image_files:
2292
2296
  parts = path.split("/")
@@ -2300,9 +2304,18 @@ class HFArenaDataLoader(ArenaDataLoader):
2300
2304
  if idx is not None:
2301
2305
  # If duplicate, later entries overwrite earlier ones
2302
2306
  index[(subset, model, idx)] = path
2307
+ # Also track subset -> models mapping
2308
+ if subset not in subset_models:
2309
+ subset_models[subset] = set()
2310
+ subset_models[subset].add(model)
2311
+
2312
+ # Convert sets to sorted lists
2313
+ subset_models_sorted: dict[str, list[str]] = {
2314
+ subset: sorted(models) for subset, models in subset_models.items()
2315
+ }
2303
2316
 
2304
- logger.info(f"Built image URL index with {len(index)} entries")
2305
- return index
2317
+ logger.info(f"Built image URL index with {len(index)} entries across {len(subset_models_sorted)} subsets")
2318
+ return index, subset_models_sorted
2306
2319
 
2307
2320
  def get_model_image_url(
2308
2321
  self, subset: str, model: str, sample_index: int
@@ -2333,3 +2346,159 @@ class HFArenaDataLoader(ArenaDataLoader):
2333
2346
  """
2334
2347
  # Return None to indicate image should be fetched via CDN
2335
2348
  return None
2349
+
2350
+ def _get_available_models_for_subset(self, subset: str) -> list[str]:
2351
+ """
2352
+ Get list of models that have images in the HF CDN for this subset.
2353
+
2354
+ Uses pre-built index for O(1) lookup.
2355
+
2356
+ Returns:
2357
+ List of model names (sorted)
2358
+ """
2359
+ return self._subset_models_index.get(subset, [])
2360
+
2361
+ def _has_model_image(self, subset: str, model: str, sample_index: int) -> bool:
2362
+ """
2363
+ Check if a model has an image for a specific sample in the HF CDN.
2364
+
2365
+ Args:
2366
+ subset: Subset name
2367
+ model: Model name
2368
+ sample_index: Sample index
2369
+
2370
+ Returns:
2371
+ True if image exists in CDN index
2372
+ """
2373
+ return (subset, model, sample_index) in self._image_url_index
2374
+
2375
+ def get_subset_info(self, subset: str) -> Optional[SubsetInfo]:
2376
+ """
2377
+ Get information about a subset.
2378
+
2379
+ Override for HF deployment to use CDN image index for models list.
2380
+
2381
+ Args:
2382
+ subset: Subset name
2383
+
2384
+ Returns:
2385
+ SubsetInfo or None if subset doesn't exist
2386
+ """
2387
+ if subset in self._subset_info_cache:
2388
+ return self._subset_info_cache[subset]
2389
+
2390
+ subset_path = os.path.join(self.arena_dir, subset)
2391
+ if not os.path.isdir(subset_path):
2392
+ return None
2393
+
2394
+ # Get models from CDN index instead of local file system
2395
+ models = self._get_available_models_for_subset(subset)
2396
+
2397
+ # Get experiments
2398
+ pk_logs_dir = os.path.join(subset_path, "pk_logs")
2399
+ experiments = []
2400
+ if os.path.isdir(pk_logs_dir):
2401
+ for name in os.listdir(pk_logs_dir):
2402
+ exp_path = os.path.join(pk_logs_dir, name)
2403
+ if os.path.isdir(exp_path):
2404
+ # Check for battle logs
2405
+ has_logs = any(
2406
+ f.endswith(".jsonl")
2407
+ for f in os.listdir(exp_path)
2408
+ if os.path.isfile(os.path.join(exp_path, f))
2409
+ )
2410
+ if has_logs:
2411
+ experiments.append(name)
2412
+ experiments.sort()
2413
+
2414
+ # Load state
2415
+ state_path = os.path.join(subset_path, "arena", "state.json")
2416
+ state = load_state(state_path)
2417
+
2418
+ # Get image count range
2419
+ img_range = self._image_count_range.get(subset, (1, 1))
2420
+
2421
+ # Get prompt sources
2422
+ prompt_sources = self._prompt_sources.get(subset, [])
2423
+
2424
+ info = SubsetInfo(
2425
+ name=subset,
2426
+ models=models,
2427
+ experiments=experiments,
2428
+ total_battles=state.total_battles,
2429
+ state=state,
2430
+ min_input_images=img_range[0],
2431
+ max_input_images=img_range[1],
2432
+ prompt_sources=prompt_sources,
2433
+ )
2434
+
2435
+ self._subset_info_cache[subset] = info
2436
+ return info
2437
+
2438
+ def get_sample_all_models(
2439
+ self, subset: str, exp_name: str, sample_index: int,
2440
+ filter_models: Optional[list[str]] = None,
2441
+ stats_scope: str = "filtered"
2442
+ ) -> dict[str, Any]:
2443
+ """
2444
+ Get all model outputs for a specific sample, sorted by win rate.
2445
+
2446
+ Override for HF deployment to use CDN image index instead of local files.
2447
+
2448
+ Args:
2449
+ subset: Subset name
2450
+ exp_name: Experiment name
2451
+ sample_index: Sample index
2452
+ filter_models: Optional list of models to filter (show only these models)
2453
+ stats_scope: 'filtered' = only count battles between filtered models,
2454
+ 'all' = count all battles (but show only filtered models)
2455
+
2456
+ Returns:
2457
+ Dict with sample info and all model outputs sorted by win rate
2458
+ """
2459
+ # Get sample metadata
2460
+ sample_meta = self._get_sample_data(subset, sample_index)
2461
+
2462
+ # Determine which models to use for stats calculation
2463
+ stats_filter = filter_models if stats_scope == "filtered" else None
2464
+ model_stats = self.get_model_win_stats(subset, exp_name, sample_index, stats_filter)
2465
+
2466
+ # Get all models that have outputs in CDN (O(1) lookup)
2467
+ available_models_list = self._get_available_models_for_subset(subset)
2468
+
2469
+ # Apply filter if specified
2470
+ if filter_models:
2471
+ filter_set = set(filter_models)
2472
+ available_models_list = [m for m in available_models_list if m in filter_set]
2473
+
2474
+ # Build model info for models that have images for this sample
2475
+ available_models = []
2476
+ for model in available_models_list:
2477
+ # Check if model has image for this sample in CDN index
2478
+ if self._has_model_image(subset, model, sample_index):
2479
+ stats = model_stats.get(model, {
2480
+ "wins": 0, "losses": 0, "ties": 0, "total": 0, "win_rate": 0
2481
+ })
2482
+ available_models.append({
2483
+ "model": model,
2484
+ "wins": stats["wins"],
2485
+ "losses": stats["losses"],
2486
+ "ties": stats["ties"],
2487
+ "total": stats["total"],
2488
+ "win_rate": stats["win_rate"],
2489
+ })
2490
+
2491
+ # Sort by win rate (descending), then by wins (descending), then by model name
2492
+ available_models.sort(key=lambda x: (-x["win_rate"], -x["wins"], x["model"]))
2493
+
2494
+ return {
2495
+ "subset": subset,
2496
+ "exp_name": exp_name,
2497
+ "sample_index": sample_index,
2498
+ "instruction": sample_meta.get("instruction", ""),
2499
+ "task_type": sample_meta.get("task_type", ""),
2500
+ "input_image_count": sample_meta.get("input_image_count", 1),
2501
+ "prompt_source": sample_meta.get("prompt_source"),
2502
+ "original_metadata": sample_meta.get("original_metadata"),
2503
+ "models": available_models,
2504
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: genarena
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: GenArena Arena Evaluation - VLM-based pairwise image generation evaluation
5
5
  Author: GenArena Team
6
6
  License: Apache-2.0
@@ -13,7 +13,7 @@ genarena/sampling.py,sha256=v3AeOASfrxBYyPGy3tlAgBetK5F1_AhdJa9HgB9-XQM,11607
13
13
  genarena/state.py,sha256=SK93_ACqHVS3FpHchp5Oj-UvwjDcPZv-ACBX3Cc-P8Q,24095
14
14
  genarena/utils.py,sha256=ppzphYoNryjBMQlgS4GAGC2lw1nmdE_zN4RTcDQk5Y8,2685
15
15
  genarena/vlm.py,sha256=kfgLtSd2wJ077O-VxlNbvRv70Hgg-jWN5ZcICruaZBw,18249
16
- genarena/deploy/Dockerfile,sha256=UN3lm5WgFIxR4plsLKqhFXnvsqZ-ZuDC7KvRQ5E9Qiw,710
16
+ genarena/deploy/Dockerfile,sha256=sbYetDT5ajJHJxwcO5DZzMaqlOZddHXLcUXN8zHaIMY,670
17
17
  genarena/deploy/README.md,sha256=7KcPVY73_5Gotr6a-E24xgeVxe5fokuT4KlupQead8w,1576
18
18
  genarena/deploy/__init__.py,sha256=BpXfurQ84w_Qr_C8Joy0Oh_9HCU--5cMSt4wvxsPV8Y,122
19
19
  genarena/deploy/app.py,sha256=BPifFGz9p0J7-TFw19JPuzmxamp-hdNvVpIxcvHAPsc,2716
@@ -33,12 +33,12 @@ genarena/validation/validator.py,sha256=-yfVMXJBOSMmfajczGjpW3K0Xe1LFHPbdXh5cuMM
33
33
  genarena/visualize/README.md,sha256=8YOEBRicm35G6wEbA-qBbHBkZwozl0Zdl8zNqmb-t_Y,4525
34
34
  genarena/visualize/__init__.py,sha256=Id0QCPo_QuxjZOG7QuqttdzNCwmDFrH26eeYqHLn-JU,283
35
35
  genarena/visualize/app.py,sha256=2TbGuH22zV2U3Fm8LjZLTxsoVeZHRmMqsBlSZ1xhz1A,34903
36
- genarena/visualize/data_loader.py,sha256=C28qx26iJT_cJbAJfRDVKprB9S6nZK63kbfzh87ofpk,86107
36
+ genarena/visualize/data_loader.py,sha256=J38DRcM4Byd6cH5vhlMcN6U7-TN-wGkQ_CmJevdSfgU,92588
37
37
  genarena/visualize/static/app.js,sha256=g2sdB9zfa_Nee-sQ-JJOWOGKJeihD31LpWyg-vSB6JA,144584
38
38
  genarena/visualize/static/model_aliases.json,sha256=iZQ4IIm-Vv2ly8XSPT2QPmDHM4PlnJS3RTdskbfhQME,1594
39
39
  genarena/visualize/static/style.css,sha256=nIAyGr9PpY9C-wGR5TGPgHB1g9KRCWN-iEEN8F1tbdk,78265
40
40
  genarena/visualize/templates/index.html,sha256=cJoFWkXVXP9MDee16vq-ufMhbs89cVwaTVhS4RKMW1E,21725
41
- genarena-0.1.0.dist-info/METADATA,sha256=IJs5QDs2nGpGSxho3j4-zqfSY3GQu1iycZg6VFKV_Qc,6065
42
- genarena-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
43
- genarena-0.1.0.dist-info/entry_points.txt,sha256=yEZL7896wPLpHS9dWMQ82V5-04PJaYkm48mb7dNdlhM,47
44
- genarena-0.1.0.dist-info/RECORD,,
41
+ genarena-0.1.2.dist-info/METADATA,sha256=DwWRNwiZzgaKYwr8StwBKvzNqzkcex8fni-zu-8_Qnw,6065
42
+ genarena-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
43
+ genarena-0.1.2.dist-info/entry_points.txt,sha256=yEZL7896wPLpHS9dWMQ82V5-04PJaYkm48mb7dNdlhM,47
44
+ genarena-0.1.2.dist-info/RECORD,,