genarena 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. genarena/__init__.py +49 -2
  2. genarena/__main__.py +10 -0
  3. genarena/arena.py +1685 -0
  4. genarena/battle.py +337 -0
  5. genarena/bt_elo.py +507 -0
  6. genarena/cli.py +1581 -0
  7. genarena/data.py +476 -0
  8. genarena/deploy/Dockerfile +22 -0
  9. genarena/deploy/README.md +55 -0
  10. genarena/deploy/__init__.py +5 -0
  11. genarena/deploy/app.py +84 -0
  12. genarena/experiments.py +121 -0
  13. genarena/leaderboard.py +270 -0
  14. genarena/logs.py +409 -0
  15. genarena/models.py +412 -0
  16. genarena/prompts/__init__.py +127 -0
  17. genarena/prompts/mmrb2.py +373 -0
  18. genarena/sampling.py +336 -0
  19. genarena/state.py +656 -0
  20. genarena/sync/__init__.py +105 -0
  21. genarena/sync/auto_commit.py +118 -0
  22. genarena/sync/deploy_ops.py +543 -0
  23. genarena/sync/git_ops.py +422 -0
  24. genarena/sync/hf_ops.py +891 -0
  25. genarena/sync/init_ops.py +431 -0
  26. genarena/sync/packer.py +587 -0
  27. genarena/sync/submit.py +837 -0
  28. genarena/utils.py +103 -0
  29. genarena/validation/__init__.py +19 -0
  30. genarena/validation/schema.py +327 -0
  31. genarena/validation/validator.py +329 -0
  32. genarena/visualize/README.md +148 -0
  33. genarena/visualize/__init__.py +14 -0
  34. genarena/visualize/app.py +938 -0
  35. genarena/visualize/data_loader.py +2430 -0
  36. genarena/visualize/static/app.js +3762 -0
  37. genarena/visualize/static/model_aliases.json +86 -0
  38. genarena/visualize/static/style.css +4104 -0
  39. genarena/visualize/templates/index.html +413 -0
  40. genarena/vlm.py +519 -0
  41. genarena-0.1.1.dist-info/METADATA +178 -0
  42. genarena-0.1.1.dist-info/RECORD +44 -0
  43. {genarena-0.0.1.dist-info → genarena-0.1.1.dist-info}/WHEEL +1 -2
  44. genarena-0.1.1.dist-info/entry_points.txt +2 -0
  45. genarena-0.0.1.dist-info/METADATA +0 -26
  46. genarena-0.0.1.dist-info/RECORD +0 -5
  47. genarena-0.0.1.dist-info/top_level.txt +0 -1
genarena/validation/validator.py
@@ -0,0 +1,329 @@
+ # Copyright 2026 Ruihang Li.
+ # Licensed under the Apache License, Version 2.0.
+ # See LICENSE file in the project root for details.
+
+ """
+ Validator for GenArena submissions.
+
+ This module provides functions to validate submission files,
+ including downloading and verifying data from HuggingFace.
+ Used by the GitHub Actions bot for automated validation.
+ """
+
+ import hashlib
+ import json
+ import logging
+ import os
+ import tempfile
+ import zipfile
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+ from genarena.validation.schema import validate_submission_schema
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ValidationCheck:
+     """Single validation check result."""
+
+     name: str
+     passed: bool
+     error: Optional[str] = None
+
+
+ @dataclass
+ class ValidationReport:
+     """Complete validation report for a submission."""
+
+     status: str  # "success" or "failed"
+     submission_id: str = ""
+     exp_name: str = ""
+     subset: str = ""
+     models: list[str] = field(default_factory=list)
+     new_models: list[str] = field(default_factory=list)
+     total_battles: int = 0
+     checks: list[ValidationCheck] = field(default_factory=list)
+     elo_comparison: dict[str, dict[str, float]] = field(default_factory=dict)
+     errors: list[str] = field(default_factory=list)
+
+     def add_check(self, name: str, passed: bool, error: Optional[str] = None) -> None:
+         """Add a validation check result."""
+         self.checks.append(ValidationCheck(name=name, passed=passed, error=error))
+         if not passed:
+             self.status = "failed"
+             if error:
+                 self.errors.append(f"{name}: {error}")
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "status": self.status,
+             "submission_id": self.submission_id,
+             "exp_name": self.exp_name,
+             "subset": self.subset,
+             "models": self.models,
+             "new_models": self.new_models,
+             "total_battles": self.total_battles,
+             "checks": [
+                 {"name": c.name, "passed": c.passed, "error": c.error}
+                 for c in self.checks
+             ],
+             "elo_comparison": self.elo_comparison,
+             "errors": self.errors,
+         }
+
+
+ def validate_submission_file(
+     submission_path: str,
+     official_models_path: Optional[str] = None,
+     download_data: bool = True,
+ ) -> ValidationReport:
+     """
+     Validate a submission JSON file.
+
+     This is the main entry point for validating submissions,
+     used by the GitHub Actions bot.
+
+     Args:
+         submission_path: Path to submission JSON file
+         official_models_path: Path to official_models.json (optional)
+         download_data: Whether to download and verify data from HF
+
+     Returns:
+         ValidationReport with all check results
+     """
+     report = ValidationReport(status="success")
+
+     # 1. Load and parse JSON
+     try:
+         with open(submission_path, "r", encoding="utf-8") as f:
+             submission = json.load(f)
+         report.add_check("JSON parse", True)
+     except json.JSONDecodeError as e:
+         report.add_check("JSON parse", False, str(e))
+         return report
+     except IOError as e:
+         report.add_check("File read", False, str(e))
+         return report
+
+     # 2. Schema validation
+     is_valid, schema_errors = validate_submission_schema(submission)
+     if is_valid:
+         report.add_check("Schema validation", True)
+     else:
+         for err in schema_errors:
+             report.add_check("Schema validation", False, err)
+         return report
+
+     # Extract basic info
+     report.submission_id = submission.get("submission_id", "")
+     exp = submission.get("experiment", {})
+     report.exp_name = exp.get("exp_name", "")
+     report.subset = exp.get("subset", "")
+     report.models = exp.get("models", [])
+     report.new_models = exp.get("new_models", [])
+     report.total_battles = exp.get("total_battles", 0)
+
+     # 3. Check new models against official list
+     if official_models_path and os.path.isfile(official_models_path):
+         try:
+             with open(official_models_path, "r", encoding="utf-8") as f:
+                 official_data = json.load(f)
+             official_models = set(
+                 official_data.get("subsets", {})
+                 .get(report.subset, {})
+                 .get("models", [])
+             )
+
+             # Verify new_models are actually new
+             for model in report.new_models:
+                 if model in official_models:
+                     report.add_check(
+                         f"Model '{model}' is new",
+                         False,
+                         "Model already exists in official leaderboard",
+                     )
+                 else:
+                     report.add_check(f"Model '{model}' is new", True)
+
+         except Exception as e:
+             report.add_check(
+                 "Check official models", False, f"Failed to load official models: {e}"
+             )
+     else:
+         report.add_check(
+             "Check official models",
+             True,
+             "Skipped (no official_models.json provided)",
+         )
+
+     # 4. Download and verify data from HuggingFace
+     if download_data:
+         data_report = validate_submission_data(submission)
+         for check in data_report.checks:
+             report.checks.append(check)
+             if not check.passed:
+                 report.status = "failed"
+                 if check.error:
+                     report.errors.append(f"{check.name}: {check.error}")
+         report.elo_comparison = data_report.elo_comparison
+     else:
+         report.add_check("Data verification", True, "Skipped (download_data=False)")
+
+     return report
+
+
+ def validate_submission_data(submission: dict[str, Any]) -> ValidationReport:
+     """
+     Download and validate submission data from HuggingFace.
+
+     Downloads the pk_logs ZIP, verifies checksum, extracts battles,
+     and recalculates ELO for comparison.
+
+     Args:
+         submission: Submission metadata dictionary
+
+     Returns:
+         ValidationReport with data validation results
+     """
+     report = ValidationReport(status="success")
+
+     data_loc = submission.get("data_location", {})
+     hf_repo = data_loc.get("hf_repo_id", "")
+     hf_revision = data_loc.get("hf_revision", "main")
+     files = data_loc.get("files", {})
+     pk_logs_info = files.get("pk_logs_zip", {})
+
+     if not hf_repo or not pk_logs_info:
+         report.add_check("Data location", False, "Missing HF repo or file info")
+         return report
+
+     try:
+         from huggingface_hub import hf_hub_download
+     except ImportError:
+         report.add_check(
+             "HuggingFace Hub",
+             False,
+             "huggingface_hub not installed",
+         )
+         return report
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # Download pk_logs ZIP
+         try:
+             pk_logs_path = hf_hub_download(
+                 repo_id=hf_repo,
+                 filename=pk_logs_info["path"],
+                 repo_type="dataset",
+                 revision=hf_revision,
+                 local_dir=tmpdir,
+             )
+             report.add_check("Download pk_logs", True)
+         except Exception as e:
+             report.add_check("Download pk_logs", False, str(e))
+             return report
+
+         # Verify SHA256
+         expected_sha = pk_logs_info.get("sha256", "")
+         try:
+             with open(pk_logs_path, "rb") as f:
+                 actual_sha = hashlib.sha256(f.read()).hexdigest()
+
+             if actual_sha == expected_sha:
+                 report.add_check("SHA256 checksum", True)
+             else:
+                 report.add_check(
+                     "SHA256 checksum",
+                     False,
+                     f"Expected {expected_sha[:16]}..., got {actual_sha[:16]}...",
+                 )
+                 return report
+         except Exception as e:
+             report.add_check("SHA256 checksum", False, str(e))
+             return report
+
+         # Extract ZIP
+         extract_dir = os.path.join(tmpdir, "extracted")
+         try:
+             with zipfile.ZipFile(pk_logs_path, "r") as zf:
+                 zf.extractall(extract_dir)
+             report.add_check("Extract ZIP", True)
+         except Exception as e:
+             report.add_check("Extract ZIP", False, str(e))
+             return report
+
+         # Find battle log files
+         # The ZIP structure is: <exp_name>/*.jsonl
+         battle_records = []
+         try:
+             for root, dirs, filenames in os.walk(extract_dir):
+                 for filename in filenames:
+                     if filename.endswith(".jsonl") and "raw_outputs" not in root:
+                         filepath = os.path.join(root, filename)
+                         with open(filepath, "r", encoding="utf-8") as f:
+                             for line in f:
+                                 line = line.strip()
+                                 if line:
+                                     try:
+                                         record = json.loads(line)
+                                         battle_records.append(record)
+                                     except json.JSONDecodeError:
+                                         continue
+             report.add_check("Parse battle logs", True)
+         except Exception as e:
+             report.add_check("Parse battle logs", False, str(e))
+             return report
+
+         # Verify battle count
+         expected_battles = submission.get("experiment", {}).get("total_battles", 0)
+         if len(battle_records) == expected_battles:
+             report.add_check("Battle count", True)
+         else:
+             report.add_check(
+                 "Battle count",
+                 False,
+                 f"Expected {expected_battles}, got {len(battle_records)}",
+             )
+
+         # Recalculate ELO
+         try:
+             from genarena.bt_elo import compute_bt_elo_ratings
+
+             battles = [
+                 (r["model_a"], r["model_b"], r["final_winner"])
+                 for r in battle_records
+                 if r.get("model_a") and r.get("model_b") and r.get("final_winner")
+             ]
+
+             if battles:
+                 recalc_elo = compute_bt_elo_ratings(battles)
+                 submitted_elo = submission.get("elo_preview", {}).get("ratings", {})
+
+                 all_match = True
+                 for model, submitted_rating in submitted_elo.items():
+                     recalc_rating = recalc_elo.get(model, 0)
+                     report.elo_comparison[model] = {
+                         "submitted": submitted_rating,
+                         "recalculated": recalc_rating,
+                     }
+
+                     # Allow small floating point differences (±1.0)
+                     diff = abs(submitted_rating - recalc_rating)
+                     if diff > 1.0:
+                         report.add_check(
+                             f"ELO '{model}'",
+                             False,
+                             f"Diff: {diff:.1f} (submitted: {submitted_rating:.1f}, "
+                             f"recalc: {recalc_rating:.1f})",
+                         )
+                         all_match = False
+
+                 if all_match:
+                     report.add_check("ELO verification", True)
+
+         except Exception as e:
+             report.add_check("ELO verification", False, str(e))
+
+     return report
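The validator above is self-contained enough to run outside the GitHub Actions bot. A minimal sketch of driving it locally, assuming the paths are placeholders and skipping the HuggingFace download step:

```python
import json

from genarena.validation.validator import validate_submission_file

# Placeholder paths; point these at a real submission and official model list.
report = validate_submission_file(
    "submission.json",
    official_models_path="official_models.json",
    download_data=False,  # set True to also run validate_submission_data()
)

# The report serializes to plain JSON, which is how the bot would post it.
print(json.dumps(report.to_dict(), indent=2))
for check in report.checks:
    print("PASS" if check.passed else "FAIL", "-", check.name, check.error or "")
```

With `download_data=True`, the same call also exercises the SHA256, ZIP-extraction, battle-count, and ELO checks from `validate_submission_data`.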
genarena/visualize/README.md
@@ -0,0 +1,148 @@
+ # GenArena Arena Visualizer
+
+ A web-based visualization tool for browsing and analyzing battle records from GenArena Arena evaluations.
+
+ ## Features
+
+ - **Multi-subset Support**: Select and switch between different subsets directly in the web interface
+ - **Paginated Browsing**: Efficiently browse large numbers of battle records with pagination
+ - **Flexible Filtering**:
+   - Filter by model (view all battles involving a specific model)
+   - Filter by result (wins/losses/ties for a selected model)
+   - Filter by consistency (consistent vs inconsistent VLM judgments)
+ - **Detailed Battle View**: Click any battle card to see:
+   - Full instruction text
+   - Input image and both model outputs side-by-side
+   - Complete VLM judge reasoning (original and swapped calls)
+ - **Dark Theme**: Modern dark UI designed for extended analysis sessions
+
+ ## Installation
+
+ The visualizer requires Flask:
+
+ ```bash
+ pip install flask
+ ```
+
+ ## Usage
+
+ ### Command Line
+
+ Start the visualization server using the `genarena serve` command:
+
+ ```bash
+ genarena serve \
+     --arena_dir /path/to/arena \
+     --data_dir /path/to/data \
+     --port 8080 \
+     --host 0.0.0.0
+ ```
+
+ ### Arguments
+
+ | Argument | Required | Default | Description |
+ |----------|----------|---------|-------------|
+ | `--arena_dir` | Yes | - | Path to the arena directory containing subset folders with battle logs |
+ | `--data_dir` | Yes | - | Path to the data directory containing parquet files |
+ | `--host` | No | `0.0.0.0` | Host address to bind the server |
+ | `--port` | No | `8080` | Port number to listen on |
+ | `--debug` | No | `False` | Enable Flask debug mode |
+
+ ### Example
+
+ ```bash
+ genarena serve \
+     --arena_dir /projects/genarena/arena \
+     --data_dir /datasets/genarena/data \
+     --port 8080
+ ```
+
+ Then open `http://localhost:8080` in your browser.
+
+ ## Web Interface
+
+ ### Navigation
+
+ 1. **Select Subset**: Use the dropdown in the header to choose a subset
+ 2. **Select Experiment**: Choose an experiment from the dropdown (populated after subset selection)
+ 3. **Browse Battles**: Scroll through the paginated battle cards
+
+ ### Filtering
+
+ Use the sidebar filters to narrow down results:
+
+ - **Model Filter**: Show only battles involving a specific model
+ - **Result Filter**: When a model is selected, filter by wins/losses/ties
+ - **Consistency Filter**: Show only consistent or inconsistent judgments
+
+ ### Battle Cards
+
+ Each card displays:
+ - Model names (winner highlighted in green, loser in red)
+ - Instruction text (truncated)
+ - Thumbnail images: input, model A output, model B output
+ - Result badges (Win/Loss/Tie, Consistent/Inconsistent)
+
+ Click a card to open the detail modal.
+
+ ### Detail Modal
+
+ The detail view shows:
+ - Full instruction text
+ - Large images for comparison
+ - Complete VLM judge outputs for both calls (original order and swapped order)
+ - Parse results and winner determination
+
+ ### Keyboard Shortcuts
+
+ | Key | Action |
+ |-----|--------|
+ | `j` / `↓` | Next page |
+ | `k` / `↑` | Previous page |
+ | `Esc` | Close detail modal |
+
+ ## API Endpoints
+
+ The visualizer exposes a REST API that can be used programmatically:
+
+ | Endpoint | Description |
+ |----------|-------------|
+ | `GET /api/subsets` | List available subsets |
+ | `GET /api/subsets/<subset>/info` | Get subset info (models, experiments) |
+ | `GET /api/subsets/<subset>/experiments/<exp>/battles` | Get paginated battles |
+ | `GET /api/subsets/<subset>/experiments/<exp>/battles/<id>` | Get battle detail |
+ | `GET /api/subsets/<subset>/stats` | Get statistics |
+ | `GET /images/<subset>/<model>/<index>` | Serve model output image |
+ | `GET /images/<subset>/input/<index>` | Serve input image |
+
+ ### Query Parameters for `/battles`
+
+ | Parameter | Type | Description |
+ |-----------|------|-------------|
+ | `page` | int | Page number (1-indexed) |
+ | `page_size` | int | Records per page (default: 20) |
+ | `model` | string | Filter by model name |
+ | `result` | string | Filter by result: `wins`, `losses`, `ties` |
+ | `consistent` | string | Filter by consistency: `true`, `false` |
+
+ ## Directory Structure
+
+ ```
+ visualize/
+ ├── __init__.py          # Package exports
+ ├── app.py               # Flask application and routes
+ ├── data_loader.py       # Data loading and querying logic
+ ├── templates/
+ │   └── index.html       # Main page template
+ └── static/
+     ├── style.css        # Dark theme styles
+     └── app.js           # Frontend JavaScript
+ ```
+
+ ## Requirements
+
+ - Python 3.9+ (the package uses built-in generic annotations such as `list[str]`)
+ - Flask
+ - GenArena arena with battle logs (pk_logs directory)
+ - Parquet dataset with evaluation data
+
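The README documents the REST endpoints but not their response payloads, so a client should treat them as opaque JSON. A quick sketch using only the standard library, assuming a server started as in the Usage section; the subset and experiment names are placeholders:

```python
import json
from urllib.parse import quote, urlencode
from urllib.request import urlopen

BASE = "http://localhost:8080"  # a locally running `genarena serve` instance

def get_json(path: str, **params):
    """Fetch a visualizer API endpoint and decode the JSON body."""
    url = f"{BASE}{path}"
    if params:
        url += "?" + urlencode(params)
    with urlopen(url) as resp:
        return json.load(resp)

# "my_subset" / "my_exp" are placeholders for real names from /api/subsets.
print(get_json("/api/subsets"))
battles = get_json(
    f"/api/subsets/{quote('my_subset')}/experiments/{quote('my_exp')}/battles",
    page=1,
    page_size=20,
    consistent="true",
)
print(json.dumps(battles, indent=2)[:500])
```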
genarena/visualize/__init__.py
@@ -0,0 +1,14 @@
+ """
+ GenArena Arena Visualization Module.
+
+ Provides a web-based interface for browsing and analyzing battle records.
+ """
+
+ from genarena.visualize.app import create_app
+ from genarena.visualize.data_loader import ArenaDataLoader
+
+ __all__ = [
+     "create_app",
+     "ArenaDataLoader",
+ ]
+
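These exports suggest the Flask app can be embedded rather than launched through `genarena serve`. A sketch under the assumption that `create_app` accepts the same directories as the CLI's `--arena_dir`/`--data_dir` flags; its actual signature lives in `genarena/visualize/app.py` and is not shown in this diff:

```python
from genarena.visualize import create_app

# Hypothetical keyword arguments mirroring the CLI flags; verify against
# the real create_app() signature in genarena/visualize/app.py.
app = create_app(
    arena_dir="/projects/genarena/arena",
    data_dir="/datasets/genarena/data",
)
app.run(host="0.0.0.0", port=8080)  # Flask's built-in development server
```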