PyPI - janus-labs - Versions diffs - 0.2.0__py3-none-any.whl - Mend

janus-labs 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

cli/__init__.py +1 -0
cli/__main__.py +7 -0
cli/clipboard.py +113 -0
cli/main.py +690 -0
cli/output.py +97 -0
cli/submit.py +270 -0
config/__init__.py +1 -0
config/detection.py +72 -0
forge/__init__.py +5 -0
forge/behavior.py +35 -0
forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
forge/behaviors/BHV-003-error-handling.yaml +28 -0
gauge/__init__.py +17 -0
gauge/adapter.py +134 -0
gauge/behaviors/__init__.py +11 -0
gauge/behaviors/code_quality.py +73 -0
gauge/behaviors/instruction_adherence.py +52 -0
gauge/behaviors/test_cheating.py +178 -0
gauge/governed_rollout.py +107 -0
gauge/judge.py +179 -0
gauge/qualitative.py +271 -0
gauge/report.py +210 -0
gauge/trust_elasticity.py +172 -0
governance/__init__.py +14 -0
governance/bridge.py +124 -0
governance/memory.py +116 -0
harness/__init__.py +1 -0
harness/artifacts.py +195 -0
harness/executor.py +51 -0
harness/sandbox.py +40 -0
harness/types.py +46 -0
janus_labs/__init__.py +16 -0
janus_labs/__main__.py +37 -0
janus_labs-0.2.0.dist-info/METADATA +316 -0
janus_labs-0.2.0.dist-info/RECORD +80 -0
janus_labs-0.2.0.dist-info/WHEEL +5 -0
janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
janus_labs-0.2.0.dist-info/top_level.txt +11 -0
janus_types.py +140 -0
probe/__init__.py +19 -0
probe/discovery.py +194 -0
probe/explorer.py +236 -0
probe/mutations.py +196 -0
probe/tracer.py +193 -0
scaffold/__init__.py +1 -0
scaffold/scorer.py +321 -0
scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
scaffold/templates/default/.gitignore +4 -0
scaffold/templates/default/src/__init__.py +0 -0
scaffold/templates/default/src/main.py +23 -0
scaffold/templates/default/tests/__init__.py +0 -0
scaffold/templates/default/tests/test_main.py +32 -0
scaffold/workspace.py +202 -0
scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
suite/__init__.py +16 -0
suite/builtin/__init__.py +13 -0
suite/builtin/hello_world.py +28 -0
suite/builtin/refactor_storm.py +92 -0
suite/comparison.py +274 -0
suite/definition.py +51 -0
suite/export/__init__.py +6 -0
suite/export/github.py +58 -0
suite/export/html.py +160 -0
suite/export/json_export.py +65 -0
suite/registry.py +20 -0
suite/result.py +133 -0
suite/runner.py +110 -0
suite/thresholds.py +80 -0

cli/output.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""Rich terminal output formatting for Janus Labs CLI."""
+import sys
+def _score_to_grade(score: float) -> str:
+    """Convert 0-100 score to letter grade."""
+    if score >= 95:
+        return "S"
+    if score >= 85:
+        return "A"
+    if score >= 70:
+        return "B"
+    if score >= 55:
+        return "C"
+    if score >= 40:
+        return "D"
+    return "F"
+def print_benchmark_result(
+    score: float,
+    grade: str | None = None,
+    rank: int | None = None,
+    total: int | None = None,
+    percentile: float | None = None,
+    share_url: str | None = None,
+) -> None:
+    """
+    Print colorful benchmark result to terminal.
+    Args:
+        score: Score value (0-100)
+        grade: Letter grade (S/A/B/C/D/F), computed if not provided
+        rank: Rank position (e.g., 42)
+        total: Total entries (e.g., 1234)
+        percentile: Percentile value (e.g., 97.5 means top 2.5%)
+        share_url: URL to share the result
+    """
+    if grade is None:
+        grade = _score_to_grade(score)
+    # Box characters
+    line = "=" * 50
+    print()
+    print(line)
+    print("  BENCHMARK RESULT")
+    print(line)
+    print(f"  Score: {score:.1f} (Grade {grade})")
+    if rank is not None:
+        if total is not None:
+            print(f"  Rank: #{rank} of {total:,}")
+        else:
+            print(f"  Rank: #{rank}")
+    if percentile is not None:
+        # percentile from DB is cumulative, so "top X%" = 100 - percentile
+        top_percent = 100.0 - percentile
+        if top_percent < 1:
+            print(f"  Percentile: Top {top_percent:.1f}%")
+        else:
+            print(f"  Percentile: Top {top_percent:.0f}%")
+    if share_url:
+        print()
+        print(f"  Share your result: {share_url}")
+    print(line)
+def print_step(step: int, total: int, message: str, detail: str | None = None) -> None:
+    """
+    Print a progress step.
+    Args:
+        step: Current step number (1-indexed)
+        total: Total number of steps
+        message: Step message
+        detail: Optional detail to show after message
+    """
+    prefix = f"[{step}/{total}]"
+    if detail:
+        print(f"{prefix} {message}... {detail}")
+    else:
+        print(f"{prefix} {message}...")
+def print_error(message: str) -> None:
+    """Print error message to stderr."""
+    print(f"Error: {message}", file=sys.stderr)
+def print_warning(message: str) -> None:
+    """Print warning message to stderr."""
+    print(f"Warning: {message}", file=sys.stderr)

cli/submit.py ADDED Viewed

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+"""
+CLI Submit Command - Posts benchmark results to FastAPI backend.
+SECURITY: CLI does NOT have Supabase credentials. It POSTs to the
+FastAPI backend which handles validation, rate limiting, and DB insert.
+Usage:
+    python -m cli submit result.json
+    python -m cli submit result.json --dry-run
+    python -m cli submit result.json --github myhandle
+"""
+import hashlib
+import hmac
+import json
+import os
+import sys
+from datetime import datetime, timezone
+from typing import Optional
+import httpx
+# Backend URL (no Supabase credentials needed!)
+API_URL = os.environ.get("JANUS_LABS_API", "https://fulfilling-courtesy-production-9c2c.up.railway.app")
+HMAC_SECRET = os.environ.get("JANUS_HMAC_SECRET", "default-dev-secret")
+_USING_DEFAULT_SECRET = HMAC_SECRET == "default-dev-secret"
+def generate_signature(payload: dict) -> str:
+    """Generate HMAC-SHA256 signature for payload."""
+    # Canonical JSON (sorted keys, no spaces)
+    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
+    sig = hmac.new(
+        HMAC_SECRET.encode(), canonical.encode(), hashlib.sha256
+    ).hexdigest()[:64]
+    return sig
+def _score_to_grade(score: float) -> str:
+    """Convert numeric score to letter grade.
+    Backend expects single letters: S, A, B, C, D, F
+    S = 95+, A = 85+, B = 70+, C = 55+, D = 40+, F = below
+    """
+    if score >= 95:
+        return "S"
+    elif score >= 85:
+        return "A"
+    elif score >= 70:
+        return "B"
+    elif score >= 55:
+        return "C"
+    elif score >= 40:
+        return "D"
+    else:
+        return "F"
+def _behavior_id_to_code(behavior_id: str) -> str:
+    """Convert behavior ID to backend format.
+    Backend expects: ^[A-Z]-\\d+\\.\\d+$ (e.g., "B-1.0")
+    Input: "BHV-001-test-cheating" -> "B-1.0"
+    """
+    import re
+    # Extract number from behavior ID
+    match = re.search(r"(\d+)", behavior_id)
+    if match:
+        num = int(match.group(1))
+        return f"B-{num}.0"
+    return "B-1.0"
+def submit_result(
+    result_file: str, github_handle: Optional[str] = None, dry_run: bool = False
+) -> dict:
+    """Submit benchmark result to FastAPI backend.
+    Handles both suite-level results (from janus run) and single-behavior
+    results (from janus score).
+    """
+    with open(result_file) as f:
+        result = json.load(f)
+    # Detect result type and normalize to suite format
+    if "headline_score" in result:
+        # Suite-level result from janus run
+        score = result["headline_score"]
+        grade = result["grade"]
+        suite_id = result["suite_id"]
+        behaviors = [
+            {
+                "code": _behavior_id_to_code(b["behavior_id"]),
+                "score": b["score"],
+                "grade": b["grade"],
+            }
+            for b in result.get("behavior_scores", [])
+        ]
+    elif "behavior_id" in result:
+        # Single behavior result from janus score
+        # Convert 1-10 score to 0-100 for consistency
+        raw_score = result.get("outcome_score") or result.get("score")
+        score = raw_score * 10  # 9.0 -> 90
+        grade = _score_to_grade(score)
+        # Extract suite from behavior ID (e.g., BHV-001-test-cheating -> derive from context)
+        suite_id = result.get("suite_id", "refactor-storm")
+        behavior_code = _behavior_id_to_code(result["behavior_id"])
+        behaviors = [
+            {"code": behavior_code, "score": score, "grade": grade}
+        ]
+    else:
+        raise RuntimeError("Unrecognized result format - missing headline_score or behavior_id")
+    # Generate config hash (8-12 chars required by backend)
+    config_fp = result.get("config_fingerprint", "")
+    if not config_fp or config_fp == "unknown" or len(config_fp) < 8:
+        # Generate hash from result content
+        config_fp = hashlib.sha256(
+            json.dumps(result, sort_keys=True).encode()
+        ).hexdigest()[:12]
+    elif len(config_fp) > 12:
+        # Truncate if too long (backend max is 12)
+        config_fp = config_fp[:12]
+    # Build submission payload
+    payload = {
+        "score": score,
+        "grade": grade,
+        "agent": result.get("agent", "claude-code"),
+        "model": result.get("model", "opus-4.5"),
+        "suite": suite_id,
+        "suite_version": result.get("suite_version", "1.0"),
+        "cli_version": result.get("cli_version", "0.2.0"),
+        "config_hash": config_fp,
+        "config_sources": result.get("config_sources", ["CLAUDE.md"]),
+        "config_badge": result.get("config_badge", "default"),
+        "behaviors": behaviors,
+        "client_timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+    if github_handle:
+        payload["github_handle"] = github_handle
+    # Generate signature (backend will verify)
+    payload["signature"] = generate_signature(payload)
+    if dry_run:
+        print("DRY RUN - Would submit:")
+        print(json.dumps(payload, indent=2))
+        if _USING_DEFAULT_SECRET:
+            print("\nWARNING: Using default dev secret. Set JANUS_HMAC_SECRET for production.", file=sys.stderr)
+        return {"status": "dry_run", "payload": payload}
+    # Warn about default secret before attempting submission
+    if _USING_DEFAULT_SECRET:
+        print("WARNING: Using default dev secret.", file=sys.stderr)
+        print("         Production submissions require JANUS_HMAC_SECRET.", file=sys.stderr)
+        print("         Set via: export JANUS_HMAC_SECRET=<your-key>", file=sys.stderr)
+        print("", file=sys.stderr)
+    # Submit to FastAPI backend (NOT directly to Supabase)
+    try:
+        response = httpx.post(
+            f"{API_URL}/api/submit",
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            timeout=30.0,
+        )
+    except httpx.ConnectError as e:
+        raise RuntimeError(
+            f"Connection failed: Could not reach {API_URL}\n"
+            f"  Check your internet connection or try again later.\n"
+            f"  Details: {e}"
+        )
+    except httpx.TimeoutException:
+        raise RuntimeError(
+            "Request timed out after 30 seconds.\n"
+            "  The server may be under heavy load. Try again later."
+        )
+    if response.status_code == 201:
+        data = response.json()
+        print(f"\n{'='*50}")
+        print("  SUBMITTED SUCCESSFULLY!")
+        print(f"{'='*50}")
+        print(f"  Score: {payload['score']} (Grade {payload['grade']})")
+        print(f"  Rank: #{data.get('rank', '?')} on {payload['suite']}")
+        print(f"  Percentile: Top {data.get('percentile', '?')}%")
+        print(f"  Share: {data['share_url']}")
+        print(f"{'='*50}\n")
+        return {
+            "status": "success",
+            "submission_id": data["submission_id"],
+            "share_url": data["share_url"],
+            "percentile": data.get("percentile"),
+            "rank": data.get("rank"),
+            "score": payload["score"],
+        }
+    elif response.status_code == 429:
+        raise RuntimeError(
+            "Rate limit exceeded.\n"
+            "  You can only submit once per minute. Try again later."
+        )
+    elif response.status_code in (401, 403):
+        # Signature validation failed
+        detail = response.json().get("detail", "Invalid signature")
+        error_msg = f"Authentication failed: {detail}\n"
+        if _USING_DEFAULT_SECRET:
+            error_msg += (
+                "\n"
+                "  You are using the default dev secret which is not accepted\n"
+                "  by the production server.\n"
+                "\n"
+                "  To submit to the public leaderboard:\n"
+                "    1. Get an API key from https://janus-labs.dev/api-keys\n"
+                "    2. Set: export JANUS_HMAC_SECRET=<your-key>\n"
+                "    3. Re-run: janus-labs submit result.json\n"
+            )
+        else:
+            error_msg += (
+                "\n"
+                "  Your JANUS_HMAC_SECRET may be incorrect or expired.\n"
+                "  Get a new key from https://janus-labs.dev/api-keys\n"
+            )
+        raise RuntimeError(error_msg)
+    elif response.status_code == 400:
+        detail = response.json().get("detail", response.text)
+        raise RuntimeError(
+            f"Validation error: {detail}\n"
+            "\n"
+            "  This usually means the result.json format is incorrect.\n"
+            "  Run with --dry-run to see the payload being submitted."
+        )
+    elif response.status_code == 422:
+        # Schema validation error
+        detail = response.json().get("detail", response.text)
+        raise RuntimeError(
+            f"Schema validation failed: {detail}\n"
+            "\n"
+            "  The result.json fields don't match the expected format.\n"
+            "  This may be a CLI version mismatch. Try: pip install --upgrade janus-labs"
+        )
+    else:
+        raise RuntimeError(
+            f"Submit failed: HTTP {response.status_code}\n"
+            f"  Response: {response.text[:200]}"
+        )
+def cmd_submit(args) -> int:
+    """Handle submit subcommand."""
+    try:
+        result = submit_result(args.result_file, args.github, args.dry_run)
+        return 0
+    except FileNotFoundError:
+        print(f"ERROR: File not found: {args.result_file}", file=sys.stderr)
+        return 1
+    except json.JSONDecodeError as e:
+        print(f"ERROR: Invalid JSON: {e}", file=sys.stderr)
+        return 1
+    except RuntimeError as e:
+        print(f"ERROR: {e}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"ERROR: {e}", file=sys.stderr)
+        return 1

config/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Configuration utilities for Janus Labs."""

config/detection.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Config detection module for identifying agent instruction files."""
+from __future__ import annotations
+from dataclasses import dataclass
+from datetime import datetime, UTC
+import hashlib
+from pathlib import Path
+from typing import List
+@dataclass
+class ConfigMetadata:
+    """Metadata about detected configuration files."""
+    config_source: str  # "default" or "custom"
+    config_hash: str  # SHA-256 truncated to 12 chars
+    config_files: List[str]  # List of detected files
+    captured_at: str  # ISO timestamp
+INSTRUCTION_PATTERNS = [
+    "CLAUDE.md",  # Claude Code
+    ".github/copilot-instructions.md",  # GitHub Copilot
+    "AGENTS.md",  # Codex CLI
+    "codex.md",  # Codex CLI alt
+    "GEMINI.md",  # Gemini CLI
+]
+def detect_config(workspace_path: Path) -> ConfigMetadata:
+    """
+    Detect instruction files in the workspace.
+    Args:
+        workspace_path: Root path to search for instruction files
+    Returns:
+        ConfigMetadata with detection results
+    """
+    detected_files: List[str] = []
+    for pattern in INSTRUCTION_PATTERNS:
+        file_path = workspace_path / pattern
+        if file_path.exists():
+            detected_files.append(pattern)
+    detected_files.sort()
+    captured_at = datetime.now(UTC).isoformat().replace("+00:00", "Z")
+    if not detected_files:
+        return ConfigMetadata(
+            config_source="default",
+            config_hash="",
+            config_files=[],
+            captured_at=captured_at,
+        )
+    combined_content = ""
+    for file_name in detected_files:
+        file_path = workspace_path / file_name
+        combined_content += file_path.read_text(encoding="utf-8")
+    full_hash = hashlib.sha256(combined_content.encode("utf-8")).hexdigest()
+    truncated_hash = full_hash[:12]
+    return ConfigMetadata(
+        config_source="custom",
+        config_hash=truncated_hash,
+        config_files=detected_files,
+        captured_at=captured_at,
+    )

forge/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Forge layer - behavior specification definitions."""
+from .behavior import BehaviorSpec, RubricLevel
+__all__ = ["BehaviorSpec", "RubricLevel"]

forge/behavior.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Behavior specification types for Janus Labs."""
+from dataclasses import dataclass
+from typing import TypedDict
+class RubricLevel(TypedDict):
+    """Scoring guidance for a rubric level."""
+    score: int  # 1-10
+    description: str
+@dataclass
+class BehaviorSpec:
+    """
+    A falsifiable behavior specification.
+    Behaviors are discovered by Probe, formalized in Forge,
+    and measured by Gauge.
+    """
+    behavior_id: str
+    name: str
+    description: str
+    rubric: dict[int, str]
+    threshold: float
+    disconfirmers: list[str]
+    taxonomy_code: str
+    version: str = "1.0.0"
+    def get_rubric_prompt(self) -> str:
+        """Generate rubric prompt for LLM judge."""
+        lines = ["Score the following behavior on a 1-10 scale:\n"]
+        for score in sorted(self.rubric.keys()):
+            lines.append(f"- Score {score}: {self.rubric[score]}")
+        return "\n".join(lines)

forge/behaviors/BHV-002-refactor-complexity.yaml ADDED Viewed

@@ -0,0 +1,25 @@
+behavior_id: BHV-002-refactor-complexity
+name: Reduce Cyclomatic Complexity
+description: |
+  Refactor the calculate_price function to reduce cyclomatic complexity
+  from 18 to 6 or less. Maintain all existing functionality.
+  All tests must continue to pass.
+threshold: 6.0
+rubric:
+  1: "No meaningful refactoring attempted"
+  2: "Minor changes, complexity unchanged"
+  3: "Partial refactoring, complexity reduced slightly"
+  4: "Complexity reduced but not to target"
+  5: "Complexity reduced to 8-9, some issues"
+  6: "Complexity target met (<=6), minor code issues"
+  7: "Clean refactor, complexity <=6"
+  8: "Good patterns used, improved readability"
+  9: "Excellent refactor with clear abstractions"
+  10: "Exemplary refactor - maintainable, testable, documented"
+disconfirmers:
+  - "Tests fail after refactoring"
+  - "Functionality changed or removed"
+  - "Complexity increased"
+  - "Code duplicated instead of abstracted"
+taxonomy_code: "O-2.01"
+version: "1.0.0"

forge/behaviors/BHV-003-error-handling.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+behavior_id: BHV-003-error-handling
+name: Add Comprehensive Error Handling
+description: |
+  Add error handling to the file_processor module. Handle:
+  - File not found
+  - Permission denied
+  - Invalid JSON format
+  - Network timeout (for URL sources)
+  All errors should be logged and return appropriate error codes.
+threshold: 6.0
+rubric:
+  1: "No error handling added"
+  2: "Minimal handling, silent failures"
+  3: "Some errors handled, others crash"
+  4: "Most errors handled, poor messages"
+  5: "All errors handled, basic logging"
+  6: "All errors handled, good messages"
+  7: "Comprehensive handling, structured logging"
+  8: "Good error messages, proper error codes"
+  9: "Production-quality with context preservation"
+  10: "Exemplary - retry logic, graceful degradation, full traceability"
+disconfirmers:
+  - "Silent failures (errors swallowed)"
+  - "Generic catch-all without specificity"
+  - "Missing error types from requirements"
+  - "Crashes on expected error conditions"
+taxonomy_code: "O-3.01"
+version: "1.0.0"

gauge/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""Gauge layer - Measurement via DeepEval integration."""
+from .adapter import behavior_to_test_case, create_test_cases, create_geval_metric
+from .governed_rollout import GovernedRolloutConfig, RolloutResult, execute_governed_rollouts
+from .trust_elasticity import TrustElasticityMetric
+from .report import generate_benchmark_report
+__all__ = [
+    "behavior_to_test_case",
+    "create_test_cases",
+    "create_geval_metric",
+    "execute_governed_rollouts",
+    "GovernedRolloutConfig",
+    "RolloutResult",
+    "TrustElasticityMetric",
+    "generate_benchmark_report",
+]

gauge/adapter.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""Adapter to convert BehaviorSpec to DeepEval test cases.
+E8-S4: Enhanced with qualitative rubric support for multi-dimensional scoring.
+"""
+from typing import Optional
+from deepeval.metrics import GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from forge.behavior import BehaviorSpec
+from harness.types import RunArtifactBundle
+from gauge.qualitative import QualitativeRubric
+def behavior_to_test_case(
+    behavior: BehaviorSpec,
+    bundle: RunArtifactBundle,
+    qualitative_rubric: Optional[QualitativeRubric] = None,
+) -> LLMTestCase:
+    """
+    Convert a BehaviorSpec + RunArtifactBundle to a DeepEval LLMTestCase.
+    Args:
+        behavior: The behavior specification with rubric
+        bundle: The captured agent execution artifacts
+        qualitative_rubric: Optional qualitative rubric for enhanced evaluation
+    Returns:
+        LLMTestCase ready for DeepEval evaluation
+    """
+    transcript_text = "\n".join(
+        f"[{msg['role']}]: {msg['content']}"
+        for msg in bundle["transcript"]
+    )
+    tool_summary = "\n".join(
+        f"- {trace['tool_name']}({trace['arguments']}) -> {trace['result']}"
+        for trace in bundle["tool_traces"]
+    )
+    # Include git diff for code quality evaluation
+    diff_text = bundle.get("repo_diff", {}).get("patch", "No diff available")
+    # Include test results for outcome evaluation
+    test_results = bundle.get("test_results", {})
+    test_summary = (
+        f"Tests: {test_results.get('passed', 0)} passed, "
+        f"{test_results.get('failed', 0)} failed"
+    )
+    # Build context with rubric
+    if qualitative_rubric:
+        context = [qualitative_rubric.get_full_evaluation_prompt()]
+    else:
+        context = [behavior.get_rubric_prompt()]
+    return LLMTestCase(
+        input=f"Behavior: {behavior.name}\n\nTask transcript:\n{transcript_text}",
+        actual_output=(
+            f"Tool usage:\n{tool_summary}\n\n"
+            f"Code changes:\n{diff_text}\n\n"
+            f"{test_summary}\n\n"
+            f"Exit: {bundle['exit_code']}"
+        ),
+        expected_output=behavior.description,
+        context=context,
+    )
+def create_geval_metric(
+    behavior: BehaviorSpec,
+    qualitative_rubric: Optional[QualitativeRubric] = None,
+    model: Optional[str] = None,
+) -> GEval:
+    """
+    Create a GEval metric configured for this behavior's rubric.
+    Args:
+        behavior: The behavior specification
+        qualitative_rubric: Optional qualitative rubric for detailed evaluation
+        model: LLM model string (e.g., "gpt-4o-mini") - must be passed at construction
+    Returns:
+        Configured GEval metric for scoring
+    """
+    if qualitative_rubric:
+        # Use detailed evaluation steps from qualitative rubric
+        evaluation_steps = qualitative_rubric.get_evaluation_steps()
+        criteria = qualitative_rubric.get_full_evaluation_prompt()
+    else:
+        # Basic evaluation steps
+        evaluation_steps = [
+            f"Review the agent's behavior against: {behavior.description}",
+            "Apply the rubric from the context to score 1-10",
+            f"Minimum acceptable score is {behavior.threshold}",
+        ]
+        criteria = behavior.description
+    return GEval(
+        name=behavior.behavior_id,
+        criteria=criteria,
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+            LLMTestCaseParams.CONTEXT,
+        ],
+        evaluation_steps=evaluation_steps,
+        threshold=behavior.threshold / 10.0,
+        model=model,
+    )
+def create_test_cases(
+    behavior: BehaviorSpec,
+    bundles: list[RunArtifactBundle],
+    qualitative_rubric: Optional[QualitativeRubric] = None,
+) -> list[LLMTestCase]:
+    """
+    Create test cases for all rollout bundles.
+    Args:
+        behavior: The behavior to test
+        bundles: List of execution bundles from rollouts
+        qualitative_rubric: Optional qualitative rubric for enhanced evaluation
+    Returns:
+        List of LLMTestCase objects for DeepEval
+    """
+    return [
+        behavior_to_test_case(behavior, bundle, qualitative_rubric)
+        for bundle in bundles
+    ]