hud-python 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +2 -1
- hud/agents/base.py +81 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +66 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +567 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +347 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/comparator.py +6 -6
- hud/native/tests/test_comparator.py +8 -8
- hud/native/tests/test_native_init.py +13 -11
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +30 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +589 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/METADATA +20 -2
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/RECORD +68 -48
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/WHEEL +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/job.py
CHANGED

@@ -89,6 +89,46 @@ class Job:
         except Exception as e:
             logger.warning("Failed to update job status: %s", e)

+    async def log(self, metrics: dict[str, Any]) -> None:
+        """Log metrics to the job.
+
+        Args:
+            metrics: Dictionary of metric name to value pairs
+
+        Example:
+            await job.log({"loss": 0.5, "accuracy": 0.95, "epoch": 1})
+        """
+        if settings.telemetry_enabled:
+            try:
+                await make_request(
+                    method="POST",
+                    url=f"{settings.hud_telemetry_url}/jobs/{self.id}/log",
+                    json={"metrics": metrics, "timestamp": datetime.now(UTC).isoformat()},
+                    api_key=settings.api_key,
+                )
+            except Exception as e:
+                logger.warning("Failed to log metrics to job: %s", e)
+
+    def log_sync(self, metrics: dict[str, Any]) -> None:
+        """Synchronously log metrics to the job.
+
+        Args:
+            metrics: Dictionary of metric name to value pairs
+
+        Example:
+            job.log_sync({"loss": 0.5, "accuracy": 0.95, "epoch": 1})
+        """
+        if settings.telemetry_enabled:
+            try:
+                make_request_sync(
+                    method="POST",
+                    url=f"{settings.hud_telemetry_url}/jobs/{self.id}/log",
+                    json={"metrics": metrics, "timestamp": datetime.now(UTC).isoformat()},
+                    api_key=settings.api_key,
+                )
+            except Exception as e:
+                logger.warning("Failed to log metrics to job: %s", e)
+
     def __repr__(self) -> str:
         return f"Job(id={self.id!r}, name={self.name!r}, status={self.status!r})"

@@ -225,7 +265,10 @@ def job(


 def create_job(
-    name: str,
+    name: str,
+    metadata: dict[str, Any] | None = None,
+    dataset_link: str | None = None,
+    job_id: str | None = None,
 ) -> Job:
     """Create a job without using context manager.

@@ -235,7 +278,7 @@ def create_job(
         name: Human-readable job name
         metadata: Optional metadata dictionary
         dataset_link: Optional HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50")
-
+        job_id: Optional job ID (auto-generated if not provided)
     Returns:
         Job: The created job object

@@ -248,7 +291,7 @@ def create_job(
        finally:
            await job.update_status("completed")
    """
-    job_id = str(uuid.uuid4())
+    job_id = job_id or str(uuid.uuid4())
     return Job(job_id, name, metadata, dataset_link)

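Taken together, jobs can now carry run-level metrics in addition to status updates. A minimal sketch of the new surface, assuming `create_job` is imported from `hud.telemetry.job` (argument values are illustrative):

    from hud.telemetry.job import create_job

    # job_id is optional; as before, a UUID is generated when it is omitted.
    job = create_job("example-training-run", metadata={"notes": "illustrative"})

    # Metrics are POSTed to /jobs/{id}/log when telemetry is enabled; failures only warn.
    job.log_sync({"loss": 0.5, "accuracy": 0.95, "epoch": 1})

    # Inside async code, the awaitable variant hits the same endpoint:
    #     await job.log({"loss": 0.4, "epoch": 2})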
hud/telemetry/tests/test_trace.py
CHANGED

@@ -23,7 +23,7 @@ class TestTraceAPI:

         with trace("test-trace") as task_run_id:
             # Should use placeholder ID for custom backends
-            assert task_run_id == "custom-otlp-trace"
+            assert task_run_id.id == "custom-otlp-trace"

     def test_trace_with_enabled_telemetry_and_api_key(self):
         """Test trace behavior when telemetry is enabled with API key."""
@@ -39,7 +39,7 @@ class TestTraceAPI:

         with trace("test-trace") as task_run_id:
             # Should use generated UUID
-            assert task_run_id == "mock-uuid-123"
+            assert task_run_id.id == "mock-uuid-123"

     def test_trace_with_no_api_key(self):
         """Test trace behavior with no API key (custom backend scenario)."""
@@ -60,4 +60,4 @@ class TestTraceAPI:

         with trace("test-trace") as task_run_id:
             # Should use custom backend placeholder
-            assert task_run_id == "custom-otlp-trace"
+            assert task_run_id.id == "custom-otlp-trace"
hud/telemetry/trace.py
CHANGED

@@ -6,17 +6,83 @@ The actual OpenTelemetry implementation is in hud.otel.

 from __future__ import annotations

+import logging
 import uuid
 from contextlib import contextmanager
+from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any

 from hud.otel import configure_telemetry
 from hud.otel import trace as OtelTrace
+from hud.settings import settings
+from hud.shared import make_request, make_request_sync

 if TYPE_CHECKING:
     from collections.abc import Generator

-
+logger = logging.getLogger(__name__)
+
+__all__ = ["Trace", "trace"]
+
+
+class Trace:
+    """A trace represents a single task execution with telemetry."""
+
+    def __init__(
+        self,
+        trace_id: str,
+        name: str,
+        job_id: str | None = None,
+        task_id: str | None = None,
+    ) -> None:
+        self.id = trace_id
+        self.name = name
+        self.job_id = job_id
+        self.task_id = task_id
+        self.created_at = datetime.now(UTC)
+
+    async def log(self, metrics: dict[str, Any]) -> None:
+        """Log metrics to this trace.
+
+        Args:
+            metrics: Dictionary of metric name to value pairs
+
+        Example:
+            await trace.log({"step": 1, "loss": 0.5, "accuracy": 0.92})
+        """
+        if settings.telemetry_enabled:
+            try:
+                await make_request(
+                    method="POST",
+                    url=f"{settings.hud_telemetry_url}/traces/{self.id}/log",
+                    json={"metrics": metrics, "timestamp": datetime.now(UTC).isoformat()},
+                    api_key=settings.api_key,
+                )
+            except Exception as e:
+                logger.warning("Failed to log metrics to trace: %s", e)
+
+    def log_sync(self, metrics: dict[str, Any]) -> None:
+        """Synchronously log metrics to this trace.
+
+        Args:
+            metrics: Dictionary of metric name to value pairs
+
+        Example:
+            trace.log_sync({"step": 1, "loss": 0.5, "accuracy": 0.92})
+        """
+        if settings.telemetry_enabled:
+            try:
+                make_request_sync(
+                    method="POST",
+                    url=f"{settings.hud_telemetry_url}/traces/{self.id}/log",
+                    json={"metrics": metrics, "timestamp": datetime.now(UTC).isoformat()},
+                    api_key=settings.api_key,
+                )
+            except Exception as e:
+                logger.warning("Failed to log metrics to trace: %s", e)
+
+    def __repr__(self) -> str:
+        return f"Trace(id={self.id!r}, name={self.name!r})"


 @contextmanager
@@ -27,7 +93,7 @@ def trace(
     attrs: dict[str, Any] | None = None,
     job_id: str | None = None,
     task_id: str | None = None,
-) -> Generator[
+) -> Generator[Trace, None, None]:
     """Start a HUD trace context.

     A unique task_run_id is automatically generated for each trace.
@@ -37,24 +103,27 @@ def trace(
         root: Whether this is a root trace (updates task status)
         attrs: Additional attributes to attach to the trace
         job_id: Optional job ID to associate with this trace
+        task_id: Optional task ID (for custom task identifiers)

     Yields:
-
+        Trace: The trace object with logging capabilities

     Usage:
         import hud

-
+        # Basic usage
+        with hud.trace("My Task") as trace:
            # Your code here
-
+            trace.log_sync({"step": 1, "progress": 0.5})

-        #
-        with hud.trace() as
-
+        # Async logging
+        async with hud.trace("Async Task") as trace:
+            await trace.log({"loss": 0.23, "accuracy": 0.95})

-        #
-        with hud.
-
+        # With job association
+        with hud.job("Training Run") as job:
+            with hud.trace("Epoch 1", job_id=job.id) as trace:
+                trace.log_sync({"epoch": 1, "loss": 0.5})
     """
     # Ensure telemetry is configured
     configure_telemetry()
@@ -71,6 +140,9 @@ def trace(
         # Use a placeholder for custom backends
         task_run_id = "custom-otlp-trace"

+    # Create trace object
+    trace_obj = Trace(task_run_id, name, job_id, task_id)
+
     # Delegate to OpenTelemetry implementation
     with OtelTrace(
         task_run_id,
@@ -79,5 +151,5 @@
         attributes=attrs or {},
         job_id=job_id,
         task_id=task_id,
-    )
-    yield
+    ):
+        yield trace_obj
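The user-visible change is that `hud.trace(...)` now yields a `Trace` object rather than a bare run-id string, which is why the `test_trace.py` assertions above switch to `.id`. A minimal sketch of the new shape (metric names and values are illustrative):

    import hud

    with hud.trace("Epoch 1") as tr:
        run_id = tr.id  # previously the yielded value itself was the id string
        tr.log_sync({"step": 1, "loss": 0.5})  # POSTs to /traces/{id}/log when telemetry is enabled
        # inside async code: await tr.log({"loss": 0.23, "accuracy": 0.95})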
hud/tools/tests/test_computer.py
CHANGED

@@ -151,7 +151,7 @@ class TestHudComputerToolExtended:
     async def test_type_action(self, base_executor):
         """Test type action with BaseExecutor."""
         tool = HudComputerTool(executor=base_executor)
-        result = await tool(action="
+        result = await tool(action="write", text="Hello World", enter_after=True)
         assert result
         assert any(
             "[SIMULATED] Type" in content.text
@@ -329,7 +329,7 @@ class TestHudComputerToolExtended:
         assert result

         # Test type without coordinates
-        result = await tool(action="
+        result = await tool(action="write", text="test")
         assert result

     @pytest.mark.asyncio
@@ -360,7 +360,7 @@ class TestHudComputerToolExtended:
         from hud.tools.types import ToolError

         with pytest.raises(ToolError, match="text parameter is required"):
-            await tool(action="
+            await tool(action="write", text=None)

         # Test press without coordinates
         with pytest.raises(ToolError, match="keys parameter is required"):
hud/tools/tests/test_computer_actions.py
CHANGED

@@ -12,7 +12,7 @@ CASES = [
     ("press", {"keys": ["ctrl", "c"]}),
     ("keydown", {"keys": ["shift"]}),
     ("keyup", {"keys": ["shift"]}),
-    ("
+    ("write", {"text": "hello"}),
     ("scroll", {"x": 10, "y": 10, "scroll_y": 20}),  # Added required x,y coordinates
     # Skip move test - it has Field parameter handling issues when called directly
     # ("move", {"x": 5, "y": 5}),  # x,y are for absolute positioning
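Both test updates are mechanical switches to the `write` action. A minimal sketch of the call shape they exercise; the import path and executor wiring here are assumptions, only the keyword arguments come from the hunks above:

    from hud.tools import HudComputerTool  # assumed export location

    async def type_hello(executor) -> None:
        # executor: any executor accepted by HudComputerTool (the tests use a BaseExecutor fixture)
        tool = HudComputerTool(executor=executor)
        result = await tool(action="write", text="Hello World", enter_after=True)
        assert result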
hud/types.py
CHANGED

@@ -1,12 +1,120 @@
 from __future__ import annotations

 import json
+import logging
 import uuid
+from collections import defaultdict
+from string import Template
 from typing import Any, Literal

 import mcp.types as types
 from mcp.types import CallToolRequestParams, CallToolResult
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+from hud.settings import settings
+
+logger = logging.getLogger(__name__)
+
+
+class Task(BaseModel):
+    """
+    A task configuration that can be used to create a task.
+
+    The mcp_config field supports environment variable substitution using
+    template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
+
+    Example:
+        mcp_config: {
+            "hud": {
+                "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
+                "headers": {
+                    "Authorization": "Bearer ${HUD_API_KEY}",
+                    "Mcp-Image": "your-mcp-image"
+                }
+            }
+        }
+    """
+
+    id: str | None = None
+    prompt: str
+    mcp_config: dict[str, Any]
+    setup_tool: MCPToolCall | list[MCPToolCall] | None = None
+    evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
+    agent_tools: list[str] | None = None
+    system_prompt: str | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("mcp_config", "metadata", mode="before")
+    @classmethod
+    def parse_json_strings(cls, v: Any) -> Any:
+        """Parse JSON strings into dictionaries."""
+        if isinstance(v, str):
+            try:
+                return json.loads(v)
+            except json.JSONDecodeError as e:
+                from hud.shared.exceptions import HudConfigError
+
+                raise HudConfigError(f"Invalid JSON string: {e}") from e
+        return v
+
+    @field_validator("setup_tool", "evaluate_tool", mode="before")
+    @classmethod
+    def convert_dict_to_tool_call(cls, v: Any) -> Any:
+        """Convert dict to MCPToolCall instance, parsing JSON strings first."""
+        if v is None:
+            return None
+
+        # Parse JSON string if needed
+        if isinstance(v, str):
+            try:
+                v = json.loads(v)
+            except json.JSONDecodeError as e:
+                from hud.shared.exceptions import HudConfigError
+
+                raise HudConfigError(f"Invalid JSON string: {e}") from e
+
+        if isinstance(v, dict):
+            return MCPToolCall(**v)
+        if isinstance(v, list):
+            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
+        return v
+
+    @field_validator("mcp_config", mode="before")
+    @classmethod
+    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
+        """
+        Automatically resolve environment variables in mcp_config using Template.
+
+        Supports ${VAR_NAME} syntax with variable substitution from
+        system environment variables (including HUD_API_KEY, etc.)
+
+        Missing variables resolve to empty strings.
+        """
+        import os
+
+        # Start with current environment variables
+        mapping = dict(os.environ)
+        mapping.update(settings.model_dump())
+
+        if settings.api_key:
+            mapping["HUD_API_KEY"] = settings.api_key
+        else:
+            logger.error("HUD_API_KEY is not set, tracing and remote training will not work")
+
+        def substitute_in_value(obj: Any) -> Any:
+            """Recursively substitute variables in nested structures."""
+            if isinstance(obj, str):
+                # Use Template's substitute with defaultdict - missing vars become empty strings
+                safe_mapping = defaultdict(str, mapping)
+                return Template(obj).substitute(safe_mapping)
+            elif isinstance(obj, dict):
+                return {k: substitute_in_value(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [substitute_in_value(item) for item in obj]
+            else:
+                return obj
+
+        return substitute_in_value(v)


 class MCPToolCall(CallToolRequestParams):
@@ -150,12 +258,25 @@ class Trace(BaseModel):
     - trace: The steps taken in the run (empty if not tracing)
     """

-    done: bool = Field(default=True)
     reward: float = Field(default=0.0)
+    done: bool = Field(default=True)
     info: dict[str, Any] = Field(default_factory=dict)
     content: str | None = Field(default=None)
     isError: bool = Field(default=False)
+
+    # Metadata
+    task: Task | None = Field(default=None)
+
+    # Trace
     trace: list[TraceStep] = Field(default_factory=list)
+    messages: list[Any] = Field(default_factory=list)
+
+    def __len__(self) -> int:
+        return len(self.trace)
+
+    @property
+    def num_messages(self) -> int:
+        return len(self.messages)

     def append(self, step: TraceStep) -> None:
         self.trace.append(step)
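A minimal sketch of the environment-variable substitution the new `Task` model performs on `mcp_config` (URL and variable values are illustrative; per the validator above, unset variables resolve to empty strings):

    import os

    from hud.types import Task  # Task now lives in hud/types.py per the hunk above

    os.environ["HUD_MCP_URL"] = "https://mcp.hud.so/v3/mcp"  # illustrative value

    task = Task(
        prompt="Sum column B of the spreadsheet",
        mcp_config={
            "hud": {
                "url": "${HUD_MCP_URL}",
                "headers": {"Authorization": "Bearer ${HUD_API_KEY}"},  # taken from settings/env
            }
        },
    )

    print(task.mcp_config["hud"]["url"])  # -> https://mcp.hud.so/v3/mcp
    # An unset ${SOME_OTHER_VAR} would be replaced with "" rather than raising.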
hud/utils/group_eval.py
ADDED

@@ -0,0 +1,223 @@
"""Utilities for grouped evaluation of tasks, following the RL pattern."""

from __future__ import annotations

import asyncio
from statistics import mean, stdev
from typing import Any

import numpy as np

import hud
from hud.datasets import Task
from hud.types import Trace
from hud.utils.hud_console import HUDConsole

hud_console = HUDConsole()


async def run_tasks_grouped(
    tasks: list[Any],
    agent_class: type | Any,
    agent_config: dict[str, Any] | None = None,
    group_size: int = 1,
    max_parallel_episodes: int = 48,
    max_steps: int = 10,
    verbose: bool = False,
    job_id: str | None = None,
) -> list[dict[str, Any]]:
    """
    Run tasks with grouping, following the RL Actor pattern.

    Args:
        tasks: List of tasks to run
        agent_class: Agent class or instance to use
        agent_config: Configuration for agent instantiation
        group_size: Number of times to run each task
        max_parallel_episodes: Maximum parallel episodes to run
        max_steps: Maximum steps per episode
        verbose: Whether to show progress
        job_id: Optional job ID for tracking

    Returns:
        List of statistics for each task group
    """
    agent_config = agent_config or {}

    # Duplicate tasks according to group_size, exactly like RL
    grouped_tasks = []
    task_mapping = []  # Track which group each result belongs to

    for i, task in enumerate(tasks):
        for _ in range(group_size):
            grouped_tasks.append(task)
            task_mapping.append(i)

    hud_console.info(
        f"Running {len(tasks)} tasks with group_size={group_size} ({len(grouped_tasks)} total runs)"
    )

    # Run all episodes, respecting max_parallel_episodes
    all_traces = []

    for batch_start in range(0, len(grouped_tasks), max_parallel_episodes):
        batch_end = min(batch_start + max_parallel_episodes, len(grouped_tasks))
        batch = grouped_tasks[batch_start:batch_end]

        # Run batch in parallel
        async def run_single_episode(task_data: dict[str, Any] | Task, idx: int) -> Trace:
            """Run a single episode."""
            try:
                # Create task if needed
                task = Task(**task_data) if isinstance(task_data, dict) else task_data

                # Create fresh agent instance
                if isinstance(agent_class, type):
                    agent = agent_class(**agent_config)
                else:
                    # Agent is already instantiated
                    agent = agent_class

                # Run the task
                trace_name = f"Eval | {task.id if hasattr(task, 'id') else 'Task'} | Group {task_mapping[idx]}"  # noqa: E501
                with hud.trace(trace_name, job_id=job_id):
                    result = await agent.run(task, max_steps=max_steps)
                    return result

            except Exception as e:
                hud_console.warning_log(f"Episode failed: {e}")
                return Trace(isError=True, content=str(e), reward=0.0, done=True)

        # Run batch
        batch_results = await asyncio.gather(
            *[run_single_episode(t, batch_start + i) for i, t in enumerate(batch)],
            return_exceptions=True,
        )

        # Normalize exceptions to error traces
        for res in batch_results:
            if isinstance(res, Exception):
                hud_console.warning_log(f"Episode error: {res}")
                all_traces.append(Trace(isError=True, content=str(res), reward=0.0, done=True))
            else:
                all_traces.append(res)

        if verbose:
            hud_console.info(f"Completed batch: {len(all_traces)}/{len(grouped_tasks)} episodes")

    # Group results back by original task and calculate statistics
    return calculate_group_statistics(tasks, all_traces, task_mapping, group_size)


def calculate_group_statistics(
    original_tasks: list[Any],
    traces: list[Trace],
    task_mapping: list[int],
    group_size: int,
) -> list[dict[str, Any]]:
    """
    Calculate statistics for each group, similar to preprocess_advantages.

    Args:
        original_tasks: Original task list
        traces: All traces from grouped runs
        task_mapping: Mapping of trace index to task index
        group_size: Number of runs per task

    Returns:
        List of statistics for each task
    """
    stats = []

    # Process each original task
    for task_idx, task in enumerate(original_tasks):
        # Get all traces for this task
        task_traces = [
            traces[i] for i, mapping_idx in enumerate(task_mapping) if mapping_idx == task_idx
        ]

        # Extract rewards
        rewards = np.array([t.reward for t in task_traces])
        errors = [t for t in task_traces if t.isError]

        # Calculate statistics
        task_stats = {
            "task_id": task.id
            if isinstance(task, Task) and hasattr(task, "id")
            else f"task_{task_idx}",
            "prompt": task.prompt if isinstance(task, Task) else task.get("prompt", ""),
            "group_size": group_size,
            "rewards": rewards.tolist(),
            "mean_reward": float(np.mean(rewards)),
            "std_reward": float(np.std(rewards)) if len(rewards) > 1 else 0.0,
            "min_reward": float(np.min(rewards)),
            "max_reward": float(np.max(rewards)),
            "success_rate": float(np.sum(rewards > 0) / len(rewards)) if len(rewards) > 0 else 0.0,
            "error_rate": len(errors) / len(task_traces) if len(task_traces) > 0 else 0.0,
            "traces": task_traces,  # Keep full traces for detailed analysis
        }

        # Add variance info like RL does
        if task_stats["std_reward"] > 1e-6:
            task_stats["normalized_rewards"] = [
                (r - task_stats["mean_reward"]) / task_stats["std_reward"] for r in rewards
            ]
        else:
            task_stats["normalized_rewards"] = [0.0] * len(rewards)

        stats.append(task_stats)

    return stats


def display_group_statistics(stats: list[dict[str, Any]], show_details: bool = True) -> None:
    """Display statistics from grouped evaluation."""
    from rich.console import Console
    from rich.table import Table

    console = Console()

    # Overall statistics
    all_means = [s["mean_reward"] for s in stats]
    overall_mean = mean(all_means) if all_means else 0.0
    overall_std = stdev(all_means) if len(all_means) > 1 else 0.0

    hud_console.success("\n📊 Evaluation Summary")
    hud_console.info(f"Tasks evaluated: {len(stats)}")
    hud_console.info(f"Episodes per task: {stats[0]['group_size'] if stats else 0}")
    hud_console.info(f"Total episodes: {sum(len(s['rewards']) for s in stats)}")
    hud_console.info(f"Overall mean reward: {overall_mean:.3f} ± {overall_std:.3f}")

    # Detailed table
    if show_details and len(stats) <= 20:  # Only show for reasonable dataset sizes
        table = Table(title="\nPer-Task Performance Distribution")
        table.add_column("Task", style="cyan", no_wrap=True)
        table.add_column("Mean±Std", justify="right", style="green")
        table.add_column("Min/Max", justify="right")
        table.add_column("Success%", justify="right", style="yellow")
        table.add_column("Rewards", style="dim")

        for stat in stats:
            task_name = stat["prompt"][:30] + "..." if len(stat["prompt"]) > 30 else stat["prompt"]
            rewards_str = " ".join([f"{r:.2f}" for r in stat["rewards"][:5]])
            if len(stat["rewards"]) > 5:
                rewards_str += " ..."

            table.add_row(
                task_name,
                f"{stat['mean_reward']:.3f}±{stat['std_reward']:.3f}",
                f"{stat['min_reward']:.2f}/{stat['max_reward']:.2f}",
                f"{stat['success_rate'] * 100:.0f}%",
                rewards_str,
            )

        console.print(table)

    # High variance tasks
    high_variance_tasks = [s for s in stats if s["std_reward"] > 0.3 and s["group_size"] > 1]
    if high_variance_tasks:
        hud_console.warning(f"\n⚠️ {len(high_variance_tasks)} tasks show high variance (std > 0.3)")
        for task in high_variance_tasks[:3]:
            hud_console.info(
                f"  • {task['task_id']}: μ={task['mean_reward']:.3f}, σ={task['std_reward']:.3f}"  # noqa: RUF001
            )