PyPI - hud-python - Versions diffs - 0.5.8__tar.gz → 0.5.18__tar.gz - Mend

hud-python 0.5.8__tar.gz → 0.5.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (362) hide show

{hud_python-0.5.8 → hud_python-0.5.18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.5.8
+Version: 0.5.18
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -91,7 +91,7 @@ Requires-Dist: pyright==1.1.407; extra == 'dev'
 Requires-Dist: pytest-asyncio; extra == 'dev'
 Requires-Dist: pytest-cov; extra == 'dev'
 Requires-Dist: pytest-mock; extra == 'dev'
-Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
+Requires-Dist: pytest>=8.1.1; extra == 'dev'
 Requires-Dist: ruff>=0.11.8; extra == 'dev'
 Requires-Dist: tornado>=6.5.2; extra == 'dev'
 Description-Content-Type: text/markdown

{hud_python-0.5.8 → hud_python-0.5.18}/hud/agents/__init__.py RENAMED Viewed

@@ -2,12 +2,13 @@ from __future__ import annotations
 from typing import Any
-from .base import MCPAgent
+from .base import CategorizedTools, MCPAgent
 from .openai import OpenAIAgent
 from .openai_chat import OpenAIChatAgent
 from .operator import OperatorAgent
 __all__ = [
+    "CategorizedTools",
     "MCPAgent",
     "OpenAIAgent",
     "OpenAIChatAgent",
@@ -47,24 +48,20 @@ def create_agent(model: str, **kwargs: Any) -> MCPAgent:
     # Resolve class and gateway info
     agent_cls, gateway_info = resolve_cls(model)
-    # Get model ID from gateway info or use input
+    # Get model name from gateway info or use input
     model_id = model
     if gateway_info:
-        model_id = gateway_info.get("model") or gateway_info.get("id") or model
+        model_id = gateway_info.get("model_name") or model
     # Determine provider: from gateway info, or infer from agent class
     if gateway_info:
-        provider = gateway_info.get("provider") or "openai"
+        provider = gateway_info["provider"]["name"]
     else:
-        # Map agent class to provider for known types
-        from hud.agents.claude import ClaudeAgent
-        from hud.agents.gemini import GeminiAgent
-        _AGENT_TO_PROVIDER = {
-            ClaudeAgent: "anthropic",
-            GeminiAgent: "google",
-        }
-        provider = _AGENT_TO_PROVIDER.get(agent_cls, "openai")
+        provider = "openai"
+        if agent_cls.__name__ == "ClaudeAgent":
+            provider = "anthropic"
+        elif agent_cls.__name__ in ("GeminiAgent", "GeminiCUAAgent"):
+            provider = "gemini"
     client = build_gateway_client(provider)

{hud_python-0.5.8 → hud_python-0.5.18}/hud/agents/base.py RENAMED Viewed

@@ -6,14 +6,17 @@ import asyncio
 import json
 import logging
 from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
 import mcp.types as types
-from pydantic import BaseModel, ConfigDict
-from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
+from hud.tools.native_types import NativeToolSpec
+from hud.types import AgentResponse, AgentType, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
+from .types import BaseCreateParams
 if TYPE_CHECKING:
     from hud.environment import Environment
     from hud.eval.context import EvalContext
@@ -22,16 +25,28 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-class BaseCreateParams(BaseModel):
-    """Runtime parameters for agent creation."""
+@dataclass
+class CategorizedTools:
+    """Result of categorizing tools by native spec availability.
+    Used by agents to efficiently process tools with shared logic for
+    role-based mutual exclusion.
+    """
+    native: list[tuple[types.Tool, NativeToolSpec]] = field(default_factory=list)
+    """Tools with native specs for this agent (tool, spec) pairs."""
+    hosted: list[tuple[types.Tool, NativeToolSpec]] = field(default_factory=list)
+    """Hosted tools with native specs for this agent (tool, spec) pairs."""
-    model_config = ConfigDict(arbitrary_types_allowed=True)
+    generic: list[types.Tool] = field(default_factory=list)
+    """Tools without native specs that aren't role-blocked."""
-    # Primary way to bind agent to execution context (v5)
-    ctx: Any | None = None  # EvalContext or Environment - agent uses this for tool calls
+    claimed_roles: set[str] = field(default_factory=set)
+    """Roles claimed by native tools."""
-    auto_respond: bool = False
-    verbose: bool = False
+    skipped: list[tuple[types.Tool, str]] = field(default_factory=list)
+    """Tools skipped due to role conflicts (tool, reason) pairs."""
 class MCPAgent(ABC):
@@ -52,6 +67,185 @@ class MCPAgent(ABC):
     required_tools: ClassVar[list[str]] = []  # Tools that must be available
     config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
+    @classmethod
+    @abstractmethod
+    def agent_type(cls) -> AgentType:
+        """Return the AgentType for this agent.
+        Subclasses must implement this to return their corresponding AgentType enum value.
+        This is used for resolving native tool specifications.
+        Returns:
+            AgentType enum value for this agent
+        """
+        raise NotImplementedError
+    def resolve_native_spec(self, tool: types.Tool) -> NativeToolSpec | None:
+        """Check if a tool has a native spec for this agent type and model.
+        Looks up the tool's meta.native_tools field for a spec matching this agent's type.
+        If found, validates that the current model supports this native spec.
+        Returns a NativeToolSpec that can be used to register the tool with
+        the provider's native API format.
+        Falls back to legacy name-based detection for backwards compatibility with
+        old environments that don't emit native_tools metadata.
+        Args:
+            tool: MCP Tool object to check for native specs
+        Returns:
+            NativeToolSpec if the tool has a native spec for this agent and the
+            current model supports it, None otherwise. When the model doesn't
+            match supported_models, returns None so the tool falls back to
+            generic function calling.
+        """
+        spec: NativeToolSpec | None = None
+        # First try metadata-based resolution
+        if tool.meta:
+            native_tools = tool.meta.get("native_tools", {})
+            spec_dict = native_tools.get(self.agent_type().value)
+            if spec_dict and isinstance(spec_dict, dict):
+                # Extract known fields and put the rest in extra
+                known_fields = {
+                    "api_type",
+                    "api_name",
+                    "beta",
+                    "hosted",
+                    "role",
+                    "supported_models",
+                }
+                extra = {k: v for k, v in spec_dict.items() if k not in known_fields}
+                # Convert supported_models list to tuple for frozen model
+                supported_models_raw = spec_dict.get("supported_models")
+                supported_models: tuple[str, ...] | None = None
+                if supported_models_raw:
+                    supported_models = tuple(supported_models_raw)
+                spec = NativeToolSpec(
+                    api_type=spec_dict.get("api_type"),
+                    api_name=spec_dict.get("api_name"),
+                    beta=spec_dict.get("beta"),
+                    hosted=spec_dict.get("hosted", False),
+                    role=spec_dict.get("role"),
+                    supported_models=supported_models,
+                    extra=extra,
+                )
+        # Fall back to legacy name-based detection for old environments
+        if spec is None:
+            spec = self._legacy_native_spec_fallback(tool)
+        # Check if current model supports this native spec
+        if spec is not None and not spec.supports_model(self.model):
+            logger.debug(
+                "Model %s not in supported_models for native spec %s, falling back to functions",
+                self.model,
+                spec.api_type,
+            )
+            return None
+        return spec
+    def _legacy_native_spec_fallback(self, tool: types.Tool) -> NativeToolSpec | None:
+        """Detect native tools by name for backwards compatibility.
+        Override in subclasses to support old environments that expose tools
+        without native_tools metadata.
+        Args:
+            tool: MCP Tool object to check
+        Returns:
+            NativeToolSpec if the tool matches a known legacy pattern, None otherwise
+        """
+        return None
+    def get_tool_role(self, tool: types.Tool) -> str | None:
+        """Get the role of a tool from any of its native specs.
+        The role is used for mutual exclusion - when an agent accepts a tool
+        natively, other tools with the same role are excluded.
+        Args:
+            tool: MCP Tool object to check
+        Returns:
+            The role string if any native spec defines one, None otherwise
+        """
+        if not tool.meta:
+            return None
+        native_tools = tool.meta.get("native_tools", {})
+        if not native_tools:
+            return None
+        # Check all specs for a role (they should all have the same role)
+        for spec_dict in native_tools.values():
+            if isinstance(spec_dict, dict) and spec_dict.get("role"):
+                return spec_dict["role"]
+        return None
+    def categorize_tools(self, tools: list[types.Tool] | None = None) -> CategorizedTools:
+        """Categorize tools by native spec availability with role-based exclusion.
+        This shared method implements the two-pass tool processing logic:
+        1. First pass: identify native/hosted tools and claim their roles
+        2. Second pass: include generic tools if their role isn't claimed
+        Args:
+            tools: List of MCP tools to categorize. If None, uses get_available_tools()
+        Returns:
+            CategorizedTools with native, hosted, generic, and skipped tools
+        """
+        if tools is None:
+            tools = self.get_available_tools()
+        result = CategorizedTools()
+        # First pass: process tools with native specs for this agent
+        for tool in tools:
+            spec = self.resolve_native_spec(tool)
+            if not spec:
+                continue
+            # Check for role conflicts between native tools
+            if spec.role:
+                if spec.role in result.claimed_roles:
+                    # Another native tool already claimed this role - skip this one
+                    result.skipped.append(
+                        (tool, f"role '{spec.role}' already claimed by another native tool")
+                    )
+                    continue
+                result.claimed_roles.add(spec.role)
+            if spec.hosted:
+                result.hosted.append((tool, spec))
+            else:
+                result.native.append((tool, spec))
+        # Second pass: process tools without native specs (generic function tools)
+        for tool in tools:
+            spec = self.resolve_native_spec(tool)
+            if spec:
+                # Already processed in first pass
+                continue
+            # Check if this tool's role is already claimed by a native tool
+            tool_role = self.get_tool_role(tool)
+            if tool_role and tool_role in result.claimed_roles:
+                result.skipped.append((tool, f"role '{tool_role}' already claimed by native tool"))
+                continue
+            result.generic.append(tool)
+        return result
     def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
         if params is None:
             import warnings
@@ -129,8 +323,8 @@ class MCPAgent(ABC):
                 f"Available tools: {sorted(available_tool_names)}"
             )
-        self.console.info(
-            f"Agent initialized with {len(self._available_tools)} tools: "
+        self.console.debug(
+            f"Discovered {len(self._available_tools)} tools from environment: "
             f"{', '.join([t.name for t in self._available_tools])}"
         )
@@ -208,7 +402,21 @@ class MCPAgent(ABC):
             await self._initialize_from_ctx(ctx)
         try:
-            result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
+            # Build initial context - optionally append setup tool output
+            # Check ctx first (task-level override), then fall back to agent config
+            append_setup = getattr(ctx, "append_setup_output", False) or getattr(
+                self.config, "append_setup_output", False
+            )
+            initial_prompt = ctx.prompt
+            if append_setup:
+                setup_output = getattr(ctx, "setup_output", None)
+                if setup_output:
+                    initial_prompt = f"{initial_prompt}\n\n{setup_output}"
+            # Build initial blocks (text prompt + optional screenshot)
+            initial_blocks = text_to_blocks(initial_prompt)
+            result = await self._run_context(initial_blocks, max_steps=max_steps)
             # Propagate error state to context for platform visibility
             if result.isError and hasattr(ctx, "error"):
@@ -342,8 +550,17 @@ class MCPAgent(ABC):
             is_error = False
         # Ensure all parameters are the correct type
+        # Use ctx.reward if already set (e.g., from scenario evaluate), otherwise 0.0
+        # Note: For v4 tasks with evaluate_tool, reward is set in __aexit__ after this returns,
+        # so callers should prefer ctx.reward over Trace.reward for the final result.
+        reward = 0.0
+        if self.ctx is not None:
+            ctx_reward = getattr(self.ctx, "reward", None)
+            if ctx_reward is not None:
+                reward = ctx_reward
         trace_params = {
-            "reward": 0.0,
+            "reward": reward,
             "done": True,
             "messages": messages,
             "content": final_response.content if final_response else error,
@@ -519,8 +736,14 @@ def find_reward(result: MCPToolResult) -> float:
     Agent accepts "reward", "grade", "score", or weighted subscores
+    If isError is True, return 0.0 (error results should not contribute positive reward).
     If not found, return 0.0
     """
+    # Error results should return 0.0 - don't extract reward from error responses
+    if result.isError:
+        logger.warning("Evaluate tool returned error, using reward=0.0")
+        return 0.0
     accept_keys = ["reward", "grade", "score"]
     # Check for direct reward/grade/score keys

hud-python 0.5.8__tar.gz → 0.5.18__tar.gz

hud-python 0.5.8__tar.gz → 0.5.18__tar.gz