PyPI - hud-python - Versions diffs - 0.4.63__tar.gz → 0.4.64__tar.gz - Mend

hud-python 0.4.63__tar.gz → 0.4.64__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (318) hide show

{hud_python-0.4.63 → hud_python-0.4.64}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.63
+Version: 0.4.64
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -227,7 +227,7 @@ async def main() -> None:
         client = MCPClient(mcp_config=task.mcp_config)
         agent = ClaudeAgent(
             mcp_client=client,
-            model="claude-sonnet-4-20250514",  # requires ANTHROPIC_API_KEY
+            model="claude-sonnet-4-5",  # requires ANTHROPIC_API_KEY
         )
         result = await agent.run(task)
@@ -292,7 +292,7 @@ results = await run_dataset(
     name="My SheetBench-50 Evaluation",
     dataset="hud-evals/SheetBench-50",      # <-- HuggingFace dataset
     agent_class=ClaudeAgent,                # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
-    agent_config={"model": "claude-sonnet-4-20250514"},
+    agent_config={"model": "claude-sonnet-4-5"},
     max_concurrent=50,
     max_steps=30,
 )

{hud_python-0.4.63 → hud_python-0.4.64}/README.md RENAMED Viewed

@@ -86,7 +86,7 @@ async def main() -> None:
         client = MCPClient(mcp_config=task.mcp_config)
         agent = ClaudeAgent(
             mcp_client=client,
-            model="claude-sonnet-4-20250514",  # requires ANTHROPIC_API_KEY
+            model="claude-sonnet-4-5",  # requires ANTHROPIC_API_KEY
         )
         result = await agent.run(task)
@@ -151,7 +151,7 @@ results = await run_dataset(
     name="My SheetBench-50 Evaluation",
     dataset="hud-evals/SheetBench-50",      # <-- HuggingFace dataset
     agent_class=ClaudeAgent,                # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents)
-    agent_config={"model": "claude-sonnet-4-20250514"},
+    agent_config={"model": "claude-sonnet-4-5"},
     max_concurrent=50,
     max_steps=30,
 )

{hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/base.py RENAMED Viewed

@@ -25,8 +25,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task."  # noqa: E501
 class MCPAgent(ABC):
     """
@@ -58,7 +56,7 @@ class MCPAgent(ABC):
         disallowed_tools: list[str] | None = None,
         response_tool_name: str | None = None,
         # Messages
-        system_prompt: str = GLOBAL_SYSTEM_PROMPT,
+        system_prompt: str | None = None,
         append_setup_output: bool = True,
         initial_screenshot: bool = True,
         # Misc
@@ -155,7 +153,10 @@ class MCPAgent(ABC):
         # If task is provided, apply agent_config and add lifecycle tools
         if isinstance(task, Task) and task.agent_config:
             if task.agent_config.get("system_prompt"):
-                self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
+                if self.system_prompt is None:
+                    self.system_prompt = task.agent_config["system_prompt"]
+                else:
+                    self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
             if "append_setup_output" in task.agent_config:
                 self.append_setup_output = task.agent_config["append_setup_output"]
             if "initial_screenshot" in task.agent_config:
@@ -242,6 +243,7 @@ class MCPAgent(ABC):
                 return await self._run_context(context, max_steps=max_steps)
         except Exception as e:
+            logger.exception("Error while running agent:")
             # Always return a Trace object for any exception
             if self._is_connection_error(e):
                 # Return error trace for connection failures

hud_python-0.4.64/hud/agents/claude.py ADDED Viewed

@@ -0,0 +1,365 @@
+"""Claude MCP Agent implementation."""
+from __future__ import annotations
+import copy
+import logging
+import re
+from inspect import cleandoc
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
+from anthropic import Anthropic, AsyncAnthropic, Omit
+from anthropic.types import (
+    CacheControlEphemeralParam,
+)
+from anthropic.types.beta import (
+    BetaBase64ImageSourceParam,
+    BetaContentBlockParam,
+    BetaImageBlockParam,
+    BetaMessageParam,
+    BetaTextBlockParam,
+    BetaToolBash20250124Param,
+    BetaToolComputerUse20250124Param,
+    BetaToolParam,
+    BetaToolResultBlockParam,
+    BetaToolTextEditor20250728Param,
+    BetaToolUnionParam,
+)
+import hud
+if TYPE_CHECKING:
+    from hud.datasets import Task
+import mcp.types as types
+from hud.settings import settings
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from hud.utils.hud_console import HUDConsole
+from .base import MCPAgent
+logger = logging.getLogger(__name__)
+class ClaudeAgent(MCPAgent):
+    """
+    Claude agent that uses MCP servers for tool execution.
+    This agent uses Claude's native tool calling capabilities but executes
+    tools through MCP servers instead of direct implementation.
+    """
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.ANTHROPIC_COMPUTER_WIDTH,
+        "display_height": computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
+    }
+    def __init__(
+        self,
+        model_client: AsyncAnthropic | None = None,
+        model: str = "claude-sonnet-4-5",
+        max_tokens: int = 16384,
+        use_computer_beta: bool = True,
+        validate_api_key: bool = True,
+        computer_tool_regex: str = r"(^|_)(anthropic_computer|computer_anthropic|computer)$",
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize Claude MCP agent.
+        Args:
+            model_client: AsyncAnthropic client (created if not provided)
+            model: Claude model to use
+            max_tokens: Maximum tokens for response
+            use_computer_beta: Whether to use computer-use beta features
+            computer_tool_regex: we use this regex to identify the computer tool
+            **kwargs: Additional arguments passed to BaseMCPAgent (including mcp_client)
+        """
+        super().__init__(**kwargs)
+        # Initialize client if not provided
+        if model_client is None:
+            api_key = settings.anthropic_api_key
+            if not api_key:
+                raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
+            model_client = AsyncAnthropic(api_key=api_key)
+        # validate api key if requested
+        if validate_api_key:
+            try:
+                Anthropic(api_key=model_client.api_key).models.list()
+            except Exception as e:
+                raise ValueError(f"Anthropic API key is invalid: {e}") from e
+        self.anthropic_client = model_client
+        self.model = model
+        self.max_tokens = max_tokens
+        self.use_computer_beta = use_computer_beta
+        self.hud_console = HUDConsole(logger=logger)
+        self.model_name = "Claude"
+        self.checkpoint_name = self.model
+        self.computer_tool_regex = computer_tool_regex
+        # these will be initialized in _convert_tools_for_claude
+        self.has_computer_tool = False
+        self.tool_mapping: dict[str, str] = {}
+        self.claude_tools: list[BetaToolUnionParam] = []
+    async def initialize(self, task: str | Task | None = None) -> None:
+        """Initialize the agent and build tool mappings."""
+        await super().initialize(task)
+        # Build tool mappings after tools are discovered
+        self._convert_tools_for_claude()
+    async def get_system_messages(self) -> list[Any]:
+        """No system messages for Claude because applied in get_response"""
+        return []
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+        """Format messages for Claude."""
+        # Convert MCP content types to Anthropic content types
+        anthropic_blocks: list[BetaContentBlockParam] = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                # Only include fields that Anthropic expects
+                anthropic_blocks.append(
+                    BetaTextBlockParam(
+                        type="text",
+                        text=block.text,
+                    )
+                )
+            elif isinstance(block, types.ImageContent):
+                # Convert MCP ImageContent to Anthropic format
+                anthropic_blocks.append(
+                    BetaImageBlockParam(
+                        type="image",
+                        source=BetaBase64ImageSourceParam(
+                            type="base64",
+                            media_type=cast(
+                                "Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']",
+                                block.mimeType,
+                            ),
+                            data=block.data,
+                        ),
+                    )
+                )
+            else:
+                raise ValueError(f"Unknown content block type: {type(block)}")
+        return [BetaMessageParam(role="user", content=anthropic_blocks)]
+    @hud.instrument(
+        span_type="agent",
+        record_args=False,  # Messages can be large
+        record_result=True,
+    )
+    async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
+        """Get response from Claude including any tool calls."""
+        messages_cached = self._add_prompt_caching(messages)
+        response = await self.anthropic_client.beta.messages.create(
+            model=self.model,
+            system=self.system_prompt if self.system_prompt is not None else Omit(),
+            max_tokens=self.max_tokens,
+            messages=messages_cached,
+            tools=self.claude_tools,
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True},
+            betas=["computer-use-2025-01-24"] if self.has_computer_tool else [],
+        )
+        messages.append(
+            BetaMessageParam(
+                role="assistant",
+                content=response.content,
+            )
+        )
+        # Process response
+        result = AgentResponse(content="", tool_calls=[], done=True)
+        # Extract text content and reasoning
+        text_content = ""
+        thinking_content = ""
+        for block in response.content:
+            if block.type == "tool_use":
+                tool_call = MCPToolCall(
+                    id=block.id,
+                    # look up name in tool_mapping if available, otherwise use block name
+                    name=self.tool_mapping.get(block.name, block.name),
+                    arguments=block.input,
+                )
+                result.tool_calls.append(tool_call)
+                result.done = False
+            elif block.type == "text":
+                text_content += block.text
+            elif hasattr(block, "type") and block.type == "thinking":
+                thinking_content += f"Thinking: {block.thinking}\n"
+        result.content = thinking_content + text_content
+        return result
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[BetaMessageParam]:
+        """Format tool results into Claude messages."""
+        # Process each tool result
+        user_content = []
+        for tool_call, result in zip(tool_calls, tool_results, strict=True):
+            # Extract Claude-specific metadata from extra fields
+            tool_use_id = tool_call.id
+            if not tool_use_id:
+                self.hud_console.warning(f"No tool_use_id found for {tool_call.name}")
+                continue
+            # Convert MCP tool results to Claude format
+            claude_blocks = []
+            if result.isError:
+                # Extract error message from content
+                error_msg = "Tool execution failed"
+                for content in result.content:
+                    if isinstance(content, types.TextContent):
+                        error_msg = content.text
+                        break
+                claude_blocks.append(text_to_content_block(f"Error: {error_msg}"))
+            else:
+                # Process success content
+                for content in result.content:
+                    if isinstance(content, types.TextContent):
+                        claude_blocks.append(text_to_content_block(content.text))
+                    elif isinstance(content, types.ImageContent):
+                        claude_blocks.append(base64_to_content_block(content.data))
+            # Add tool result
+            user_content.append(tool_use_content_block(tool_use_id, claude_blocks))
+        # Return as a user message containing all tool results
+        return [
+            BetaMessageParam(
+                role="user",
+                content=user_content,
+            )
+        ]
+    async def create_user_message(self, text: str) -> BetaMessageParam:
+        """Create a user message in Claude's format."""
+        return BetaMessageParam(role="user", content=text)
+    def _convert_tools_for_claude(self) -> None:
+        """Convert MCP tools to Claude API tools."""
+        def to_api_tool(tool: types.Tool) -> BetaToolUnionParam:
+            if tool.name == "str_replace_based_edit_tool":
+                return BetaToolTextEditor20250728Param(
+                    type="text_editor_20250728",
+                    name="str_replace_based_edit_tool",
+                    cache_control=CacheControlEphemeralParam(type="ephemeral"),
+                )
+            if tool.name == "bash":
+                return BetaToolBash20250124Param(
+                    type="bash_20250124",
+                    name="bash",
+                    cache_control=CacheControlEphemeralParam(type="ephemeral"),
+                )
+            if re.fullmatch(self.computer_tool_regex, tool.name):
+                return BetaToolComputerUse20250124Param(
+                    type="computer_20250124",
+                    name="computer",
+                    display_number=1,
+                    display_width_px=computer_settings.ANTHROPIC_COMPUTER_WIDTH,
+                    display_height_px=computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
+                    cache_control=CacheControlEphemeralParam(type="ephemeral"),
+                )
+            if tool.description is None or tool.inputSchema is None:
+                raise ValueError(
+                    cleandoc(f"""MCP tool {tool.name} requires both a description and inputSchema.
+                    Add these by:
+                    1. Adding a docstring to your @mcp.tool decorated function for the description
+                    2. Using pydantic Field() annotations on function parameters for the schema
+                    """)
+                )
+            """Convert a tool to the API format"""
+            return BetaToolParam(
+                name=tool.name,
+                description=tool.description,
+                input_schema=tool.inputSchema,
+                cache_control=CacheControlEphemeralParam(type="ephemeral"),
+            )
+        self.has_computer_tool = False
+        self.tool_mapping = {}
+        self.claude_tools = []
+        for tool in self.get_available_tools():
+            claude_tool = to_api_tool(tool)
+            # warn if multiple computer tools are found
+            if claude_tool["name"] == "computer":
+                if self.has_computer_tool:
+                    logger.warning(
+                        "Multiple computer tools found. Ignoring %s since %s is already present",
+                        tool.name,
+                        self.tool_mapping["computer"],
+                    )
+                    continue
+                else:
+                    self.has_computer_tool = True
+            self.tool_mapping[claude_tool["name"]] = tool.name
+            self.claude_tools.append(claude_tool)
+    def _add_prompt_caching(self, messages: list[BetaMessageParam]) -> list[BetaMessageParam]:
+        """Add prompt caching to messages."""
+        messages_cached = copy.deepcopy(messages)
+        cache_control: CacheControlEphemeralParam = {"type": "ephemeral"}
+        # Mark last user message with cache control
+        if (
+            messages_cached
+            and isinstance(messages_cached[-1], dict)
+            and messages_cached[-1].get("role") == "user"
+        ):
+            last_content = messages_cached[-1]["content"]
+            # Content is formatted to be list of ContentBlock in format_blocks and format_message
+            if isinstance(last_content, list):
+                for block in last_content:
+                    # Only add cache control to dict-like block types that support it
+                    if isinstance(block, dict):
+                        match block["type"]:
+                            case "redacted_thinking" | "thinking":
+                                pass
+                            case _:
+                                block["cache_control"] = cache_control
+        return messages_cached
+def base64_to_content_block(base64: str) -> BetaImageBlockParam:
+    """Convert base64 image to Claude content block."""
+    return BetaImageBlockParam(
+        type="image",
+        source=BetaBase64ImageSourceParam(
+            type="base64",
+            media_type="image/png",
+            data=base64,
+        ),
+    )
+def text_to_content_block(text: str) -> BetaTextBlockParam:
+    """Convert text to Claude content block."""
+    return {"type": "text", "text": text}
+def tool_use_content_block(
+    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
+) -> BetaToolResultBlockParam:
+    """Create tool result content block."""
+    return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}

{hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/langchain.py RENAMED Viewed

@@ -89,7 +89,10 @@ class LangChainAgent(MCPAgent):
     async def get_system_messages(self) -> list[BaseMessage]:
         """Get system messages for LangChain."""
-        return [SystemMessage(content=self.system_prompt)]
+        if self.system_prompt is not None:
+            return [SystemMessage(content=self.system_prompt)]
+        else:
+            return []
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[BaseMessage]:
         """Create initial messages for LangChain."""

{hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/openai_chat_generic.py RENAMED Viewed

@@ -84,7 +84,10 @@ class GenericOpenAIChatAgent(MCPAgent):
     async def get_system_messages(self) -> list[Any]:
         """Get system messages for OpenAI."""
-        return [{"role": "system", "content": self.system_prompt}]
+        if self.system_prompt is not None:
+            return [{"role": "system", "content": self.system_prompt}]
+        else:
+            return []
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""

{hud_python-0.4.63 → hud_python-0.4.64}/hud/agents/tests/test_base.py RENAMED Viewed

@@ -96,7 +96,6 @@ class TestBaseMCPAgent:
         assert agent.allowed_tools is None
         assert agent.disallowed_tools is None
         assert agent.initial_screenshot is True
-        assert agent.system_prompt is not None  # Default system prompt is set
     def test_init_with_params(self):
         """Test initialization with custom parameters."""

{hud_python-0.4.63 → hud_python-0.4.64}/hud/cli/eval.py RENAMED Viewed

@@ -232,7 +232,7 @@ def build_agent(
         )
         raise typer.Exit(1) from e
-    model = model or "claude-sonnet-4-20250514"
+    model = model or "claude-sonnet-4-5"
     if allowed_tools:
         return ClaudeAgent(
@@ -393,7 +393,7 @@ async def run_single_task(
         agent_class = ClaudeAgent
         agent_config = {
-            "model": model or "claude-sonnet-4-20250514",
+            "model": model or "claude-sonnet-4-5",
             "verbose": verbose,
             "validate_api_key": False,
         }
@@ -626,7 +626,7 @@ async def run_full_dataset(
             raise typer.Exit(1) from e
         agent_config = {
-            "model": model or "claude-sonnet-4-20250514",
+            "model": model or "claude-sonnet-4-5",
             "verbose": verbose,
             "validate_api_key": False,
         }

hud-python 0.4.63__tar.gz → 0.4.64__tar.gz

hud-python 0.4.63__tar.gz → 0.4.64__tar.gz