hud-python 0.4.45-py3-none-any.whl → 0.5.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -1,738 +1,416 @@
-"""Tests for BaseMCPAgent using simulated actions."""
+"""Tests for MCPAgent base class with v5 EvalContext pattern."""
 
 from __future__ import annotations
 
 from typing import Any, ClassVar
-from unittest.mock import MagicMock
-
-# Import AsyncMock from unittest.mock if available (Python 3.8+)
-try:
-    from unittest.mock import AsyncMock
-except ImportError:
-    # Fallback for older Python versions
-    from unittest.mock import MagicMock as AsyncMock
 
 import pytest
 from mcp import types
 
 from hud.agents import MCPAgent
-from hud.datasets import Task
-from hud.tools.executors.base import BaseExecutor
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.agents.base import BaseCreateParams
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
+
+
+class MockConfig(BaseAgentConfig):
+    model_name: str = "MockAgent"
+    model: str = "mock-model"
+
+
+class MockCreateParams(BaseCreateParams, MockConfig):
+    pass
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(
+        self,
+        prompt: str = "Test prompt",
+        tools: list[types.Tool] | None = None,
+    ) -> None:
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or [
+            types.Tool(name="test_tool", description="A test tool", inputSchema={}),
+            types.Tool(name="another_tool", description="Another tool", inputSchema={}),
+        ]
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._tool_calls: list[tuple[str, dict[str, Any]]] = []
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        # Parse the call
+        if isinstance(call, tuple):
+            name, args = call[0], call[1] if len(call) > 1 else {}
+        elif hasattr(call, "name"):
+            name, args = call.name, getattr(call, "arguments", {}) or {}
+        else:
+            name, args = str(call), kwargs
+        self._tool_calls.append((name, args))
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=f"Result from {name}")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
 
 
 class MockMCPAgent(MCPAgent):
-    """Concrete implementation of BaseMCPAgent for testing."""
-
-    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config
-
-    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
-        if mcp_client is None:
-            # Create a mock client if none provided
-            mcp_client = MagicMock()
-            mcp_client.get_available_tools = MagicMock(return_value=[])
-            mcp_client.initialize = AsyncMock()
-            mcp_client.list_tools = AsyncMock(return_value=[])
-            mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
-        super().__init__(mcp_client=mcp_client, **kwargs)
-        self.executor = BaseExecutor()  # Use simulated executor
-        self._messages = []
-
-    async def run(self, task: Task) -> list[dict[str, Any]]:
-        """Mock run method."""
-        return self._messages
-
-    async def create_initial_messages(
-        self, prompt: str, initial_screenshot: bool = False
-    ) -> list[dict[str, Any]]:
-        """Mock create initial messages."""
-        messages = [{"role": "user", "content": prompt}]
-        if initial_screenshot:
-            messages.append({"role": "assistant", "content": "Screenshot: mock_screenshot"})
-        return messages
+    """Concrete implementation of MCPAgent for testing."""
+
+    metadata: ClassVar[dict[str, Any] | None] = {}
+    config_cls: ClassVar[type[BaseAgentConfig]] = MockConfig
+
+    def __init__(self, **kwargs: Any) -> None:
+        params = MockCreateParams(**kwargs)
+        super().__init__(params)
+        self._response = AgentResponse(content="Mock response", tool_calls=[], done=True)
+
+    def set_response(self, response: AgentResponse) -> None:
+        self._response = response
 
     async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
-        """Mock get response."""
-        return AgentResponse(content="Mock response", tool_calls=[], done=True)
+        return self._response
 
     async def format_tool_results(
         self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
     ) -> list[dict[str, Any]]:
-        """Mock format tool results."""
         formatted = []
-        for tool_call, result in zip(tool_calls, tool_results):
+        for tool_call, result in zip(tool_calls, tool_results, strict=True):
             formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
         return formatted
 
-    async def create_user_message(self, text: str) -> Any:
-        """Mock create user message."""
-        return {"role": "user", "content": text}
-
     async def get_system_messages(self) -> list[Any]:
-        """Mock get system messages."""
        return []
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-        """Mock format blocks."""
-        formatted = []
-        for block in blocks:
-            if isinstance(block, types.TextContent):
-                formatted.append({"type": "text", "text": block.text})
-            elif isinstance(block, types.ImageContent):
-                formatted.append({"type": "image", "data": block.data})
-            elif hasattr(block, "type"):
-                formatted.append({"type": getattr(block, "type", "unknown")})
-        return formatted
+        return [{"type": "text", "text": getattr(b, "text", "")} for b in blocks]
 
 
-class TestBaseMCPAgent:
-    """Tests for BaseMCPAgent with simulated actions."""
+class TestMCPAgentInit:
+    """Tests for MCPAgent initialization."""
 
-    def test_init_defaults(self):
-        """Test initialization with default values."""
+    def test_init_defaults(self) -> None:
+        """Test agent initializes with default config."""
         agent = MockMCPAgent()
+        assert agent.ctx is None
+        assert agent._initialized is False
+        assert agent.system_prompt is None
 
-        assert agent.mcp_client is not None
-        assert agent.allowed_tools is None
-        assert agent.disallowed_tools == []
-        assert agent.initial_screenshot is True
-        assert agent.system_prompt is not None  # Default system prompt is set
-
-    def test_init_with_params(self):
-        """Test initialization with custom parameters."""
-        client = MagicMock()
-        agent = MockMCPAgent(
-            mcp_client=client,
-            allowed_tools=["tool1", "tool2"],
-            disallowed_tools=["bad_tool"],
-            initial_screenshot=True,
-            system_prompt="Custom prompt",
-        )
-
-        assert agent.mcp_client == client
-        assert agent.allowed_tools == ["tool1", "tool2"]
-        assert agent.disallowed_tools == ["bad_tool"]
-        assert agent.initial_screenshot is True
+    def test_init_with_system_prompt(self) -> None:
+        """Test agent with custom system prompt."""
+        agent = MockMCPAgent(system_prompt="Custom prompt")
         assert agent.system_prompt == "Custom prompt"
 
-    @pytest.mark.asyncio
-    async def test_init_no_client_no_task(self):
-        """Test initialize fails without client and without task."""
-
-        # Create a minimal concrete implementation to test the ValueError
-        class TestAgent(MCPAgent):
-            async def create_initial_messages(
-                self, prompt: str, initial_screenshot: bool = False
-            ) -> list[dict[str, Any]]:
-                return []
-
-            async def format_tool_results(
-                self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
-            ) -> list[dict[str, Any]]:
-                return []
-
-            async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
-                return {"content": "test"}
-
-            async def get_system_messages(self) -> list[Any]:
-                return []
 
-            async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-                return []
-
-        # Agent can be created with None client
-        agent = TestAgent(mcp_client=None)
-
-        # But initialize should fail without client or task
-        with pytest.raises(ValueError, match="No MCPClient"):
-            await agent.initialize()
+class TestMCPAgentRun:
+    """Tests for MCPAgent.run() with EvalContext."""
 
     @pytest.mark.asyncio
-    async def test_initialize_with_sessions(self):
-        """Test initialize with existing sessions."""
+    async def test_run_basic(self) -> None:
+        """Test basic run flow with EvalContext."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Create proper async mock for session
-        mock_session = MagicMock()
-
-        # Set up the connector and client_session structure
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        # Mock list_tools on the client_session
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                    types.Tool(
-                        name="setup", description="Setup tool", inputSchema={"type": "object"}
-                    ),
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        assert agent.mcp_client is not None
-
-        # Mock the list_tools method on mcp_client to return the tools
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                types.Tool(name="setup", description="Setup tool", inputSchema={"type": "object"}),
-            ]
-        )
-
-        await agent.initialize()
+        result = await agent.run(ctx)
 
-        # Check available tools were populated (excludes lifecycle tools)
-        tools = agent.get_available_tools()
-        assert len(tools) == 3  # All tools (setup is not in default lifecycle tools)
-
-        # Ensure names exist in available tools
-        names = {t.name for t in tools}
-        assert {"tool1", "tool2", "setup"} <= names
-
-    @pytest.mark.asyncio
-    async def test_initialize_with_filtering(self):
-        """Test initialize with tool filtering."""
-        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
-
-        # Create proper async mock for session
-        mock_session = MagicMock()
-
-        # Set up the connector and client_session structure
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                    types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
-                    types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        assert agent.mcp_client is not None
-
-        # Mock the list_tools method on mcp_client to return the tools
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
-                types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-            ]
-        )
-
-        await agent.initialize()
-
-        # Check filtering worked - get_available_tools excludes lifecycle tools
-        tools = agent.get_available_tools()
-        tool_names = [t.name for t in tools]
-        assert len(tools) == 1  # Only tool1 (tool2 and tool3 are filtered out)
-        assert "tool1" in tool_names
-        assert "setup" not in tool_names  # Lifecycle tool excluded from available tools
-        assert "tool2" not in tool_names  # Not in allowed list
-        assert "tool3" not in tool_names  # In disallowed list
+        assert result.done is True
+        assert result.content == "Mock response"
+        assert ctx._submitted == "Mock response"
 
     @pytest.mark.asyncio
-    async def test_call_tool_success(self):
-        """Test successful tool call."""
+    async def test_run_initializes_agent(self) -> None:
+        """Test run() initializes the agent with context."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Initialize with a tool
-        mock_session = MagicMock()
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        # Mock the call_tool method on the client session
-        mock_result = types.CallToolResult(
-            content=[types.TextContent(type="text", text="Tool result")], isError=False
-        )
-
-        async def mock_call_tool(name, args):
-            return mock_result
-
-        mock_session.connector.client_session.call_tool = mock_call_tool
-
-        assert agent.mcp_client is not None
-
-        # Mock the client's call_tool method directly
-        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
-
-        # Mock the list_tools method to return the test tool
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
-            ]
-        )
-
-        await agent.initialize()
-
-        # Call the tool
-        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
-        results = await agent.call_tools(tool_call)
-
-        assert len(results) == 1
-        assert results[0] == mock_result
-        assert not results[0].isError
+        assert not agent._initialized
+        await agent.run(ctx)
+        assert agent._initialized
 
     @pytest.mark.asyncio
-    async def test_call_tool_not_found(self):
-        """Test calling non-existent tool."""
+    async def test_run_discovers_tools(self) -> None:
+        """Test run() discovers tools from context."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
         agent = MockMCPAgent()
 
-        # Initialize without tools
-        mock_session = MagicMock()
+        # We need to check tools before cleanup
+        # Store a reference to check
+        discovered_tools = []
 
-        async def mock_list_tools():
-            return types.ListToolsResult(tools=[])
+        original_run = agent._run_context
 
-        mock_session.list_tools = mock_list_tools
-        assert agent.mcp_client is not None
+        async def capture_tools(*args: Any, **kwargs: Any) -> Any:
+            discovered_tools.extend(agent.get_available_tools())
+            return await original_run(*args, **kwargs)
 
-        await agent.initialize()
+        agent._run_context = capture_tools  # type: ignore
+        await agent.run(ctx)
 
-        # Try to call unknown tool - call_tools doesn't raise for unknown tools
-        tool_call = MCPToolCall(name="unknown_tool", arguments={})
-        await agent.call_tools(tool_call)
+        assert len(discovered_tools) == 2
+        assert discovered_tools[0].name == "tool1"
+        assert discovered_tools[1].name == "tool2"
 
     @pytest.mark.asyncio
-    async def test_call_tool_no_name(self):
-        """Test calling tool without name."""
-        # MCPToolCall accepts empty names
+    async def test_run_requires_eval_context(self) -> None:
+        """Test run() raises TypeError for non-EvalContext."""
         agent = MockMCPAgent()
-        tool_call = MCPToolCall(name="", arguments={})
 
-        # call_tools doesn't validate empty names, it will return error
-        await agent.call_tools(tool_call)
+        with pytest.raises(TypeError, match="must be EvalContext"):
+            await agent.run("not a context")  # type: ignore
 
-    def test_get_tool_schemas(self):
-        """Test getting tool schemas."""
+    @pytest.mark.asyncio
+    async def test_run_requires_prompt(self) -> None:
+        """Test run() raises ValueError when prompt is empty."""
+        ctx = MockEvalContext(prompt="")
         agent = MockMCPAgent()
 
-        # Add setup to lifecycle tools to test filtering
-        agent.lifecycle_tools = ["setup"]
-
-        agent._available_tools = [
-            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-        ]
-
-        schemas = agent.get_tool_schemas()
-
-        # Should include non-lifecycle tools
-        assert len(schemas) == 1
-        assert schemas[0]["name"] == "tool1"
+        with pytest.raises(ValueError, match="prompt is not set"):
+            await agent.run(ctx)
 
-    def test_get_tools_by_server(self):
-        """Test getting tools grouped by server."""
+    @pytest.mark.asyncio
+    async def test_run_clears_context_after(self) -> None:
+        """Test run() clears ctx after completion."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Set up tools from different servers
-        tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
-        tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})
-
-        agent._available_tools = [tool1, tool2]
-        tools = agent.get_available_tools()
-        assert {t.name for t in tools} == {"tool1", "tool2"}
+        await agent.run(ctx)
+        assert agent.ctx is None
 
     @pytest.mark.asyncio
-    async def test_executor_integration(self):
-        """Test integration with BaseExecutor for simulated actions."""
+    async def test_run_no_submit_on_empty_content(self) -> None:
+        """Test run() doesn't submit when content is empty."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
+        agent.set_response(AgentResponse(content="", tool_calls=[], done=True))
 
-        # Test various executor actions
-        click_result = await agent.executor.click(100, 200, take_screenshot=False)
-        assert click_result.output is not None
-        assert "[SIMULATED] Click at (100, 200)" in click_result.output
+        await agent.run(ctx)
+        assert ctx._submitted is None
 
-        type_result = await agent.executor.write("Test input", take_screenshot=False)
-        assert type_result.output is not None
-        assert "[SIMULATED] Type 'Test input'" in type_result.output
 
-        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
-        assert scroll_result.output is not None
-        assert "[SIMULATED] Scroll" in scroll_result.output
+class TestMCPAgentToolCalling:
+    """Tests for tool calling through context."""
 
-        # Test screenshot
-        screenshot = await agent.executor.screenshot()
-        assert isinstance(screenshot, str)
-        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
+    @pytest.mark.asyncio
+    async def test_call_tools_uses_context(self) -> None:
+        """Test call_tools routes through ctx.call_tool."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
 
+        # Bind context manually
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-class MockAgentExtended(MCPAgent):
-    """Mock agent for testing with predefined responses."""
+        # Call a tool
+        results = await agent.call_tools(MCPToolCall(name="test_tool", arguments={"arg": "value"}))
 
-    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config
+        assert len(results) == 1
+        assert not results[0].isError
+        assert ("test_tool", {"arg": "value"}) in ctx._tool_calls
 
-    def __init__(self, responses=None, **kwargs):
-        super().__init__(**kwargs)
-        self.responses = responses or []
-        self.call_count = 0
+    @pytest.mark.asyncio
+    async def test_call_tools_without_context_raises(self) -> None:
+        """Test call_tools raises when no context bound."""
+        agent = MockMCPAgent()
 
-    async def create_initial_messages(
-        self, prompt: str, initial_screenshot: bool = False
-    ) -> list[dict[str, Any]]:
-        """Create initial messages."""
-        messages = [{"role": "user", "content": prompt}]
-        if initial_screenshot:
-            # capture_screenshot doesn't exist, just mock it
-            screenshot = "mock_screenshot_data"
-            messages.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
-        return messages
+        with pytest.raises(ValueError, match="not bound to context"):
+            await agent.call_tools(MCPToolCall(name="test_tool", arguments={}))
 
-    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
-        """Return predefined responses - must be async."""
-        if self.call_count < len(self.responses):
-            response_dict = self.responses[self.call_count]
-            self.call_count += 1
-            # Convert dict to AgentResponse
-            return AgentResponse(
-                content=response_dict.get("content", ""),
-                tool_calls=response_dict.get("tool_calls", []),
-                done=response_dict.get("done", not bool(response_dict.get("tool_calls"))),
-            )
-        return AgentResponse(content="Done", tool_calls=[], done=True)
 
-    async def format_tool_results(
-        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
-    ) -> list[dict[str, Any]]:
-        """Format tool results."""
-        formatted = []
-        for tool_call, result in zip(tool_calls, tool_results):
-            formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
-        return formatted
+class TestMCPAgentRequiredTools:
+    """Tests for required_tools validation."""
 
-    async def create_user_message(self, text: str) -> Any:
-        """Create user message."""
-        return {"role": "user", "content": text}
-
-    async def get_system_messages(self) -> list[Any]:
-        """Mock get system messages."""
-        return []
+    @pytest.mark.asyncio
+    async def test_missing_required_tools_raises(self) -> None:
+        """Test run() raises when required tools are missing."""
 
-    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-        """Mock format blocks."""
-        formatted = []
-        for block in blocks:
-            if isinstance(block, types.TextContent):
-                formatted.append({"type": "text", "text": block.text})
-            elif isinstance(block, types.ImageContent):
-                formatted.append({"type": "image", "data": block.data})
-            elif hasattr(block, "type"):
-                formatted.append({"type": getattr(block, "type", "unknown")})
-        return formatted
+        class AgentWithRequiredTools(MockMCPAgent):
+            required_tools: ClassVar[list[str]] = ["must_have_tool"]
 
+        ctx = MockEvalContext(prompt="Do something", tools=[])
+        agent = AgentWithRequiredTools()
 
-class TestMCPAgentExtended:
-    """Extended tests for MCPAgent."""
-
-    @pytest.fixture
-    def mock_client(self):
-        """Create a mock MCP client."""
-        client = MagicMock()
-        client.get_all_active_sessions = MagicMock(return_value={})
-        client.initialize = AsyncMock()
-        client.list_tools = AsyncMock(return_value=[])
-        client.call_tool = AsyncMock(
-            return_value=types.CallToolResult(
-                content=[types.TextContent(type="text", text="Success")],
-                isError=False,
-            )
-        )
-        return client
-
-    @pytest.fixture
-    def agent_with_tools(self, mock_client):
-        """Create agent with mock tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
-                types.Tool(name="click", description="Click at coordinates", inputSchema={}),
-                types.Tool(name="type", description="Type text", inputSchema={}),
-                types.Tool(name="bad_tool", description="A tool that fails", inputSchema={}),
-            ]
-        )
-        return MockAgentExtended(mcp_client=mock_client)
+        with pytest.raises(ValueError, match="Required tools are missing"):
+            await agent.run(ctx)
 
     @pytest.mark.asyncio
-    async def test_run_with_task_object(self, agent_with_tools):
-        """Test running agent with Task object."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Click the button",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "navigate", "arguments": {"url": "https://example.com"}},  # type: ignore[arg-type]
-            evaluate_tool={"name": "check_result", "arguments": {}},  # type: ignore[arg-type]
-        )
-
-        # Set up responses
-        agent_with_tools.responses = [
-            {
-                "role": "assistant",
-                "content": "I'll click the button",
-                "tool_calls": [MCPToolCall(name="click", arguments={"x": 100, "y": 200})],
-            }
-        ]
+    async def test_required_tools_present_succeeds(self) -> None:
+        """Test run() succeeds when required tools are present."""
 
-        # Mock the evaluation to return a reward
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=[
-                # Setup tool
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Navigated")],
-                    isError=False,
-                ),
-                # Click tool
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Clicked")],
-                    isError=False,
-                ),
-                # Evaluate tool with reward
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Success")],
-                    isError=False,
-                    structuredContent={"reward": 1.0},
-                ),
-            ]
-        )
+        class AgentWithRequiredTools(MockMCPAgent):
+            required_tools: ClassVar[list[str]] = ["required_tool"]
 
-        result = await agent_with_tools.run(task)
+        tools = [types.Tool(name="required_tool", description="Required", inputSchema={})]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = AgentWithRequiredTools()
 
-        assert isinstance(result, Trace)
-        assert result.reward == 1.0
-        assert not result.isError
+        result = await agent.run(ctx)
         assert result.done
 
+
+class TestMCPAgentOnToolsReady:
+    """Tests for _on_tools_ready hook."""
+
     @pytest.mark.asyncio
-    async def test_run_with_setup_error(self, agent_with_tools):
-        """Test task execution with setup phase error."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Do something",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "bad_setup", "arguments": {}},  # type: ignore[arg-type]
-        )
+    async def test_on_tools_ready_called(self) -> None:
+        """Test _on_tools_ready is called during initialization."""
+        hook_called = [False]
 
-        # Mock setup tool to fail
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            return_value=MCPToolResult(
-                content=[types.TextContent(type="text", text="Setup failed")],
-                isError=True,
-            )
-        )
+        class AgentWithHook(MockMCPAgent):
+            def _on_tools_ready(self) -> None:
+                hook_called[0] = True
 
-        result = await agent_with_tools.run(task)
+        ctx = MockEvalContext(prompt="Do something")
+        agent = AgentWithHook()
 
-        assert isinstance(result, Trace)
-        assert result.isError
-        # Error content is the string representation of the MCPToolResult list
-        assert result.content is not None
-        assert "Setup failed" in result.content
-        assert "MCPToolResult" in result.content
+        await agent.run(ctx)
+        assert hook_called[0]
 
     @pytest.mark.asyncio
-    async def test_run_with_multiple_setup_tools(self, agent_with_tools):
-        """Test task with multiple setup tools."""
-
-        task = Task(
-            id="test_task",
-            prompt="Test multiple setup",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool=[
-                MCPToolCall(name="setup1", arguments={}),
-                MCPToolCall(name="setup2", arguments={}),
-            ],
-        )
-
-        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
+    async def test_on_tools_ready_has_access_to_tools(self) -> None:
+        """Test _on_tools_ready can access discovered tools."""
+        captured_tools: list[types.Tool] = []
 
-        setup_calls = []
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=lambda tool_call: setup_calls.append(tool_call)
-            or MCPToolResult(
-                content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
-                isError=False,
-            )
-        )
+        class AgentWithHook(MockMCPAgent):
+            def _on_tools_ready(self) -> None:
+                captured_tools.extend(self.get_available_tools())
 
-        result = await agent_with_tools.run(task)
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = AgentWithHook()
 
-        # Check that the tool names match
-        setup_names = [call.name for call in setup_calls]
-        assert "setup1" in setup_names
-        assert "setup2" in setup_names
-        assert not result.isError
+        await agent.run(ctx)
 
-    @pytest.mark.asyncio
-    async def test_allowed_tools_filtering(self, mock_client):
-        """Test that allowed_tools filters available tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
-            ]
-        )
+        assert len(captured_tools) == 2
+        assert captured_tools[0].name == "tool1"
 
-        agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
-        await agent.initialize("test")
 
-        available_names = [tool.name for tool in agent._available_tools]
-        assert "tool1" in available_names
-        assert "tool3" in available_names
-        assert "tool2" not in available_names
+class TestMCPAgentToolSchemas:
+    """Tests for tool schema generation."""
 
     @pytest.mark.asyncio
-    async def test_disallowed_tools_filtering(self, mock_client):
-        """Test that disallowed_tools filters available tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
-            ]
-        )
-
-        agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
-        await agent.initialize("test")
+    async def test_get_tool_schemas(self) -> None:
+        """Test get_tool_schemas returns correct format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="My tool description",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
+            )
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = MockMCPAgent()
 
-        available_names = [tool.name for tool in agent._available_tools]
-        assert "tool1" in available_names
-        assert "tool3" in available_names
-        assert "tool2" not in available_names
+        # Initialize agent
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-    @pytest.mark.asyncio
-    async def test_lifecycle_tools(self, mock_client):
-        """Test lifecycle tools are called in run_prompt."""
-        # Lifecycle tools are specified by name, not as objects
-        agent = MockAgentExtended(
-            mcp_client=mock_client,
-            responses=[{"role": "assistant", "content": "Done", "tool_calls": []}],
-        )
+        schemas = agent.get_tool_schemas()
+        assert len(schemas) == 1
+        assert schemas[0]["name"] == "my_tool"
+        assert schemas[0]["description"] == "My tool description"
 
-        # Add screenshot tool to available tools
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="screenshot", description="Take screenshot", inputSchema={})
-            ]
-        )
 
-        # Initialize to make tools available
-        await agent.initialize()
-
-        result = await agent.run("Test lifecycle", max_steps=1)
-        assert not result.isError
-
-    # This test is commented out as screenshot history management may have changed
-    # @pytest.mark.asyncio
-    # async def test_screenshot_history_management(self, agent_with_tools):
-    #     """Test screenshot history is maintained."""
-    #     agent_with_tools.initial_screenshot = True
-
-    #     # Set up responses with tool calls
-    #     agent_with_tools.responses = [
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 1",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
-    #         },
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 2",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
-    #         },
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 3",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
-    #         },
-    #     ]
-
-    #     await agent_with_tools.run("Test screenshots", max_steps=3)
-
-    #     # Should have screenshots in history
-    #     assert len(agent_with_tools.screenshot_history) > 0
+class TestMCPAgentErrorPropagation:
+    """Tests for error propagation to EvalContext."""
 
     @pytest.mark.asyncio
-    async def test_run_with_invalid_prompt_type(self, agent_with_tools):
-        """Test run with invalid prompt type raises TypeError."""
-        with pytest.raises(TypeError, match="prompt_or_task must be str or Task"):
-            await agent_with_tools.run(123)  # Invalid type
+    async def test_exception_propagates_to_ctx_error(self) -> None:
+        """Test that exceptions during run() set ctx.error for platform visibility."""
 
-    @pytest.mark.asyncio
-    async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
-        """Test evaluation phase with multiple evaluation tools."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Test evaluation",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            evaluate_tool=[
-                MCPToolCall(name="eval1", arguments={}),
-                MCPToolCall(name="eval2", arguments={"reward": True}),
-            ],
-        )
+        class FailingAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                raise RuntimeError("Agent crashed")
 
-        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailingAgent()
 
-        eval_calls = []
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=lambda tool_call: eval_calls.append(tool_call)
-            or MCPToolResult(
-                content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
-                isError=False,
-                structuredContent={"reward": 0.5} if tool_call.name == "eval1" else {"reward": 1.0},
-            )
-        )
+        result = await agent.run(ctx)
 
-        result = await agent_with_tools.run(task)
+        # Should return error trace
+        assert result.isError is True
+        assert result.content is not None
+        assert "Agent crashed" in result.content
 
-        # Check that the tool names match
-        eval_names = [call.name for call in eval_calls]
-        assert "eval1" in eval_names
-        assert "eval2" in eval_names
-        assert result.reward == 0.5  # From eval1 (first evaluation tool)
+        assert ctx.error is not None
+        assert isinstance(ctx.error, BaseException)
+        assert "Agent crashed" in str(ctx.error)
 
     @pytest.mark.asyncio
-    async def test_trace_population_on_error(self, agent_with_tools):
-        """Test that trace is populated on task execution error."""
-
-        task = Task(
-            id="test_task",
-            prompt="Test error",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "failing_setup", "arguments": {}},  # type: ignore[arg-type]
-        )
+    async def test_step_error_propagates_to_ctx_error(self) -> None:
+        """Test that step-level errors (caught internally) set ctx.error."""
+        step_count = [0]
+
+        class FailOnSecondStepAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                step_count[0] += 1
+                if step_count[0] == 1:
+                    return AgentResponse(
+                        content="",
+                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
+                        done=False,
+                    )
+                else:
+                    raise ValueError("Step 2 failed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailOnSecondStepAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert ctx.error is not None
+        assert "Step 2 failed" in str(ctx.error)
 
-        # Make setup fail with exception
-        agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=Exception("Setup explosion"))
+    @pytest.mark.asyncio
+    async def test_no_error_when_successful(self) -> None:
+        """Test that ctx.error remains None on successful run."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
 
-        result = await agent_with_tools.run(task)
+        result = await agent.run(ctx)
 
-        assert result.isError
-        # Error content is the string representation of the MCPToolResult list
-        assert "Setup explosion" in result.content
-        assert "MCPToolResult" in result.content
-        assert result.done
+        assert result.isError is False
+        assert ctx.error is None
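Taken together, the new tests pin down the v5 contract: an agent is no longer constructed around an mcp_client; MCPAgent.run() instead receives an EvalContext that exposes list_tools, call_tool, and submit, binds it for the duration of the run, discovers tools from it, and submits the final content. Below is a minimal sketch of that flow written as one more test in the style of the file above; it reuses the MockEvalContext and MockMCPAgent helpers defined there and is illustrative only, not part of the package.

import pytest

from hud.types import AgentResponse


@pytest.mark.asyncio
async def test_run_submits_final_content_sketch() -> None:
    """Illustrative sketch: run() binds the context, discovers tools, submits the answer."""
    ctx = MockEvalContext(prompt="Click the button")
    agent = MockMCPAgent()

    # The canned reply finishes in one step; a multi-step agent would first return
    # tool_calls, which run() routes through ctx.call_tool (see the tests above).
    agent.set_response(AgentResponse(content="Clicked it", tool_calls=[], done=True))

    result = await agent.run(ctx)

    assert result.done and result.isError is False
    assert ctx._submitted == "Clicked it"  # non-empty final content is submitted to the context
    assert agent.ctx is None  # the context is cleared once the run completes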