hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
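Most of the churn in the base-agent diff below follows one API change: the v4 entry point `agent.run(prompt_or_task)` is replaced by a v5 flow in which `hud.eval()` yields an `EvalContext` that the agent consumes. A minimal before/after sketch — the `hud.eval(task)` context manager and `ClaudeAgent.create()` factory come from the diff's own docstring example, while the v4 lines, the import paths, and the `task` value are illustrative assumptions:

```python
import hud
from hud.agents import ClaudeAgent  # assumed export; see hud/agents/__init__.py changes

async def main(task) -> None:
    # v4 (0.4.x, removed below): the agent owned the client and the
    # setup/evaluate lifecycle.
    #   agent = ClaudeAgent(mcp_client=client)
    #   trace = await agent.run(task, max_steps=10)

    # v5 (0.5.x): hud.eval() owns the lifecycle; the agent is bound to the
    # context and routes all tool execution through ctx.call_tool().
    async with hud.eval(task) as ctx:
        agent = ClaudeAgent.create()  # typed params instead of raw kwargs
        await agent.run(ctx, max_steps=10)
        # ctx.reward is set by the scenario's evaluate phase
```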
hud/agents/base.py CHANGED
@@ -10,335 +10,235 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import mcp.types as types
 
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
-from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
 
-if TYPE_CHECKING:
-    from hud.clients.base import AgentMCPClient
-    from hud.datasets import Task
+from .types import BaseCreateParams
 
-    from .misc import ResponseAgent
+if TYPE_CHECKING:
+    from hud.environment import Environment
+    from hud.eval.context import EvalContext
 
 
 logger = logging.getLogger(__name__)
 
-GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task."  # noqa: E501
-
 
 class MCPAgent(ABC):
     """
     Base class for MCP-enabled agents.
 
-    Provides common behavior for agents that interact with MCP servers, including:
-    - Client management: accepts an `AgentMCPClient` or auto-creates one at
-      runtime when `run()` is called with a `Task` that includes `mcp_config`.
-    - Tool lifecycle: discovery, filtering (`allowed_tools`, `disallowed_tools`),
-      and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
-    - Messaging: system prompt handling, optional inclusion of setup output on
-      the first turn, and control over initial screenshots.
-    - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
-      automatic tracing (`auto_trace`).
+    Agents interact with MCP servers through an EvalContext:
+    - run(ctx): Main entry point - takes EvalContext from hud.eval()
+    - ctx.call_tool(): Used internally for all tool execution
+    - ctx.submit(): Called automatically with agent's final response
 
     Subclasses implement provider-specific formatting and response fetching
-    by overriding these abstract methods: `get_system_messages`, `get_response`,
-    `format_blocks`, and `format_tool_results`.
+    by overriding: `get_system_messages`, `get_response`, `format_blocks`,
+    and `format_tool_results`.
    """
 
-    metadata: dict[str, Any] | None = None
+    metadata: ClassVar[dict[str, Any] | None] = None
     required_tools: ClassVar[list[str]] = []  # Tools that must be available
+    config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
 
-    def __init__(
-        self,
-        mcp_client: AgentMCPClient | None = None,
-        # Filtering
-        allowed_tools: list[str] | None = None,
-        disallowed_tools: list[str] | None = None,
-        # Messages
-        system_prompt: str = GLOBAL_SYSTEM_PROMPT,
-        append_setup_output: bool = True,
-        initial_screenshot: bool = True,
-        # Misc
-        model_name: str = "mcp-agent",
-        response_agent: ResponseAgent | None = None,
-        auto_trace: bool = True,
-        verbose: bool = False,
-    ) -> None:
-        """
-        Initialize the base MCP agent.
+    def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
+        if params is None:
+            import warnings
 
-        Args:
-            mcp_client: Client for connecting to MCP servers. If None, a client
-                is auto-created at runtime when `run()` is called with a `Task`
-                that provides `mcp_config`.
-            allowed_tools: Names of tools to allow (None means allow all).
-            disallowed_tools: Names of tools to always exclude.
-            system_prompt: System prompt to seed the conversation.
-            append_setup_output: Whether to append setup tool output to the
-                first turn's messages.
-            initial_screenshot: Whether to include an initial screenshot before
-                the first prompt (when supported by the environment).
-            model_name: Label used in telemetry/logging to identify the model.
-            response_agent: Optional automation that can respond to the model's
-                outputs to keep the loop going (e.g., auto-continue/stop).
-            auto_trace: If True, automatically creates a trace/span for runs.
-            verbose: If True, increases logging verbosity for developer UX.
-        """
+            warnings.warn(
+                f"Passing kwargs to {self.__class__.__name__}() is deprecated. "
+                f"Use {self.__class__.__name__}.create(...) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            CreateParams = type(
+                f"{self.config_cls.__name__}CreateParams",
+                (BaseCreateParams, self.config_cls),
+                {"__module__": self.config_cls.__module__},
+            )
+            params = CreateParams(**kwargs)
 
-        self.mcp_client = mcp_client
-        self._auto_created_client = False  # Track if we created the client
+        config_kwargs = {
+            k: getattr(params, k) for k in self.config_cls.model_fields if hasattr(params, k)
+        }
+        self.config = self.config_cls(**config_kwargs)
 
-        self.model_name = model_name
-        self.console = HUDConsole(logger=logger)
+        # v5: Store execution context (EvalContext/Environment) - agent uses ctx.call_tool()
+        self.ctx: EvalContext | Environment | None = params.ctx
 
-        # Set verbose mode if requested
-        if verbose:
-            self.console.set_verbose(True)
+        self.model_name: str = getattr(params, "model_name", "MCPAgent")
+        self.model: str = getattr(params, "model", None) or "unknown"
+        self.auto_respond = params.auto_respond
 
-        # User filtering
-        self.allowed_tools = allowed_tools
-        self.disallowed_tools = disallowed_tools or []
+        self.console = HUDConsole(logger=logger)
 
-        # Task filtering
-        self.agent_tools = None
-        self.lifecycle_tools = []
+        if params.verbose:
+            self.console.set_verbose(True)
 
-        # Messages
-        self.system_prompt = system_prompt
-        self.append_setup_output = append_setup_output
-        self.initial_screenshot = initial_screenshot
+        self.system_prompt = self.config.system_prompt
 
-        # Initialize these here so methods can be called before initialize()
-        self._available_tools: list[types.Tool] = []
-        self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
-        self.response_tool_name = None
+        self._available_tools: list[types.Tool] | None = None
+        self._tool_map: dict[str, types.Tool] = {}
+        self._initialized: bool = False
 
-        # Trace
-        self._auto_trace = auto_trace
-        self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
+    @classmethod
+    def create(cls, **kwargs: Any) -> MCPAgent:
+        """
+        Factory method to create an agent with typed parameters.
+        """
+        CreateParams = type(
+            f"{cls.config_cls.__name__}CreateParams",
+            (BaseCreateParams, cls.config_cls),
+            {"__module__": cls.config_cls.__module__},
+        )
+        return cls(params=CreateParams(**kwargs))
 
-        # Response agent to automatically interact with the model
-        self.response_agent = response_agent
+    async def _initialize_from_ctx(self, ctx: EvalContext) -> None:
+        """Initialize agent from EvalContext - discovers tools and sets up state.
 
-    async def initialize(self, task: str | Task | None = None) -> None:
-        """Initialize the agent with task-specific configuration."""
-        from hud.datasets import Task
+        This is the v5 initialization path. The agent uses ctx.call_tool() directly
+        for tool execution (no EnvironmentClient wrapper needed).
+        """
+        from hud.eval.context import EvalContext
 
-        # Create client if needed
-        if self.mcp_client is None and isinstance(task, Task) and task.mcp_config:
-            from hud.clients import MCPClient
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
-            self.mcp_client = MCPClient(mcp_config=task.mcp_config)
-            self._auto_created_client = True
-            self.console.debug("Auto-created MCPClient from task.mcp_config")
+        # Refresh tools from connections, then get filtered list for agent
+        await ctx.list_tools()
+        self._available_tools = ctx.as_tools()
+        self._tool_map = {t.name: t for t in self._available_tools}
 
-        # Ensure we have a client
-        if self.mcp_client is None:
+        # Validate required tools are present
+        available_tool_names = {t.name for t in self._available_tools}
+        missing_tools = [tool for tool in self.required_tools if tool not in available_tool_names]
+        if missing_tools:
             raise ValueError(
-                "No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config."  # noqa: E501
+                f"Required tools are missing: {missing_tools}. "
+                f"Available tools: {sorted(available_tool_names)}"
             )
 
-        await self._setup_config(self.mcp_client.mcp_config)
+        self.console.info(
+            f"Agent initialized with {len(self._available_tools)} tools: "
+            f"{', '.join([t.name for t in self._available_tools])}"
+        )
 
-        # Initialize client if needed
-        try:
-            await self.mcp_client.initialize()
-        except Exception as e:
-            self._handle_connection_error(e)
-
-        # If task is provided, add lifecycle tools
-        if isinstance(task, Task):
-            if task.agent_tools:
-                self.agent_tools = task.agent_tools
-            if task.setup_tool:
-                if isinstance(task.setup_tool, list):
-                    for tool in task.setup_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.setup_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.setup_tool.name)
-            if task.evaluate_tool:
-                if isinstance(task.evaluate_tool, list):
-                    for tool in task.evaluate_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.evaluate_tool.name)
-            if task.system_prompt:
-                self.system_prompt += "\n\n" + task.system_prompt
-
-        # Re-apply filtering with updated lifecycle tools
-        await self._filter_tools()
-
-    async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
-        """
-        Run the agent with the given prompt or task.
+        # Call hook for subclass-specific initialization (e.g., tool format conversion)
+        self._on_tools_ready()
 
-        Args:
-            prompt_or_task: Either a string prompt for simple execution or a Task object
-            max_steps: Maximum number of steps (-1 for infinite)
+        self._initialized = True
 
-        Returns:
-            Trace with reward, done, content, isError fields and trace steps
+    def _on_tools_ready(self) -> None:
+        """Hook called after tools are discovered and validated.
+
+        Subclasses can override this to perform provider-specific setup,
+        such as converting MCP tools to the provider's format.
+
+        Called by _initialize_from_ctx() after _available_tools is populated.
         """
-        # Import here to avoid circular imports
-        from hud.datasets import Task
+        return  # Default no-op - subclasses override for provider-specific setup
 
-        if isinstance(prompt_or_task, dict):
-            prompt_or_task = Task(**prompt_or_task)
-        elif not isinstance(prompt_or_task, str) and not isinstance(prompt_or_task, Task):
-            raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
+    async def run(
+        self,
+        ctx: EvalContext,
+        *,
+        max_steps: int = 10,
+    ) -> Trace:
+        """
+        Run the agent on the given evaluation context.
 
-        try:
-            # Establish the connection with the MCP server/Environment
-            await self.initialize(prompt_or_task)
+        The agent uses ctx.prompt as the task and ctx.call_tool() for tool execution.
+        Automatically calls ctx.submit() with the final answer.
 
-            # Handle Task objects with full lifecycle
-            if isinstance(prompt_or_task, Task):
-                return await self.run_task(prompt_or_task, max_steps)
+        Args:
+            ctx: EvalContext from hud.eval() - contains prompt and tools
+            max_steps: Maximum number of agent steps (-1 for infinite)
 
-            # Handle simple string prompts
-            elif isinstance(prompt_or_task, str):
-                context = text_to_blocks(prompt_or_task)
-                return await self._run_context(context, max_steps=max_steps)
+        Returns:
+            Trace with done, content, isError fields
+
+        Example:
+            ```python
+            async with hud.eval(task) as ctx:
+                agent = ClaudeAgent.create()
+                await agent.run(ctx)
+                # ctx.reward is set by the scenario's evaluate phase
+            ```
+        """
+        from hud.eval.context import EvalContext
 
-        except Exception as e:
-            # Always return a Trace object for any exception
-            if self._is_connection_error(e):
-                # Return error trace for connection failures
-                return Trace(
-                    reward=0.0,
-                    done=True,
-                    content=self._get_connection_error_message(e),
-                    isError=True,
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
+
+        if not ctx.prompt:
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
                 )
             else:
-                # Return error trace for any other exception
-                return Trace(
-                    reward=0.0,
-                    done=True,
-                    content=f"Task failed with error: {e}",
-                    isError=True,
-                    info={"error": str(e)},
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
                 )
-        finally:
-            # Cleanup auto-created resources
-            await self._cleanup()
 
-    async def run_task(self, task: Task, max_steps: int = 10) -> Trace:
-        """
-        Execute a task with setup and evaluate phases.
+        # Store context for tool calls
+        self.ctx = ctx
 
-        Args:
-            task: Task object with prompt, setup, and evaluate configs
-            max_steps: Maximum steps for task execution (-1 for infinite)
+        # Initialize tools from context
+        if not self._initialized:
+            await self._initialize_from_ctx(ctx)
 
-        Returns:
-            Trace with reward from evaluation
-        """
         try:
-            # Setup phase
-            start_context: list[types.ContentBlock] = []
-
-            # Extract the initial task information
-            if task.prompt:
-                start_context.extend(text_to_blocks(task.prompt))
-
-            # Execute the setup tool and append the initial observation to the context
-            if task.setup_tool is not None:
-                self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
-                results = await self.call_tools(task.setup_tool)
-                if any(result.isError for result in results):
-                    return Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Setup tool failed: {results}",
-                        isError=True,
-                        task=task,
-                    )
-
-                if self.append_setup_output and isinstance(results[0].content, list):
-                    start_context.extend(results[0].content)
-            if not self.initial_screenshot:
-                start_context = await self._filter_messages(start_context, include_types=["text"])
-
-            # Execute the task (agent loop) - this returns a empty trace object with the final response  # noqa: E501
-            prompt_result = await self._run_context(start_context, max_steps=max_steps)
+            # Build initial context - optionally append setup tool output
+            # Check ctx first (task-level override), then fall back to agent config
+            append_setup = getattr(ctx, "append_setup_output", False) or getattr(
+                self.config, "append_setup_output", False
+            )
+            initial_prompt = ctx.prompt
+            if append_setup:
+                setup_output = getattr(ctx, "setup_output", None)
+                if setup_output:
+                    initial_prompt = f"{initial_prompt}\n\n{setup_output}"
 
-        except Exception as e:
-            self.console.error_log(f"Task execution failed: {e}")
-            # Create an error result but don't return yet - we still want to evaluate
-            prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True, task=task)
-            prompt_result.populate_from_context()
+            # Build initial blocks (text prompt + optional screenshot)
+            initial_blocks = text_to_blocks(initial_prompt)
 
-        # Always evaluate if we have evaluate tool, regardless of errors
-        if task.evaluate_tool is not None:
-            try:
-                results = await self.call_tools(task.evaluate_tool)
-
-                if any(result.isError for result in results):
-                    self.console.warning_log(f"Evaluate tool returned error: {results}")
-                    # Still extract what we can from the error response
-                    if prompt_result is None:
-                        prompt_result = Trace(
-                            reward=0.0,
-                            done=True,
-                            content="Task failed before evaluation",
-                            isError=True,
-                            task=task,
-                        )
-                    prompt_result.reward = 0.0  # Default to 0 on error
-                else:
-                    # Extract reward and content from evaluation
-                    if results:
-                        reward = find_reward(results[0])
-                        self.console.info_log(f"Eval: {reward:.4f} {task.evaluate_tool}")
-                        eval_content = find_content(results[0])
-
-                        # Update the prompt result with evaluation reward
-                        if prompt_result is None:
-                            prompt_result = Trace(
-                                reward=reward,
-                                done=True,
-                                content=eval_content or "",
-                                isError=False,
-                                task=task,
-                            )
-                        else:
-                            prompt_result.reward = reward
+            result = await self._run_context(initial_blocks, max_steps=max_steps)
 
-                        # Update the prompt result with evaluation content (if available)
-                        if eval_content:
-                            # Prompt result may already have final response content,
-                            # so we append to it
-                            if prompt_result.content:
-                                prompt_result.content += "\n\n" + eval_content
-                            else:
-                                prompt_result.content = eval_content
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
 
-            except Exception as e:
-                self.console.error_log(f"Evaluation phase failed: {e}")
-                # Ensure we have a result even if evaluation failed
-                if prompt_result is None:
-                    prompt_result = Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Evaluation failed: {e}",
-                        isError=True,
-                        task=task,
-                    )
+            # Submit final answer to context (only if scenario is running)
+            if result.content and ctx.has_scenario:
+                await ctx.submit(result.content)
 
-        prompt_result.task = task
+            return result
 
-        return prompt_result
+        except Exception as e:
+            logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
+            return Trace(
+                reward=0.0,
+                done=True,
+                content=f"Agent failed with error: {e}",
+                isError=True,
+                info={"error": str(e)},
+            )
+        finally:
+            # Cleanup auto-created resources
+            await self._cleanup()
 
     async def _run_context(
         self, context: list[types.ContentBlock], *, max_steps: int = 10
@@ -356,6 +256,8 @@ class MCPAgent(ABC):
         final_response = None
         error = None
 
+        messages: list[Any] = []
+
         try:
             # Start with system messages
             messages = await self.get_system_messages()
@@ -380,19 +282,17 @@
 
                 # Check if we should stop
                 if response.done or not response.tool_calls:
-                    # Optional external ResponseAgent to decide whether to stop
-                    decision = "STOP"
-                    if self.response_agent is not None and response.content:
+                    # Use auto_respond to decide whether to stop
+                    decision: Literal["STOP", "CONTINUE"] = "STOP"
+                    if self.auto_respond and response.content:
                         try:
-                            decision = await self.response_agent.determine_response(
-                                response.content
-                            )
+                            from hud.agents.misc import ResponseAgent
+
+                            response_agent = ResponseAgent()
+                            decision = await response_agent.determine_response(response.content)
                         except Exception as e:
-                            self.console.warning_log(f"ResponseAgent failed: {e}")
+                            self.console.warning_log(f"Auto-respond failed: {e}")
                     if decision == "STOP":
-                        # Try to submit response through lifecycle tool
-                        await self._maybe_submit_response(response, messages)
-
                         self.console.debug("Stopping execution")
                         final_response = response
                         break
@@ -403,11 +303,7 @@
 
                 # 2. Execute tools
                 tool_calls = response.tool_calls
-                for tool_call in tool_calls:
-                    self.console.info_log(f"{tool_call}")
                 tool_results = await self.call_tools(tool_calls)
-                for tool_result in tool_results:
-                    self.console.info_log(f"{tool_result}")
 
                 # 3. Format tool results and add to messages
                 tool_messages = await self.format_tool_results(tool_calls, tool_results)
@@ -449,8 +345,17 @@
             is_error = False
 
         # Ensure all parameters are the correct type
+        # Use ctx.reward if already set (e.g., from scenario evaluate), otherwise 0.0
+        # Note: For v4 tasks with evaluate_tool, reward is set in __aexit__ after this returns,
+        # so callers should prefer ctx.reward over Trace.reward for the final result.
+        reward = 0.0
+        if self.ctx is not None:
+            ctx_reward = getattr(self.ctx, "reward", None)
+            if ctx_reward is not None:
+                reward = ctx_reward
+
         trace_params = {
-            "reward": 0.0,
+            "reward": reward,
            "done": True,
            "messages": messages,
            "content": final_response.content if final_response else error,
@@ -459,16 +364,13 @@
         }
         trace_result = Trace(**trace_params)
 
-        # Populate trace steps from current context
-        trace_result.populate_from_context()
-
         return trace_result
 
     async def call_tools(
         self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
     ) -> list[MCPToolResult]:
         """
-        Call a tool through the MCP client.
+        Call tools through the bound EvalContext.
 
         Args:
             tool_call: MCPToolCall or list of MCPToolCall
@@ -482,20 +384,17 @@
         if isinstance(tool_call, MCPToolCall):
             tool_call = [tool_call]
 
-        if self.mcp_client is None:
-            raise ValueError("Client is not initialized")
+        if self.ctx is None:
+            raise ValueError("Agent not bound to context - call run(ctx) first")
 
         results: list[MCPToolResult] = []
         for tc in tool_call:
             try:
                 self.console.debug(f"Calling tool: {tc}")
-                results.append(await self.mcp_client.call_tool(tc))
+                result = await self.ctx.call_tool(tc)
+                results.append(MCPToolResult(content=result.content, isError=result.isError))
             except TimeoutError as e:
                 self.console.error_log(f"Tool execution timed out: {e}")
-                try:
-                    await self.mcp_client.shutdown()
-                except Exception as close_err:
-                    self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
                 raise
             except Exception as e:
                 self.console.error_log(f"Tool execution failed: {e}")
@@ -514,8 +413,6 @@
         """
         Get response from the model including any tool calls.
 
-        NOTE: Subclasses should decorate this method with:
-        @hud.instrument(span_type="agent", record_args=False, record_result=True)
 
         Args:
             messages: Current conversation messages
@@ -575,148 +472,13 @@
 
         return await self.format_blocks(blocks)
 
-    async def _filter_tools(self) -> None:
-        """Apply tool filtering based on allowed/disallowed lists."""
-        # Get all tools from client
-        if self.mcp_client is None:
-            raise ValueError("MCP client is not initialized")
-
-        all_tools = await self.mcp_client.list_tools()
-
-        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-        for tool in all_tools:
-            if "response" in tool.name or tool.name == "response":
-                self.console.debug(f"Found response tool: '{tool.name}'")
-                # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                if "_" in tool.name:
-                    server_name = tool.name.split("_", 1)[0]
-                    response_tools_by_server[server_name] = tool.name
-                else:
-                    response_tools_by_server["_default"] = tool.name
-
-        # Add response tool to lifecycle tools BEFORE filtering
-        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-            # Get server names in order from mcp_config
-            server_names = list(self.mcp_client.mcp_config.keys())
-            self.console.debug(f"Server names: {server_names}")
-
-            # Try to find response tool from last server first
-            response_tool_name = None
-            for server_name in reversed(server_names):
-                if server_name in response_tools_by_server:
-                    response_tool_name = response_tools_by_server[server_name]
-                    self.console.debug(
-                        f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                    )
-                    break
-
-            # Fallback to any response tool
-            if not response_tool_name and response_tools_by_server:
-                response_tool_name = next(iter(response_tools_by_server.values()))
-                self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-
-            # Add to lifecycle tools if found
-            if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                self.response_tool_name = response_tool_name
-                self.lifecycle_tools.append(response_tool_name)
-            elif response_tool_name:
-                self.console.debug(
-                    f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                )
-                self.response_tool_name = response_tool_name
-            else:
-                self.console.debug("No response tools found or no mcp_config")
-
-        # Filter tools
-        self._available_tools = []
-        self._tool_map = {}
-
-        self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-        self.console.debug(f"Allowed tools: {self.allowed_tools}")
-        self.console.debug(f"Agent tools: {self.agent_tools}")
-        self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-        self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-
-        for tool in all_tools:
-            # Lifecycle tools (setup, evaluate, response) should always be included
-            is_lifecycle = tool.name in self.lifecycle_tools
-
-            # Check if tool should be included
-            if not is_lifecycle:
-                if self.allowed_tools and tool.name not in self.allowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                    continue
-                if self.agent_tools and tool.name not in self.agent_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                    continue
-                if tool.name in self.disallowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                    continue
-
-            self.console.debug(
-                f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-            )
-            self._available_tools.append(tool)
-            self._tool_map[tool.name] = tool
-
-        # Check if all required tools are available
-        if self.required_tools:
-            available_tool_names = {tool.name for tool in self._available_tools}
-            missing_tools = [
-                tool for tool in self.required_tools if tool not in available_tool_names
-            ]
-            if missing_tools:
-                raise ValueError(
-                    f"Required tools not available: {missing_tools}. "
-                    f"Available tools: {list(available_tool_names)}"
-                )
-
-        available_tools = self.get_available_tools()
-        self.console.info(
-            f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-        )
-
-    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
-        """Submit response through lifecycle tool if available.
-
-        Args:
-            response: The agent's response
-            messages: The current message history (will be modified in-place)
-        """
-        if self.response_tool_name:
-            self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
-            try:
-                # Call the response tool with the agent's response
-                response_tool_call = MCPToolCall(
-                    name=self.response_tool_name, arguments={"response": response.content}
-                )
-                response_results = await self.call_tools(response_tool_call)
-
-                # Format and add the response tool results to messages
-                response_messages = await self.format_tool_results(
-                    [response_tool_call], response_results
-                )
-                messages.extend(response_messages)
-
-                # Mark the task as done
-                self.console.debug("Response lifecycle tool executed, marking task as done")
-            except Exception as e:
-                self.console.error_log(f"Response lifecycle tool failed: {e}")
-
-    async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
-        """Inject metadata into the metadata of the initialize request."""
-        if self.metadata:
-            patch_mcp_config(
-                mcp_config,
-                MCPConfigPatch(meta=self.metadata),
-            )
-        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
-
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
-        lifecycle_tool_names = self.lifecycle_tools
-        return [tool for tool in self._available_tools if tool.name not in lifecycle_tool_names]
+        if self._available_tools is None:
+            raise RuntimeError(
+                "Tools have not been initialized. Call initialize() before accessing available tools."  # noqa: E501
+            )
+        return self._available_tools
 
     def get_tool_schemas(self) -> list[dict]:
         """Get tool schemas in a format suitable for the model."""
@@ -752,65 +514,8 @@
 
     async def _cleanup(self) -> None:
         """Cleanup resources."""
-        # Clean up auto-created trace if any
-        if self._auto_trace_cm:
-            try:
-                self._auto_trace_cm.__exit__(None, None, None)
-                self.console.debug("Closed auto-created trace")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created trace: {e}")
-            finally:
-                self._auto_trace_cm = None
-
-        # Clean up auto-created client
-        if self._auto_created_client and self.mcp_client:
-            try:
-                await self.mcp_client.shutdown()
-                self.console.debug("Closed auto-created MCPClient")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created client: {e}")
-            finally:
-                self.mcp_client = None
-                self._auto_created_client = False
-
-    def _is_connection_error(self, e: Exception) -> bool:
-        """Check if an exception is a connection error."""
-        error_msg = str(e).lower()
-        return any(
-            pattern in error_msg
-            for pattern in [
-                "connection",
-                "connect",
-                "refused",
-                "failed",
-                "could not connect",
-                "mcp server",
-            ]
-        )
-
-    def _get_connection_error_message(self, e: Exception) -> str:
-        """Extract a helpful connection error message."""
-        import re
-
-        url_match = re.search(r"https?://[^\s]+", str(e))
-        url = url_match.group(0) if url_match else "the MCP server"
-        return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
-
-    def _handle_connection_error(self, e: Exception) -> None:
-        """Handle connection errors with helpful messages."""
-        if self._is_connection_error(e):
-            msg = self._get_connection_error_message(e)
-            # Always show connection errors, not just when logging is enabled
-            self.console.error(f"❌ {msg}")
-            self.console.info("💡 Make sure the MCP server is started before running the agent.")
-
-            # For localhost, provide specific instructions
-            error_str = str(e).lower()
-            if "localhost" in error_str or "127.0.0.1" in error_str:
-                self.console.info("   Run 'hud dev' in another terminal to start the MCP server")
-
-            raise RuntimeError(msg) from e
-        raise
+        # Clear context reference
+        self.ctx = None
 
 
 def _format_error_result(error_message: str) -> MCPToolResult:
@@ -824,14 +529,45 @@ def text_to_blocks(text: str) -> list[types.ContentBlock]:
 def find_reward(result: MCPToolResult) -> float:
     """Find the reward in the result.
 
-    Agent accepts "reward", "grade", "score"
+    Agent accepts "reward", "grade", "score", or weighted subscores
 
+    If isError is True, return 0.0 (error results should not contribute positive reward).
     If not found, return 0.0
     """
+    # Error results should return 0.0 - don't extract reward from error responses
+    if result.isError:
+        logger.warning("Evaluate tool returned error, using reward=0.0")
+        return 0.0
+
     accept_keys = ["reward", "grade", "score"]
+
+    # Check for direct reward/grade/score keys
     for key in accept_keys:
         if isinstance(result.structuredContent, dict) and key in result.structuredContent:
             return result.structuredContent[key]
+
+    # Check for subscores and weights format
+    if (
+        isinstance(result.structuredContent, dict)
+        and "subscores" in result.structuredContent
+        and "weights" in result.structuredContent
+    ):
+        subscores = result.structuredContent["subscores"]
+        weights = result.structuredContent["weights"]
+        if isinstance(subscores, dict) and isinstance(weights, dict):
+            try:
+                # Multiply each subscore by its corresponding weight and sum
+                reward = sum(
+                    float(subscores[key]) * float(weights.get(key, 0.0))
+                    for key in subscores
+                    if key in weights
+                )
+                return reward
+            except (ValueError, TypeError) as e:
+                logger.error("Failed to parse subscores/weights: %s", e)
+                return 0.0
+
+    # Check for reward in JSON text content
     if isinstance(result.content, list):
         for content in result.content:
             if isinstance(content, types.TextContent):
@@ -842,6 +578,8 @@ def find_reward(result: MCPToolResult) -> float:
                     return value
                 except json.JSONDecodeError:
                     pass
+
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0
 
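One behavioral addition worth calling out from the diff above: `find_reward` now also accepts a weighted-subscores payload and computes a plain weighted sum, returning 0.0 for error results or parse failures. A worked example of that arithmetic, using hypothetical subscore names:

```python
# Hypothetical structuredContent an evaluate tool might return:
structured = {
    "subscores": {"accuracy": 0.8, "completeness": 0.5},
    "weights": {"accuracy": 0.75, "completeness": 0.25},
}

# find_reward pairs each subscore with its weight and sums the products:
#   0.8 * 0.75 + 0.5 * 0.25 = 0.6 + 0.125 = 0.725
reward = sum(
    float(structured["subscores"][k]) * float(structured["weights"].get(k, 0.0))
    for k in structured["subscores"]
    if k in structured["weights"]
)
assert abs(reward - 0.725) < 1e-9
```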