PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274)

hud/__init__.py +27 -7
hud/agents/__init__.py +11 -5
hud/agents/base.py +220 -500
hud/agents/claude.py +200 -240
hud/agents/gemini.py +275 -0
hud/agents/gemini_cua.py +335 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +41 -36
hud/agents/openai.py +291 -292
hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
hud/agents/operator.py +211 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +379 -210
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +376 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/cli/__init__.py +461 -545
hud/cli/analyze.py +43 -5
hud/cli/build.py +664 -110
hud/cli/debug.py +8 -5
hud/cli/dev.py +882 -734
hud/cli/eval.py +782 -668
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/push.py +29 -11
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +108 -6
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +69 -0
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +40 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +327 -0
hud/datasets/runner.py +192 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +50 -0
hud/environment/connection.py +206 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +109 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +694 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +112 -0
hud/environment/scenarios.py +493 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +218 -0
hud/environment/tests/test_environment.py +161 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +201 -0
hud/environment/tests/test_scenarios.py +280 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +674 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +185 -0
hud/eval/manager.py +466 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +340 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +145 -0
hud/eval/types.py +63 -0
hud/eval/utils.py +183 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +151 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +158 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +16 -2
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +4 -0
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +167 -57
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +61 -3
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.1.dist-info/METADATA +264 -0
hud_python-0.5.1.dist-info/RECORD +299 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0

hud/agents/grounded_openai.py CHANGED Viewed

@@ -3,96 +3,91 @@
 from __future__ import annotations
 import json
-from typing import Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
+from pydantic import ConfigDict, field_validator
-from hud import instrument
 from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult
-from .openai_chat_generic import GenericOpenAIChatAgent
-class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
-    """OpenAI agent that uses a separate grounding model for element detection.
-    This agent:
-    - Exposes only a synthetic "computer" tool to the planning model
-    - Intercepts tool calls to ground element descriptions to coordinates
-    - Converts grounded results to real computer tool calls
-    - Maintains screenshot state for grounding operations
-    The architecture separates concerns:
-    - Planning model (GPT-4o etc) focuses on high-level reasoning
-    - Grounding model (Qwen2-VL etc) handles visual element detection
-    """
-    metadata: ClassVar[dict[str, Any]] = {}
-    def __init__(
-        self,
-        *,
-        grounder_config: GrounderConfig,
-        model_name: str = "gpt-4o-mini",
-        allowed_tools: list[str] | None = None,
-        append_setup_output: bool = False,
-        system_prompt: str | None = None,
-        **kwargs: Any,
-    ) -> None:
-        """Initialize the grounded OpenAI agent.
-        Args:
-            grounder_config: Configuration for the grounding model
-            openai_client: OpenAI client for the planning model
-            model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
-            real_computer_tool_name: Name of the actual computer tool to execute
-            **kwargs: Additional arguments passed to GenericOpenAIChatAgent
-        """
-        # Set defaults for grounded agent
-        if allowed_tools is None:
-            allowed_tools = ["computer"]
-        if system_prompt is None:
-            system_prompt = (
-                "You are a helpful AI assistant that can control the computer "
-                "through visual interaction.\n\n"
-                "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
-                "1. First, describe what you see on the screen\n"
-                "2. Explain what you plan to do and why\n"
-                "3. Then use the computer tool with natural language descriptions\n\n"
-                "For example:\n"
-                "- 'I can see a login form with username and password fields. "
-                "I need to click on the username field first.'\n"
-                "- 'There's a blue submit button at the bottom. "
-                "I'll click on it to submit the form.'\n"
-                "- 'I notice a red close button in the top right corner. "
-                "I'll click it to close this dialog.'\n\n"
-                "Use descriptive element descriptions like:\n"
-                "- Colors: 'red button', 'blue link', 'green checkmark'\n"
-                "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
-                "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
-                "- Element type: 'text field', 'dropdown menu', 'checkbox'"
-            )
-        super().__init__(
-            model_name=model_name,
-            allowed_tools=allowed_tools,
-            append_setup_output=append_setup_output,
-            system_prompt=system_prompt,
-            **kwargs,
-        )
-        self.grounder = Grounder(grounder_config)
-        self.grounded_tool = None
-    async def initialize(self, task: Any = None) -> None:
-        """Initialize the agent and create the grounded tool with mcp_client."""
-        # Call parent initialization first
-        await super().initialize(task)
-        if self.mcp_client is None:
-            raise ValueError("mcp_client must be initialized before creating grounded tool")
+from hud.utils.types import with_signature
+if TYPE_CHECKING:
+    from hud.types import BaseAgentConfig
+from .base import BaseCreateParams
+from .openai_chat import OpenAIChatAgent, OpenAIChatConfig
+DEFAULT_GROUNDED_PROMPT = (
+    "You are a helpful AI assistant that can control the computer through visual "
+    "interaction.\n\n"
+    "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
+    "1. First, describe what you see on the screen.\n"
+    "2. Explain what you plan to do and why.\n"
+    "3. Then use the computer tool with natural language descriptions.\n\n"
+    "Use descriptive element descriptions:\n"
+    '- Colors ("red button", "blue link")\n'
+    '- Position ("top right corner", "left sidebar")\n'
+    '- Text content ("Submit button", "Login link")\n'
+    '- Element type ("text field", "dropdown")'
+)
+class GroundedOpenAIConfig(OpenAIChatConfig):
+    """Configuration for grounded OpenAI chat agent."""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    grounder_config: GrounderConfig
+    model: str = "gpt-4o-mini"
+    allowed_tools: list[str] | None = None  # Default set in validator
+    append_setup_output: bool = False
+    system_prompt: str | None = DEFAULT_GROUNDED_PROMPT
+    @field_validator("grounder_config", mode="before")
+    @classmethod
+    def _coerce_grounder_config(cls, value: GrounderConfig | dict[str, Any]) -> GrounderConfig:
+        if isinstance(value, GrounderConfig):
+            return value
+        if isinstance(value, dict):
+            return GrounderConfig(**value)
+    @field_validator("allowed_tools", mode="before")
+    @classmethod
+    def _default_allowed_tools(cls, value: list[str] | None) -> list[str] | None:
+        if value is None:
+            return ["computer"]
+        return value
+class GroundedOpenAICreateParams(BaseCreateParams, GroundedOpenAIConfig):
+    pass
+class GroundedOpenAIChatAgent(OpenAIChatAgent):
+    """OpenAI chat agent that pipes 'computer' tool calls through a vision grounder."""
+    metadata: ClassVar[dict[str, Any] | None] = None
+    config_cls: ClassVar[type[BaseAgentConfig]] = GroundedOpenAIConfig
+    @with_signature(GroundedOpenAICreateParams)
+    @classmethod
+    def create(cls, **kwargs: Any) -> GroundedOpenAIChatAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
+        from .base import MCPAgent
+        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]
+    def __init__(self, params: GroundedOpenAICreateParams | None = None, **kwargs: Any) -> None:
+        super().__init__(params, **kwargs)  # type: ignore[arg-type]
+        self.config: GroundedOpenAIConfig  # type: ignore[assignment]
+        self.grounder = Grounder(self.config.grounder_config)
+        self.grounded_tool: GroundedComputerTool | None = None
+    def _on_tools_ready(self) -> None:
+        """Create the grounded tool after context is bound."""
+        if self.ctx is None:
+            raise ValueError("ctx must be set before creating grounded tool")
         self.grounded_tool = GroundedComputerTool(
-            grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
+            grounder=self.grounder, ctx=self.ctx, computer_tool_name="computer"
         )
     def get_tool_schemas(self) -> list[Any]:
@@ -108,11 +103,6 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
             return []
         return [self.grounded_tool.get_openai_tool_schema()]
-    @instrument(
-        span_type="agent",
-        record_args=False,
-        record_result=True,
-    )
     async def get_response(self, messages: Any) -> AgentResponse:
         """Get response from the planning model and handle grounded tool calls.
@@ -142,11 +132,9 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
         )
         if not has_image:
-            if self.mcp_client is None:
-                raise ValueError("mcp_client is not initialized")
-            screenshot_result = await self.mcp_client.call_tool(
-                MCPToolCall(name="computer", arguments={"action": "screenshot"})
-            )
+            if self.ctx is None:
+                raise ValueError("ctx is not initialized")
+            screenshot_result = await self.ctx.call_tool(("computer", {"action": "screenshot"}))
             for block in screenshot_result.content:
                 # Check for ImageContent type from MCP
@@ -169,8 +157,8 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
         protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
         extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
-        response = await self.oai.chat.completions.create(
-            model=self.model_name,
+        response = await self.oai.chat.completions.create(  # type: ignore
+            model=self.config.model,
             messages=messages,
             tools=tool_schemas,
             parallel_tool_calls=False,
@@ -193,6 +181,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
         if not msg.tool_calls:
             return AgentResponse(
                 content=msg.content or "",
+                reasoning=msg.reasoning_content,
                 tool_calls=[],
                 done=choice.finish_reason in ("stop", "length"),
                 raw=response,
@@ -203,6 +192,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
         if tc.function.name != "computer":
             return AgentResponse(
                 content=f"Error: Model called unexpected tool '{tc.function.name}'",
+                reasoning=msg.reasoning_content,
                 tool_calls=[],
                 done=True,
                 raw=response,
@@ -213,13 +203,21 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
             args = json.loads(tc.function.arguments or "{}")
         except json.JSONDecodeError:
             return AgentResponse(
-                content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
+                content="Error: Invalid tool arguments",
+                reasoning=msg.reasoning_content,
+                tool_calls=[],
+                done=True,
+                raw=response,
             )
         tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
         return AgentResponse(
-            content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
+            content=msg.content or "",
+            reasoning=msg.reasoning_content,
+            tool_calls=[tool_call],
+            done=False,
+            raw=response,
         )
     async def call_tools(

hud/agents/misc/integration_test_agent.py CHANGED Viewed

@@ -1,41 +1,72 @@
 from __future__ import annotations
-from typing import Any
+from typing import TYPE_CHECKING, Any, ClassVar
-from hud.agents.base import MCPAgent, find_reward
-from hud.types import AgentResponse, Task, Trace
+from hud.agents.base import MCPAgent
+from hud.types import AgentResponse, BaseAgentConfig, Trace
+if TYPE_CHECKING:
+    from hud.eval.context import EvalContext
 class IntegrationTestRunner(MCPAgent):
+    """Special agent that runs integration tests by executing tools directly.
+    Unlike regular agents, this doesn't run an LLM loop - it executes
+    integration_test_tool and evaluate_tool in sequence to verify tool behavior.
+    """
+    metadata: ClassVar[dict[str, Any] | None] = {}
+    config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
     def __init__(self, **kwargs: Any) -> None:
         kwargs["auto_trace"] = False
         super().__init__(**kwargs)
-        self.metadata = {}
-    async def run(self, task: Task, max_steps: int = 10) -> Trace:
+    async def run(
+        self,
+        ctx: EvalContext,
+        *,
+        max_steps: int = 10,
+    ) -> Trace:
+        """Run integration test by executing tools directly.
+        The EvalContext should have integration_test_tool and evaluate_tool
+        configured in its metadata or environment setup.
+        """
+        from hud.eval.context import EvalContext
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
+        self.ctx = ctx
         try:
-            # Initialize using base to set up client and telemetry correctly
-            await self.initialize(task)
+            # Initialize tools from context
+            if not self._initialized:
+                await self._initialize_from_ctx(ctx)
-            # Validate task shape
-            if not getattr(task, "integration_test_tool", None):
+            self.console.info(f"Full system prompt: {self.system_prompt}")
+            # For integration tests, we expect the context's environment to have
+            # _setup_calls, _integration_test_calls, and _evaluate_calls configured
+            env = ctx
+            # Run integration test tool (stored in environment metadata or separate list)
+            integration_test_calls = getattr(env, "_integration_test_calls", [])
+            if not integration_test_calls:
                 raise ValueError(
-                    "--integration-test requires task.integration_test_tool (single call)"
+                    "--integration-test requires integration_test_tool to be configured"
                 )
-            elif not getattr(task, "evaluate_tool", None):
-                raise ValueError("--integration-test requires task.evaluate_tool (single call)")
-            if task.setup_tool:
-                _ = await self.call_tools(task.setup_tool)
-            _ = await self.call_tools(task.integration_test_tool)
-            evaluate_result = await self.call_tools(task.evaluate_tool)
+            for name, args in integration_test_calls:
+                await ctx.call_tool((name, args))
-            reward = float(find_reward(evaluate_result[0])) if evaluate_result else 0.0
+            # The evaluate phase runs automatically when ctx exits,
+            # but we can also get the reward from ctx.reward after
+            return Trace(done=True, reward=ctx.reward or 0.0, info={})
-            return Trace(done=True, reward=reward, info={})
         finally:
-            # Ensure resources are cleaned up so the CLI can exit cleanly
             await self._cleanup()
     # Stub implementations to satisfy abstract base class; not used in --integration-test path

hud/agents/misc/response_agent.py CHANGED Viewed

@@ -1,14 +1,36 @@
 from __future__ import annotations
-import os
+import logging
 from typing import Literal
 from openai import AsyncOpenAI
 from hud.settings import settings
+logger = logging.getLogger(__name__)
 ResponseType = Literal["STOP", "CONTINUE"]
+DEFAULT_SYSTEM_PROMPT = """\
+You are an assistant that helps determine the appropriate response to an agent's message.
+You will receive messages from an agent that is performing tasks for a user.
+Your job is to analyze these messages and respond with one of the following:
+- STOP: If the agent indicates it has successfully completed a task or is stuck,
+  struggling or says it cannot complete the task, even if phrased as a question
+  like "I have entered the right values into this form. Would you like me to do
+  anything else?" or "Here is the website. Is there any other information you
+  need?" or if the agent has strongly determined it wants to stop the task like
+  "The task is infeasible. Can I help you with something else?"
+- CONTINUE: If the agent is asking for clarification before proceeding with a task
+  like "I'm about to clear cookies from this website. Would you like me to proceed?"
+  or "I've entered the right values into this form. Would you like me to continue
+  with the rest of the task?"
+Respond ONLY with one of these two options."""
 class ResponseAgent:
     """
@@ -17,48 +39,30 @@ class ResponseAgent:
     """
     def __init__(
-        self, api_key: str | None = None, model: str = "gpt-4o", system_prompt: str | None = None
+        self,
+        model: str = "gpt-4o",
+        system_prompt: str | None = None,
     ) -> None:
         """
         Initialize the ResponseAgent.
         Args:
-            api_key: The API key to use for the OpenAI client
-            model: The model to use for the OpenAI client (default: "gpt-4o")
-            system_prompt: The system prompt to use for the OpenAI client
+            model: The model to use via HUD inference gateway (default: "gpt-4o").
+                   Supports any model available through inference.hud.ai.
+            system_prompt: Optional custom system prompt for determining responses.
         """
-        self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
-        if not self.api_key:
+        api_key = settings.api_key
+        if not api_key:
             raise ValueError(
-                "OpenAI API key must be provided or set as OPENAI_API_KEY environment variable"
+                "HUD API key is required for auto_respond. Set HUD_API_KEY environment variable."
             )
-        self.client = AsyncOpenAI(api_key=self.api_key)
-        self.model = model
-        self.system_prompt = (
-            system_prompt
-            or """
-        You are an assistant that helps determine the appropriate response to an agent's message.
-        You will receive messages from an agent that is performing tasks for a user.
-        Your job is to analyze these messages and respond with one of the following:
-        - STOP: If the agent indicates it has successfully completed a task or is stuck,
-          struggling or says it cannot complete the task, even if phrased as a question
-          like "I have entered the right values into this form. Would you like me to do
-          anything else?" or "Here is the website. Is there any other information you
-          need?" or if the agent has strongly determined it wants to stop the task like
-          "The task is infeasible. Can I help you with something else?"
-        - CONTINUE: If the agent is asking for clarification before proceeding with a task
-          like "I'm about to clear cookies from this website. Would you like me to proceed?"
-          or "I've entered the right values into this form. Would you like me to continue
-          with the rest of the task?"
-        Respond ONLY with one of these two options.
-        """
+        self.client: AsyncOpenAI = AsyncOpenAI(
+            base_url=settings.hud_gateway_url,
+            api_key=api_key,
         )
+        self.model = model
+        self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
     async def determine_response(self, agent_message: str) -> ResponseType:
         """
@@ -80,8 +84,8 @@ class ResponseAgent:
                         "content": f"Agent message: {agent_message}\n\nWhat is the appropriate response?",  # noqa: E501
                     },
                 ],
-                temperature=0.1,  # Low temperature for more deterministic responses
-                max_tokens=5,  # We only need a short response
+                temperature=0.1,
+                max_tokens=5,
             )
             response_text = response.choices[0].message.content
@@ -96,5 +100,6 @@ class ResponseAgent:
             else:
                 return "CONTINUE"
-        except Exception:
+        except Exception as e:
+            logger.warning("Auto-respond failed: %s", e)
             return "CONTINUE"  # Default to continue on error

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl