PyPI - hud-python - Versions diffs - 0.5.39__tar.gz → 0.5.41__tar.gz - Mend

hud-python 0.5.39__tar.gz → 0.5.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) hide show

{hud_python-0.5.39 → hud_python-0.5.41}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.5.39
+Version: 0.5.41
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.5.39 → hud_python-0.5.41}/hud/agents/gemini.py RENAMED Viewed

@@ -10,6 +10,11 @@ from google import genai
 from google.genai import types as genai_types
 from hud.settings import settings
+from hud.tools.computer.gemini import (
+    PREDEFINED_COMPUTER_USE_FUNCTIONS,
+    normalize_gemini_computer_use_args,
+)
+from hud.tools.computer.settings import computer_settings
 from hud.types import AgentType, BaseAgentConfig, InferenceResult, MCPToolCall, MCPToolResult
 from hud.utils.hud_console import HUDConsole
 from hud.utils.types import with_signature
@@ -107,10 +112,17 @@ class GeminiAgent(MCPAgent):
         self.top_p = self.config.top_p
         self.top_k = self.config.top_k
         self.max_output_tokens = self.config.max_output_tokens
+        self.thinking_level = self.config.thinking_level
+        self.include_thoughts = self.config.include_thoughts
         self.hud_console = HUDConsole(logger=logger)
         # Track mapping from Gemini tool names to MCP tool names
         self._gemini_to_mcp_tool_map: dict[str, str] = {}
+        self._computer_tool_name: str | None = None
+        self.excluded_predefined_functions = list(self.config.excluded_predefined_functions)
+        self.max_recent_turn_with_screenshots = (
+            computer_settings.GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS
+        )
         self.gemini_tools: genai_types.ToolListUnion = []
     def _on_tools_ready(self) -> None:
@@ -146,6 +158,7 @@ class GeminiAgent(MCPAgent):
     async def get_response(self, messages: list[genai_types.Content]) -> InferenceResult:
         """Get response from Gemini including any tool calls."""
+        self._remove_old_screenshots(messages)
         tools = self.gemini_tools
         citations_enabled = bool(
@@ -154,6 +167,18 @@ class GeminiAgent(MCPAgent):
         if citations_enabled and not self._has_google_search_tool():
             tools = [*list(tools), genai_types.Tool(google_search=genai_types.GoogleSearch())]
+        thinking_config = None
+        if self.thinking_level is not None or self.include_thoughts:
+            thinking_level = (
+                genai_types.ThinkingLevel(self.thinking_level.upper())
+                if self.thinking_level is not None
+                else None
+            )
+            thinking_config = genai_types.ThinkingConfig(
+                thinking_level=thinking_level,
+                include_thoughts=self.include_thoughts,
+            )
         # Build generate content config
         generate_config = genai_types.GenerateContentConfig(
             temperature=self.temperature,
@@ -162,6 +187,7 @@ class GeminiAgent(MCPAgent):
             max_output_tokens=self.max_output_tokens,
             tools=tools,
             system_instruction=self.system_prompt,
+            thinking_config=thinking_config,
         )
         # Use async API to avoid blocking the event loop
@@ -181,8 +207,22 @@ class GeminiAgent(MCPAgent):
         collected_tool_calls: list[MCPToolCall] = []
         if not response.candidates:
-            self.hud_console.warning("Response has no candidates")
-            return result
+            detail_parts = []
+            for attr in ("prompt_feedback", "usage_metadata"):
+                value = getattr(response, attr, None)
+                if value is None:
+                    continue
+                if hasattr(value, "model_dump_json"):
+                    value_repr = value.model_dump_json()
+                elif hasattr(value, "model_dump"):
+                    value_repr = repr(value.model_dump())
+                else:
+                    value_repr = repr(value)
+                detail_parts.append(f"{attr}={value_repr}")
+            details = "; ".join(detail_parts) if detail_parts else "no response metadata"
+            raise RuntimeError(
+                f"Gemini response returned no candidates for model {self.config.model}. {details}"
+            )
         candidate = response.candidates[0]
@@ -282,11 +322,24 @@ class GeminiAgent(MCPAgent):
             return None
         func_name = part.function_call.name or ""
-        mcp_tool_name = self._gemini_to_mcp_tool_map.get(func_name, func_name)
         raw_args = dict(part.function_call.args) if part.function_call.args else {}
+        mcp_tool_name = self._gemini_to_mcp_tool_map.get(func_name)
+        if mcp_tool_name:
+            return MCPToolCall(
+                name=mcp_tool_name,
+                arguments=raw_args,
+            )
+        if self._computer_tool_name and func_name in PREDEFINED_COMPUTER_USE_FUNCTIONS:
+            return MCPToolCall(
+                name=self._computer_tool_name,
+                arguments=normalize_gemini_computer_use_args(func_name, raw_args),
+                gemini_name=func_name,  # type: ignore[arg-type]
+            )
         return MCPToolCall(
-            name=mcp_tool_name,
+            name=func_name,
             arguments=raw_args,
         )
@@ -303,18 +356,47 @@ class GeminiAgent(MCPAgent):
             # Convert MCP tool results to Gemini format
             response_dict: dict[str, Any] = {}
+            is_computer_call = (
+                self._computer_tool_name is not None and tool_call.name == self._computer_tool_name
+            )
             if result.isError:
                 # Extract error message from content
                 error_msg = "Tool execution failed"
                 for content in result.content:
                     if isinstance(content, types.TextContent):
+                        if content.text.startswith("__URL__:"):
+                            continue
                         error_msg = content.text
                         break
                 response_dict["error"] = error_msg
+                if is_computer_call:
+                    response_dict["url"] = self._extract_url(result) or "about:blank"
             else:
                 # Process success content
                 response_dict["success"] = True
+            screenshot_parts: list[genai_types.FunctionResponsePart] = []
+            if is_computer_call:
+                url = self._extract_url(result)
+                for content in result.content:
+                    if isinstance(content, types.ImageContent):
+                        import base64
+                        image_bytes = base64.b64decode(content.data)
+                        screenshot_parts.append(
+                            genai_types.FunctionResponsePart(
+                                inline_data=genai_types.FunctionResponseBlob(
+                                    mime_type=content.mimeType or "image/png",
+                                    data=image_bytes,
+                                )
+                            )
+                        )
+                response_dict["url"] = url or "about:blank"
+                if tool_call.arguments and tool_call.arguments.get("safety_decision"):
+                    response_dict["safety_acknowledgement"] = True
+            else:
                 # Add text content to response
                 for content in result.content:
                     if isinstance(content, types.TextContent):
@@ -325,6 +407,7 @@ class GeminiAgent(MCPAgent):
             function_response = genai_types.FunctionResponse(
                 name=gemini_name,
                 response=response_dict,
+                parts=screenshot_parts if screenshot_parts else None,
             )
             function_responses.append(function_response)
@@ -336,6 +419,13 @@ class GeminiAgent(MCPAgent):
             )
         ]
+    @staticmethod
+    def _extract_url(result: MCPToolResult) -> str | None:
+        for content in result.content:
+            if isinstance(content, types.TextContent) and content.text.startswith("__URL__:"):
+                return content.text.replace("__URL__:", "", 1)
+        return None
     def _map_role(self, role: str) -> str:
         """Gemini uses 'model' instead of 'assistant' for non-user turns."""
         if role == "assistant":
@@ -356,6 +446,7 @@ class GeminiAgent(MCPAgent):
         Uses shared categorize_tools() for role-based exclusion.
         """
         self._gemini_to_mcp_tool_map = {}
+        self._computer_tool_name = None
         self.gemini_tools = []
         categorized = self._categorized_tools
@@ -417,12 +508,19 @@ class GeminiAgent(MCPAgent):
         Returns:
             Gemini-specific tool or None if not supported
         """
-        # Currently Gemini native tools are similar to function tools
-        # This method exists for future expansion (e.g., computer_use native support)
         match spec.api_type:
             case "computer_use":
-                # Gemini computer use is still a function tool
-                return self._to_gemini_tool(tool)
+                self._computer_tool_name = tool.name
+                excluded_functions = [
+                    *self.excluded_predefined_functions,
+                    *self._colliding_predefined_function_names(tool.name),
+                ]
+                return genai_types.Tool(
+                    computer_use=genai_types.ComputerUse(
+                        environment=genai_types.Environment.ENVIRONMENT_BROWSER,
+                        excluded_predefined_functions=excluded_functions,
+                    )
+                )
             case _:
                 # Unknown native type - try as function tool
                 logger.debug(
@@ -432,6 +530,49 @@ class GeminiAgent(MCPAgent):
                 )
                 return self._to_gemini_tool(tool)
+    def _colliding_predefined_function_names(self, computer_tool_name: str) -> list[str]:
+        """Exclude predefined computer actions shadowed by generic MCP tools."""
+        if not self._available_tools:
+            return []
+        generic_names = {
+            tool.name
+            for tool in self._available_tools
+            if tool.name != computer_tool_name and not self.resolve_native_spec(tool)
+        }
+        return sorted(set(PREDEFINED_COMPUTER_USE_FUNCTIONS) & generic_names)
+    def _remove_old_screenshots(self, messages: list[genai_types.Content]) -> None:
+        """Drop older Gemini Computer Use screenshots to keep context growth bounded."""
+        if self._computer_tool_name is None:
+            return
+        turn_with_screenshots_found = 0
+        for content in reversed(messages):
+            if content.role != "user" or not content.parts:
+                continue
+            has_screenshot = any(
+                part.function_response
+                and part.function_response.parts
+                and part.function_response.name in PREDEFINED_COMPUTER_USE_FUNCTIONS
+                for part in content.parts
+            )
+            if not has_screenshot:
+                continue
+            turn_with_screenshots_found += 1
+            if turn_with_screenshots_found <= self.max_recent_turn_with_screenshots:
+                continue
+            for part in content.parts:
+                if (
+                    part.function_response
+                    and part.function_response.parts
+                    and part.function_response.name in PREDEFINED_COMPUTER_USE_FUNCTIONS
+                ):
+                    part.function_response.parts = None
     def _to_gemini_tool(self, tool: types.Tool) -> genai_types.Tool | None:
         """Convert a single MCP tool to Gemini function tool format.

hud_python-0.5.41/hud/agents/gemini_cua.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Gemini Computer Use preset agent.
+The native Computer Use implementation lives in GeminiAgent. This class only
+keeps the gemini_cua agent type/default model preset.
+"""
+from __future__ import annotations
+from typing import Any, ClassVar
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentType, BaseAgentConfig
+from hud.utils.types import with_signature
+from .base import MCPAgent
+from .gemini import GeminiAgent
+from .types import GeminiCUAConfig, GeminiCUACreateParams
+class GeminiCUAAgent(GeminiAgent):
+    """
+    Gemini Computer Use Agent that extends GeminiAgent with computer use capabilities.
+    This agent uses Gemini's native computer use capabilities but executes
+    tools through MCP servers instead of direct implementation.
+    """
+    metadata: ClassVar[dict[str, Any] | None] = {
+        "display_width": computer_settings.GEMINI_COMPUTER_WIDTH,
+        "display_height": computer_settings.GEMINI_COMPUTER_HEIGHT,
+    }
+    required_tools: ClassVar[list[str]] = ["gemini_computer"]
+    config_cls: ClassVar[type[BaseAgentConfig]] = GeminiCUAConfig
+    @classmethod
+    def agent_type(cls) -> AgentType:
+        """Return the AgentType for Gemini CUA."""
+        return AgentType.GEMINI_CUA
+    @with_signature(GeminiCUACreateParams)
+    @classmethod
+    def create(cls, **kwargs: Any) -> GeminiCUAAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]

{hud_python-0.5.39 → hud_python-0.5.41}/hud/agents/tests/test_gemini.py RENAMED Viewed

@@ -229,6 +229,33 @@ class TestGeminiAgent:
             assert response.tool_calls == []
             assert response.done is True
+    @pytest.mark.asyncio
+    async def test_get_response_raises_on_no_candidates(
+        self, mock_gemini_client: MagicMock
+    ) -> None:
+        """A no-candidate Gemini response should fail loudly, not submit an empty answer."""
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = GeminiAgent.create(
+                model_client=mock_gemini_client,
+                model="gemini-3-flash-preview",
+                validate_api_key=False,
+            )
+            agent.gemini_tools = []
+            agent._initialized = True
+            mock_response = MagicMock()
+            mock_response.candidates = []
+            mock_response.prompt_feedback = "blocked"
+            mock_response.usage_metadata = None
+            mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
+            messages = [
+                genai_types.Content(role="user", parts=[genai_types.Part.from_text(text="Status?")])
+            ]
+            with pytest.raises(RuntimeError, match="returned no candidates"):
+                await agent.get_response(messages)
     @pytest.mark.asyncio
     async def test_get_response_with_thinking(self, mock_gemini_client: MagicMock) -> None:
         """Test getting response with thinking content."""
@@ -271,6 +298,42 @@ class TestGeminiAgent:
             assert response.content == "Here is my answer"
             assert response.reasoning == "Let me reason through this..."
+    @pytest.mark.asyncio
+    async def test_get_response_passes_thinking_config(self, mock_gemini_client: MagicMock) -> None:
+        """Gemini 3 thinking options should be passed to GenerateContentConfig."""
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = GeminiAgent.create(
+                model_client=mock_gemini_client,
+                model="gemini-3-flash-preview",
+                validate_api_key=False,
+                thinking_level="high",
+                include_thoughts=True,
+            )
+            agent.gemini_tools = []
+            agent._initialized = True
+            mock_response = MagicMock()
+            mock_candidate = MagicMock()
+            text_part = MagicMock()
+            text_part.text = "Answer"
+            text_part.function_call = None
+            text_part.thought = False
+            mock_candidate.content = MagicMock()
+            mock_candidate.content.parts = [text_part]
+            mock_response.candidates = [mock_candidate]
+            mock_gemini_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
+            messages = [
+                genai_types.Content(role="user", parts=[genai_types.Part.from_text(text="Hi")])
+            ]
+            await agent.get_response(messages)
+            config = mock_gemini_client.aio.models.generate_content.call_args.kwargs["config"]
+            assert config.thinking_config is not None
+            assert config.thinking_config.include_thoughts is True
+            assert config.thinking_config.thinking_level.value == "HIGH"
     @pytest.mark.asyncio
     async def test_convert_tools_for_gemini(self, mock_gemini_client: MagicMock) -> None:
         """Test converting MCP tools to Gemini format."""
@@ -298,6 +361,167 @@ class TestGeminiAgent:
         assert gemini_tool.function_declarations is not None
         assert gemini_tool.function_declarations[0].name == "my_tool"
+    @pytest.mark.asyncio
+    async def test_regular_agent_uses_native_computer_use(
+        self, mock_gemini_client: MagicMock
+    ) -> None:
+        """GeminiAgent should register GeminiComputerTool as native Computer Use."""
+        computer_tool = types.Tool(
+            name="gemini_computer",
+            description="Control computer with mouse, keyboard, and screenshots",
+            inputSchema={"type": "object", "properties": {}},
+        )
+        computer_tool.meta = {
+            "native_tools": {
+                "gemini": {
+                    "api_type": "computer_use",
+                    "api_name": "gemini_computer",
+                    "role": "computer",
+                    "supported_models": ["gemini-3-flash-preview"],
+                }
+            }
+        }
+        tools = [
+            computer_tool,
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = GeminiAgent.create(
+            model_client=mock_gemini_client,
+            model="gemini-3-flash-preview",
+            validate_api_key=False,
+            excluded_predefined_functions=["drag_and_drop"],
+        )
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
+        assert agent._computer_tool_name == "gemini_computer"
+        assert len(agent.gemini_tools) == 1
+        computer_tool = agent.gemini_tools[0]
+        assert isinstance(computer_tool, genai_types.Tool)
+        assert computer_tool.computer_use is not None
+        assert computer_tool.computer_use.excluded_predefined_functions == ["drag_and_drop"]
+    @pytest.mark.asyncio
+    async def test_computer_use_excludes_colliding_generic_tool_names(
+        self, mock_gemini_client: MagicMock
+    ) -> None:
+        """Generic tools named like predefined actions should not be hijacked."""
+        computer_tool = types.Tool(
+            name="gemini_computer",
+            description="Control computer with mouse, keyboard, and screenshots",
+            inputSchema={"type": "object", "properties": {}},
+        )
+        computer_tool.meta = {
+            "native_tools": {
+                "gemini": {
+                    "api_type": "computer_use",
+                    "api_name": "gemini_computer",
+                    "role": "computer",
+                    "supported_models": ["gemini-3-flash-preview"],
+                }
+            }
+        }
+        navigate_tool = types.Tool(
+            name="navigate",
+            description="A non-computer navigation helper",
+            inputSchema={"type": "object", "properties": {"url": {"type": "string"}}},
+        )
+        ctx = MockEvalContext(tools=[computer_tool, navigate_tool])
+        agent = GeminiAgent.create(
+            model_client=mock_gemini_client,
+            model="gemini-3-flash-preview",
+            validate_api_key=False,
+        )
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
+        computer_use_tool = next(
+            tool for tool in agent.gemini_tools if getattr(tool, "computer_use", None) is not None
+        )
+        computer_use = getattr(computer_use_tool, "computer_use", None)
+        assert computer_use is not None
+        assert "navigate" in (computer_use.excluded_predefined_functions or [])
+        function_call = MagicMock()
+        function_call.name = "navigate"
+        function_call.args = {"url": "https://example.com"}
+        tool_call = agent._extract_tool_call(MagicMock(function_call=function_call))
+        assert tool_call is not None
+        assert tool_call.name == "navigate"
+        assert tool_call.arguments == {"url": "https://example.com"}
+    def test_regular_agent_routes_computer_use_function_call(
+        self, mock_gemini_client: MagicMock
+    ) -> None:
+        """Gemini Computer Use calls should route to the MCP computer tool."""
+        agent = GeminiAgent.create(
+            model_client=mock_gemini_client,
+            validate_api_key=False,
+        )
+        agent._computer_tool_name = "gemini_computer"
+        function_call = MagicMock()
+        function_call.name = "click_at"
+        function_call.args = {"x": 500, "y": 250, "safety_decision": {"decision": "allowed"}}
+        part = MagicMock(function_call=function_call)
+        tool_call = agent._extract_tool_call(part)
+        assert tool_call is not None
+        assert tool_call.name == "gemini_computer"
+        assert tool_call.arguments == {
+            "action": "click_at",
+            "safety_decision": {"decision": "allowed"},
+            "x": 500,
+            "y": 250,
+        }
+        assert getattr(tool_call, "gemini_name") == "click_at"
+    @pytest.mark.asyncio
+    async def test_regular_agent_formats_computer_use_results(
+        self, mock_gemini_client: MagicMock
+    ) -> None:
+        """GeminiAgent should return URL and screenshot parts for native computer use."""
+        agent = GeminiAgent.create(
+            model_client=mock_gemini_client,
+            validate_api_key=False,
+        )
+        agent._computer_tool_name = "gemini_computer"
+        screenshot = base64.b64encode(b"png bytes").decode()
+        tool_calls = [
+            MCPToolCall(
+                name="gemini_computer",
+                arguments={"action": "click_at", "safety_decision": {"decision": "allowed"}},
+                gemini_name="click_at",  # type: ignore[arg-type]
+            )
+        ]
+        tool_results = [
+            MCPToolResult(
+                content=[
+                    types.TextContent(type="text", text="__URL__:https://example.com"),
+                    types.ImageContent(type="image", data=screenshot, mimeType="image/png"),
+                ],
+                isError=False,
+            )
+        ]
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+        parts = messages[0].parts
+        assert parts is not None
+        function_response = parts[0].function_response
+        assert function_response is not None
+        assert function_response.name == "click_at"
+        response = function_response.response
+        assert response is not None
+        assert response["url"] == "https://example.com"
+        assert response["safety_acknowledgement"] is True
+        assert function_response.parts is not None
+        inline_data = function_response.parts[0].inline_data
+        assert inline_data is not None
+        assert inline_data.mime_type == "image/png"
 class TestGeminiToolConversion:
     """Tests for tool conversion to Gemini format."""

{hud_python-0.5.39 → hud_python-0.5.41}/hud/agents/types.py RENAMED Viewed

@@ -64,6 +64,9 @@ class GeminiConfig(BaseAgentConfig):
     top_k: int = 40
     max_output_tokens: int = 8192
     validate_api_key: bool = True
+    excluded_predefined_functions: list[str] = Field(default_factory=list)
+    thinking_level: Literal["minimal", "low", "medium", "high"] | None = None
+    include_thoughts: bool = True
 class GeminiCreateParams(BaseCreateParams, GeminiConfig):
@@ -79,7 +82,6 @@ class GeminiCUAConfig(GeminiConfig):
     model: str = Field(
         default="gemini-2.5-computer-use-preview-10-2025", validation_alias=_model_alias
     )
-    excluded_predefined_functions: list[str] = Field(default_factory=list)
 class GeminiCUACreateParams(BaseCreateParams, GeminiCUAConfig):

hud-python 0.5.39__tar.gz → 0.5.41__tar.gz

hud-python 0.5.39__tar.gz → 0.5.41__tar.gz