PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

hud-python 0.4.45 (py3-none-any.whl) → 0.5.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274)

hud/__init__.py +27 -7
hud/agents/__init__.py +11 -5
hud/agents/base.py +220 -500
hud/agents/claude.py +200 -240
hud/agents/gemini.py +275 -0
hud/agents/gemini_cua.py +335 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +41 -36
hud/agents/openai.py +291 -292
hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
hud/agents/operator.py +211 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +379 -210
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +376 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/cli/__init__.py +461 -545
hud/cli/analyze.py +43 -5
hud/cli/build.py +664 -110
hud/cli/debug.py +8 -5
hud/cli/dev.py +882 -734
hud/cli/eval.py +782 -668
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/push.py +29 -11
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +108 -6
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +69 -0
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +40 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +327 -0
hud/datasets/runner.py +192 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +50 -0
hud/environment/connection.py +206 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +109 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +694 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +112 -0
hud/environment/scenarios.py +493 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +218 -0
hud/environment/tests/test_environment.py +161 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +201 -0
hud/environment/tests/test_scenarios.py +280 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +674 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +185 -0
hud/eval/manager.py +466 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +340 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +145 -0
hud/eval/types.py +63 -0
hud/eval/utils.py +183 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +151 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +158 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +16 -2
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +4 -0
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +167 -57
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +61 -3
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.1.dist-info/METADATA +264 -0
hud_python-0.5.1.dist-info/RECORD +299 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0

hud/agents/tests/test_claude.py CHANGED Viewed

@@ -2,11 +2,11 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
-from anthropic import BadRequestError
+from anthropic import AsyncAnthropic, AsyncAnthropicBedrock
 from mcp import types
 from hud.agents.claude import (
@@ -15,18 +15,96 @@ from hud.agents.claude import (
     text_to_content_block,
     tool_use_content_block,
 )
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
 from hud.types import MCPToolCall, MCPToolResult
 if TYPE_CHECKING:
+    from collections.abc import Generator
     from anthropic.types.beta import BetaImageBlockParam, BetaMessageParam, BetaTextBlockParam
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+    def __init__(self, tools: list[types.Tool] | None = None) -> None:
+        # Core attributes
+        self.prompt = "Test prompt"
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+    @property
+    def has_scenario(self) -> bool:
+        return False
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+class MockStreamContextManager:
+    """Mock for Claude's streaming context manager."""
+    def __init__(self, response: MagicMock) -> None:
+        self.response = response
+    async def __aenter__(self) -> MockStreamContextManager:
+        return self
+    async def __aexit__(
+        self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
+    ) -> bool:
+        return False
+    def __aiter__(self) -> MockStreamContextManager:
+        return self
+    async def __anext__(self) -> None:
+        raise StopAsyncIteration
+    async def get_final_message(self) -> MagicMock:
+        return self.response
 class TestClaudeHelperFunctions:
     """Test helper functions for Claude message formatting."""
-    def test_base64_to_content_block(self):
+    def test_base64_to_content_block(self) -> None:
         """Test base64 image conversion."""
-        base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="  # noqa: E501
+        base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk"
         result = base64_to_content_block(base64_data)
         assert result["type"] == "image"
@@ -34,7 +112,7 @@ class TestClaudeHelperFunctions:
         assert result["source"]["media_type"] == "image/png"
         assert result["source"]["data"] == base64_data
-    def test_text_to_content_block(self):
+    def test_text_to_content_block(self) -> None:
         """Test text conversion."""
         text = "Hello, world!"
         result = text_to_content_block(text)
@@ -42,7 +120,7 @@ class TestClaudeHelperFunctions:
         assert result["type"] == "text"
         assert result["text"] == text
-    def test_tool_use_content_block(self):
+    def test_tool_use_content_block(self) -> None:
         """Test tool result content block creation."""
         tool_use_id = "tool_123"
         content: list[BetaTextBlockParam | BetaImageBlockParam] = [
@@ -60,192 +138,331 @@ class TestClaudeAgent:
     """Test ClaudeAgent class."""
     @pytest.fixture
-    def mock_mcp_client(self):
-        """Create a mock MCP client."""
-        mcp_client = MagicMock()
-        return mcp_client
-    @pytest.fixture
-    def mock_anthropic(self):
-        """Create a mock Anthropic client."""
-        with patch("hud.agents.claude.AsyncAnthropic") as mock:
-            client = AsyncMock()
-            # Add beta attribute with messages
-            client.beta = AsyncMock()
-            client.beta.messages = AsyncMock()
-            mock.return_value = client
-            yield client
+    def mock_anthropic(self) -> Generator[AsyncAnthropic, None, None]:  # type: ignore[misc]
+        """Create a stub Anthropic client."""
+        with patch("hud.agents.claude.AsyncAnthropic") as mock_class:
+            client = MagicMock(spec=AsyncAnthropic)
+            client.api_key = "test-key"
+            mock_class.return_value = client
+            yield client  # type: ignore[misc]
     @pytest.mark.asyncio
-    async def test_init(self, mock_mcp_client, mock_anthropic):
-        """Test agent initialization."""
-        # Test with provided model_client
-        mock_model_client = MagicMock()
-        agent = ClaudeAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            model="claude-3-opus-20240229",
-            max_tokens=1000,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_init_with_client(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test agent initialization with provided client."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            model="claude-sonnet-4-20250514",
+            validate_api_key=False,
         )
-        assert agent.model_name == "claude-3-opus-20240229"
-        assert agent.max_tokens == 1000
-        assert agent.anthropic_client == mock_model_client
+        assert agent.model_name == "Claude"
+        assert agent.config.model == "claude-sonnet-4-20250514"
+        assert agent.anthropic_client == mock_anthropic
     @pytest.mark.asyncio
-    async def test_init_without_model_client(self, mock_mcp_client, mock_anthropic):
-        """Test agent initialization without model client."""
-        with patch("hud.settings.settings.anthropic_api_key", "test_key"):
-            agent = ClaudeAgent(
-                mcp_client=mock_mcp_client,
-                model="claude-3-opus-20240229",
-                validate_api_key=False,  # Skip validation in tests
-            )
+    async def test_init_with_parameters(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test agent initialization with various parameters."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            model="claude-sonnet-4-20250514",
+            max_tokens=4096,
+            validate_api_key=False,
+        )
-            assert agent.model_name == "claude-3-opus-20240229"
-            assert agent.anthropic_client is not None
+        assert agent.max_tokens == 4096
     @pytest.mark.asyncio
-    async def test_format_blocks(self, mock_mcp_client):
-        """Test formatting content blocks into Claude messages."""
-        mock_model_client = MagicMock()
-        agent = ClaudeAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_blocks_text_only(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test formatting text content blocks."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
         )
-        # Test with text only
-        text_blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Hello, Claude!")
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Hello, world!"),
+            types.TextContent(type="text", text="How are you?"),
         ]
-        messages = await agent.format_blocks(text_blocks)
+        messages = await agent.format_blocks(blocks)
         assert len(messages) == 1
         assert messages[0]["role"] == "user"
         content = messages[0]["content"]
         assert isinstance(content, list)
-        assert len(content) == 1
-        assert content[0]["type"] == "text"
-        assert content[0]["text"] == "Hello, Claude!"
+        assert len(content) == 2
+        assert content[0]["type"] == "text"  # type: ignore[index]
+        assert content[0]["text"] == "Hello, world!"  # type: ignore[index]
-        # Test with screenshot
-        image_blocks: list[types.ContentBlock] = [
-            types.TextContent(type="text", text="Look at this"),
+    @pytest.mark.asyncio
+    async def test_format_blocks_with_image(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test formatting image content blocks."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        blocks: list[types.ContentBlock] = [
+            types.TextContent(type="text", text="Look at this:"),
             types.ImageContent(type="image", data="base64data", mimeType="image/png"),
         ]
-        messages = await agent.format_blocks(image_blocks)
+        messages = await agent.format_blocks(blocks)
         assert len(messages) == 1
-        assert messages[0]["role"] == "user"
         content = messages[0]["content"]
         assert isinstance(content, list)
         assert len(content) == 2
-        # Content blocks are in order
-        assert content[0]["type"] == "text"
-        assert content[0]["text"] == "Look at this"
-        assert content[1]["type"] == "image"
-        assert content[1]["source"]["data"] == "base64data"
+        assert content[1]["type"] == "image"  # type: ignore[index]
     @pytest.mark.asyncio
-    async def test_format_tool_results_method(self, mock_mcp_client):
-        """Test the agent's format_tool_results method."""
-        mock_model_client = MagicMock()
-        agent = ClaudeAgent(
-            mcp_client=mock_mcp_client,
-            model_client=mock_model_client,
-            validate_api_key=False,  # Skip validation in tests
+    async def test_format_tool_results_text(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test formatting tool results with text content."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
         )
-        tool_calls = [
-            MCPToolCall(name="test_tool", arguments={}, id="id1"),
-        ]
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
         tool_results = [
-            MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
+            MCPToolResult(
+                content=[types.TextContent(type="text", text="Tool output")],
+                isError=False,
+            )
         ]
         messages = await agent.format_tool_results(tool_calls, tool_results)
-        # format_tool_results returns a single user message with tool result content
         assert len(messages) == 1
         assert messages[0]["role"] == "user"
-        # The content is wrapped in a tool result block
-        content = list(messages[0]["content"])
+        content = messages[0]["content"]
+        assert isinstance(content, list)
         assert len(content) == 1
-        assert content[0]["type"] == "tool_result"  # type: ignore
-        assert content[0]["tool_use_id"] == "id1"  # type: ignore
-        # The actual content is nested inside
-        inner_content = list(content[0]["content"])  # type: ignore
-        assert inner_content[0]["type"] == "text"  # type: ignore
-        assert inner_content[0]["text"] == "Success"  # type: ignore
+        assert content[0]["type"] == "tool_result"  # type: ignore[index]
+        assert content[0]["tool_use_id"] == "call_123"  # type: ignore[index]
+    @pytest.mark.asyncio
+    async def test_format_tool_results_with_error(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test formatting tool results with error."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
+        tool_results = [
+            MCPToolResult(
+                content=[types.TextContent(type="text", text="Error message")],
+                isError=True,
+            )
+        ]
+        messages = await agent.format_tool_results(tool_calls, tool_results)
+        assert len(messages) == 1
+        content = messages[0]["content"]
+        # Error content should include "Error:" prefix
+        assert any("Error" in str(block) for block in content[0]["content"])  # type: ignore[index]
+    @pytest.mark.asyncio
+    async def test_get_system_messages(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test that system messages return empty (Claude uses system param)."""
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            system_prompt="You are a helpful assistant.",
+            validate_api_key=False,
+        )
+        messages = await agent.get_system_messages()
+        # Claude doesn't use system messages in the message list
+        assert messages == []
     @pytest.mark.asyncio
-    async def test_get_response(self, mock_mcp_client, mock_anthropic):
-        """Test getting model response from Claude API."""
-        # Disable telemetry for this test to avoid backend configuration issues
+    async def test_get_response_with_thinking(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test getting model response with thinking content."""
         with patch("hud.settings.settings.telemetry_enabled", False):
-            agent = ClaudeAgent(
-                mcp_client=mock_mcp_client,
+            agent = ClaudeAgent.create(
                 model_client=mock_anthropic,
-                validate_api_key=False,  # Skip validation in tests
+                validate_api_key=False,
             )
+            # Set up agent as initialized
+            agent.claude_tools = []
+            agent.tool_mapping = {}
+            agent.has_computer_tool = False
+            agent._initialized = True
-            # Mock the API response
             mock_response = MagicMock()
-            # Create text block
+            thinking_block = MagicMock()
+            thinking_block.type = "thinking"
+            thinking_block.thinking = "Let me analyze this problem..."
             text_block = MagicMock()
             text_block.type = "text"
-            text_block.text = "Hello!"
+            text_block.text = "Here is the answer"
-            # Create tool use block
-            tool_block = MagicMock()
-            tool_block.type = "tool_use"
-            tool_block.id = "tool_123"
-            tool_block.name = "test_tool"
-            tool_block.input = {"param": "value"}
+            mock_response.content = [thinking_block, text_block]
+            mock_response.usage = MagicMock(input_tokens=10, output_tokens=30)
-            mock_response.content = [text_block, tool_block]
-            mock_response.usage = MagicMock(input_tokens=10, output_tokens=20)
-            mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
+            mock_stream = MockStreamContextManager(mock_response)
+            mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
             messages = [
                 cast(
                     "BetaMessageParam",
-                    {"role": "user", "content": [{"type": "text", "text": "Hi"}]},
+                    {"role": "user", "content": [{"type": "text", "text": "Hard question"}]},
                 )
             ]
             response = await agent.get_response(messages)
-            assert response.content == "Hello!"
-            assert len(response.tool_calls) == 1
-            assert response.tool_calls[0].name == "test_tool"
-            assert response.tool_calls[0].arguments == {"param": "value"}
-            # The test was checking for Claude-specific attributes that aren't part of ModelResponse
-            # These would need to be accessed from the original Claude response if needed
+            assert response.content == "Here is the answer"
+            assert response.reasoning == "Let me analyze this problem..."
+    @pytest.mark.asyncio
+    async def test_convert_tools_for_claude(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test converting MCP tools to Claude format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
-            # Verify API was called correctly
-            mock_anthropic.beta.messages.create.assert_called_once()
+        # Check that tools were converted
+        assert len(agent.claude_tools) == 1
+        assert agent.claude_tools[0]["name"] == "my_tool"  # type: ignore[typeddict-item]
     @pytest.mark.asyncio
-    async def test_get_model_response_text_only(self, mock_mcp_client, mock_anthropic):
-        """Test getting text-only response."""
-        # Disable telemetry for this test to avoid backend configuration issues
+    async def test_computer_tool_detection(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test that computer tools are detected for beta API."""
+        tools = [
+            types.Tool(
+                name="computer",
+                description="Control computer",
+                inputSchema={"type": "object"},
+            )
+        ]
+        ctx = MockEvalContext(tools=tools)
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
+        assert agent.has_computer_tool is True
+    @pytest.mark.asyncio
+    async def test_get_response_with_text(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test getting response with text output."""
+        # Create mock response
+        mock_response = MagicMock()
+        mock_response.content = [MagicMock(type="text", text="Hello!")]
+        mock_stream = MockStreamContextManager(mock_response)
+        mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.claude_tools = []
+        agent.tool_mapping = {}
+        agent.has_computer_tool = False
+        agent._initialized = True
+        response = await agent.get_response([])
+        assert response.content == "Hello!"
+        assert response.done is True
+        assert len(response.tool_calls) == 0
+    @pytest.mark.asyncio
+    async def test_get_response_with_tool_call(self, mock_anthropic: AsyncAnthropic) -> None:
+        """Test getting response with tool call."""
+        mock_tool_use = MagicMock()
+        mock_tool_use.type = "tool_use"
+        mock_tool_use.id = "call_123"
+        mock_tool_use.name = "my_tool"
+        mock_tool_use.input = {"x": "value"}
+        mock_response = MagicMock()
+        mock_response.content = [mock_tool_use]
+        mock_stream = MockStreamContextManager(mock_response)
+        mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.claude_tools = []
+        agent.tool_mapping = {"my_tool": "my_tool"}
+        agent.has_computer_tool = False
+        agent._initialized = True
+        response = await agent.get_response([])
+        assert response.done is False
+        assert len(response.tool_calls) == 1
+        assert response.tool_calls[0].name == "my_tool"
+        assert response.tool_calls[0].arguments == {"x": "value"}
+class TestClaudeAgentBedrock:
+    """Test ClaudeAgent class with Bedrock."""
+    @pytest.fixture
+    def bedrock_client(self) -> AsyncAnthropicBedrock:
+        """Create a real AsyncAnthropicBedrock client and stub networked methods."""
+        client = AsyncAnthropicBedrock(
+            aws_access_key="AKIATEST",
+            aws_secret_key="secret",
+            aws_region="us-east-1",
+        )
+        # Stub the actual Bedrock call so tests are hermetic.
+        client.beta.messages.create = AsyncMock()
+        return client
+    @pytest.mark.asyncio
+    async def test_init(self, bedrock_client: AsyncAnthropicBedrock) -> None:
+        """Test agent initialization."""
+        agent = ClaudeAgent.create(
+            model_client=bedrock_client,
+            model="test-model-arn",
+            validate_api_key=False,
+        )
+        assert agent.model_name == "Claude"
+        assert agent.config.model == "test-model-arn"
+        assert agent.anthropic_client == bedrock_client
+    @pytest.mark.asyncio
+    async def test_get_response_bedrock_uses_create_not_stream(
+        self, bedrock_client: AsyncAnthropicBedrock
+    ) -> None:
+        """Bedrock path must call messages.create() (Bedrock doesn't support stream())."""
         with patch("hud.settings.settings.telemetry_enabled", False):
-            agent = ClaudeAgent(
-                mcp_client=mock_mcp_client,
-                model_client=mock_anthropic,
-                validate_api_key=False,  # Skip validation in tests
+            agent = ClaudeAgent.create(
+                model_client=bedrock_client,
+                model="test-model-arn",
+                validate_api_key=False,
             )
+            # Enable computer tool to verify betas list includes computer-use in Bedrock mode.
+            agent.has_computer_tool = True
             mock_response = MagicMock()
-            # Create text block
             text_block = MagicMock()
             text_block.type = "text"
-            text_block.text = "Just text"
+            text_block.text = "Hello from Bedrock"
             mock_response.content = [text_block]
-            mock_response.usage = MagicMock(input_tokens=5, output_tokens=10)
-            mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
+            # Make the mocked create() return the fake single-text-block response.
+            bedrock_client.beta.messages.create.return_value = mock_response  # type: ignore[union-attr]
             messages = [
                 cast(
@@ -255,95 +472,47 @@ class TestClaudeAgent:
             ]
             response = await agent.get_response(messages)
-            assert response.content == "Just text"
+            assert response.content == "Hello from Bedrock"
             assert response.tool_calls == []
+            # Bedrock-specific behavior: uses create() and appends assistant message directly.
+            assert not hasattr(bedrock_client.beta.messages, "stream")
+            bedrock_client.beta.messages.create.assert_awaited_once()  # type: ignore[union-attr]
+            # NOTE(review): the `messages` literal is elided by the hunk boundary above; a length
+            # of 2 presumably means one input message plus the appended reply - confirm.
+            assert len(messages) == 2
+            assert messages[-1]["role"] == "assistant"
+            # Ensure the Bedrock call shape is stable.
+            _, kwargs = bedrock_client.beta.messages.create.call_args  # type: ignore[union-attr]
+            assert kwargs["model"] == "test-model-arn"
+            assert kwargs["tool_choice"] == {"type": "auto", "disable_parallel_tool_use": True}
+            assert "fine-grained-tool-streaming-2025-05-14" in kwargs["betas"]
+            assert "computer-use-2025-01-24" in kwargs["betas"]
     @pytest.mark.asyncio
-    async def test_get_model_response_error(self, mock_mcp_client, mock_anthropic):
-        """Test handling API errors."""
-        # Disable telemetry for this test to avoid backend configuration issues
+    async def test_get_response_bedrock_missing_boto3_raises_value_error(
+        self, bedrock_client: AsyncAnthropicBedrock
+    ) -> None:
+        """If boto3 isn't installed, Bedrock client import path should raise a clear ValueError."""
         with patch("hud.settings.settings.telemetry_enabled", False):
-            agent = ClaudeAgent(
-                mcp_client=mock_mcp_client,
-                model_client=mock_anthropic,
-                validate_api_key=False,  # Skip validation in tests
-            )
-            # Mock API error
-            mock_anthropic.beta.messages.create = AsyncMock(
-                side_effect=BadRequestError(
-                    message="Invalid request",
-                    response=MagicMock(status_code=400),
-                    body={"error": {"message": "Invalid request"}},
-                )
+            agent = ClaudeAgent.create(
+                model_client=bedrock_client,
+                model="test-model-arn",
+                validate_api_key=False,
             )
+            # Simulate the missing dependency: the mocked create() raises
+            # ModuleNotFoundError("boto3") when the agent calls it.
+            bedrock_client.beta.messages.create.side_effect = ModuleNotFoundError("boto3")  # type: ignore[union-attr]
             messages = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
-            with pytest.raises(BadRequestError):
+            # get_response() is expected to raise a ValueError whose message matches
+            # "boto3 is required for AWS Bedrock" rather than the raw ModuleNotFoundError.
+            with pytest.raises(ValueError, match=r"boto3 is required for AWS Bedrock"):
                 await agent.get_response(messages)  # type: ignore
-    # This test is commented out as it's testing complex integration scenarios
-    # that may have changed in the implementation
-    # @pytest.mark.asyncio
-    # async def test_run_with_tools(self, mock_mcp_client, mock_anthropic):
-    #     """Test running agent with tool usage."""
-    #     # Disable telemetry for this test to avoid backend configuration issues
-    #     with patch("hud.settings.settings.telemetry_enabled", False):
-    #         agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
-    #         # Mock tool availability
-    #         agent._available_tools = [
-    #             types.Tool(
-    #                 name="calculator", description="Calculator", inputSchema={"type": "object"}
-    #             )
-    #         ]
-    #         agent._tool_map = {
-    #             "calculator": types.Tool(
-    #                 name="calculator", description="Calculator", inputSchema={"type": "object"}
-    #             )
-    #         }
-    #         # Mock initial response with tool use
-    #         initial_response = MagicMock()
-    #         # Create tool use block
-    #         tool_block = MagicMock()
-    #         tool_block.type = "tool_use"
-    #         tool_block.id = "calc_123"
-    #         tool_block.name = "calculator"
-    #         tool_block.input = {"operation": "add", "a": 2, "b": 3}
-    #         initial_response.content = [tool_block]
-    #         initial_response.usage = MagicMock(input_tokens=10, output_tokens=15)
-    #         # Mock follow-up response
-    #         final_response = MagicMock()
-    #         text_block = MagicMock()
-    #         text_block.type = "text"
-    #         text_block.text = "2 + 3 = 5"
-    #         final_response.content = [text_block]
-    #         final_response.usage = MagicMock(input_tokens=20, output_tokens=10)
-    #         mock_anthropic.beta.messages.create = AsyncMock(
-    #             side_effect=[initial_response, final_response]
-    #         )
-    #         # Mock tool execution
-    #         mock_mcp_client.call_tool = AsyncMock(
-    #             return_value=MCPToolResult(
-    #                 content=[types.TextContent(type="text", text="5")], isError=False
-    #             )
-    #         )
-    #         # Mock the mcp_client properties
-    #         mock_mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
-    #         mock_mcp_client.list_tools = AsyncMock(return_value=agent._available_tools)
-    #         mock_mcp_client.initialize = AsyncMock()
-    #         # Initialize the agent
-    #         await agent.initialize()
-    #         # Use a string prompt instead of a task
-    #         result = await agent.run("What is 2 + 3?")
-    #         assert result.content == "2 + 3 = 5"
-    #         assert result.done is True
+    def test_init_with_bedrock_client_does_not_require_anthropic_api_key(
+        self, bedrock_client: AsyncAnthropicBedrock
+    ) -> None:
+        """Providing model_client should bypass ANTHROPIC_API_KEY validation."""
+        # Patch the configured Anthropic API key to None for the duration of the test.
+        # NOTE(review): validate_api_key=False is passed as well; confirm that flag alone is
+        # not what allows construction to succeed here.
+        with patch("hud.settings.settings.anthropic_api_key", None):
+            agent = ClaudeAgent.create(
+                model_client=bedrock_client,
+                validate_api_key=False,
+            )
+            # Construction must succeed without a key and expose the supplied client.
+            assert agent.anthropic_client == bedrock_client

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

hud-python 0.4.45py3-none-any.whl → 0.5.1py3-none-any.whl