PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

hud/__init__.py +27 -7
hud/agents/__init__.py +70 -5
hud/agents/base.py +238 -500
hud/agents/claude.py +236 -247
hud/agents/gateway.py +42 -0
hud/agents/gemini.py +264 -0
hud/agents/gemini_cua.py +324 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +48 -36
hud/agents/openai.py +282 -296
hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
hud/agents/operator.py +199 -0
hud/agents/resolver.py +70 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +381 -214
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +377 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_resolver.py +192 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/agents/types.py +148 -0
hud/cli/__init__.py +493 -546
hud/cli/analyze.py +43 -5
hud/cli/build.py +699 -113
hud/cli/debug.py +8 -5
hud/cli/dev.py +889 -732
hud/cli/eval.py +793 -667
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/pull.py +1 -1
hud/cli/push.py +38 -13
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +110 -8
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push.py +1 -1
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +70 -1
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +45 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +326 -0
hud/datasets/runner.py +198 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +52 -0
hud/environment/connection.py +258 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +137 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +835 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +263 -0
hud/environment/scenarios.py +620 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +205 -0
hud/environment/tests/test_environment.py +593 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +242 -0
hud/environment/tests/test_scenarios.py +1086 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +727 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +187 -0
hud/eval/manager.py +533 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +372 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +291 -0
hud/eval/types.py +65 -0
hud/eval/utils.py +194 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +308 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +165 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +18 -2
hud/tools/agent.py +223 -0
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +36 -3
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_agent_tool.py +355 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +194 -56
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +89 -18
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.13.dist-info/METADATA +264 -0
hud_python-0.5.13.dist-info/RECORD +305 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0

hud/cli/tests/test_dev.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Tests for CLI dev module."""
+from __future__ import annotations
+from unittest import mock
+from hud.cli.dev import auto_detect_module, should_use_docker_mode
+class TestShouldUseDockerMode:
+    """Docker-mode detection for a project directory, keyed on its Dockerfile."""
+    def test_docker_mode_with_dockerfile(self, tmp_path):
+        """A directory holding a populated Dockerfile selects Docker mode."""
+        (tmp_path / "Dockerfile").write_text("FROM python:3.11")
+        use_docker = should_use_docker_mode(tmp_path)
+        assert use_docker is True
+    def test_no_docker_mode_without_dockerfile(self, tmp_path):
+        """A directory with no Dockerfile does not select Docker mode."""
+        use_docker = should_use_docker_mode(tmp_path)
+        assert use_docker is False
+    def test_docker_mode_empty_dockerfile(self, tmp_path):
+        """A zero-length Dockerfile is still expected to select Docker mode."""
+        (tmp_path / "Dockerfile").write_text("")
+        use_docker = should_use_docker_mode(tmp_path)
+        assert use_docker is True
+class TestAutoDetectModule:
+    """Runs auto_detect_module() against small package layouts built in tmp_path."""
+    def test_detect_module_from_init_with_mcpserver(self, tmp_path, monkeypatch):
+        """An MCPServer created in __init__.py resolves to the directory name."""
+        source = """
+from hud.server import MCPServer
+mcp = MCPServer(name='test')
+"""
+        (tmp_path / "__init__.py").write_text(source)
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected == tmp_path.name
+        assert search_path is None
+    def test_detect_module_from_init_with_fastmcp(self, tmp_path, monkeypatch):
+        """A FastMCP instance created in __init__.py resolves to the directory name."""
+        source = """
+from fastmcp import FastMCP
+mcp = FastMCP(name='test')
+"""
+        (tmp_path / "__init__.py").write_text(source)
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected == tmp_path.name
+        assert search_path is None
+    def test_detect_module_from_main_py(self, tmp_path, monkeypatch):
+        """An MCPServer in main.py resolves to ``<dir>.main`` with the parent as extra path."""
+        source = """
+from hud.server import MCPServer
+mcp = MCPServer(name='test')
+"""
+        # The layout needs an (empty) __init__.py alongside main.py.
+        (tmp_path / "__init__.py").write_text("")
+        (tmp_path / "main.py").write_text(source)
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected == f"{tmp_path.name}.main"
+        assert search_path == tmp_path.parent
+    def test_detect_module_from_init_with_environment(self, tmp_path, monkeypatch):
+        """An Environment created in __init__.py resolves to the directory name."""
+        source = """
+from hud import Environment
+env = Environment(name='test')
+"""
+        (tmp_path / "__init__.py").write_text(source)
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected == tmp_path.name
+        assert search_path is None
+    def test_detect_module_from_main_py_with_environment(self, tmp_path, monkeypatch):
+        """An Environment in main.py resolves to ``<dir>.main`` with the parent as extra path."""
+        source = """
+from hud import Environment
+env = Environment(name='test')
+"""
+        # The layout needs an (empty) __init__.py alongside main.py.
+        (tmp_path / "__init__.py").write_text("")
+        (tmp_path / "main.py").write_text(source)
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected == f"{tmp_path.name}.main"
+        assert search_path == tmp_path.parent
+    def test_no_detection_without_mcp_or_env(self, tmp_path, monkeypatch):
+        """An __init__.py that defines neither mcp nor env yields no module."""
+        (tmp_path / "__init__.py").write_text("# Just a comment")
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected is None
+        assert search_path is None
+    def test_no_detection_empty_dir(self, tmp_path, monkeypatch):
+        """An empty working directory yields no module."""
+        monkeypatch.chdir(tmp_path)
+        detected, search_path = auto_detect_module()
+        assert detected is None
+        assert search_path is None
+class TestShowDevServerInfo:
+    """Checks the value returned by show_dev_server_info() with the console mocked out."""
+    @mock.patch("hud.cli.dev.hud_console")
+    def test_show_dev_server_info_http(self, mock_console):
+        """HTTP transport: a cursor deeplink naming the server is returned and the console is used."""
+        from hud.cli.dev import show_dev_server_info
+        options = {
+            "server_name": "test-server",
+            "port": 8000,
+            "transport": "http",
+            "inspector": False,
+            "interactive": False,
+        }
+        deeplink = show_dev_server_info(**options)
+        # The return value is the Cursor deeplink for the server.
+        assert deeplink.startswith("cursor://")
+        assert "test-server" in deeplink
+        # Both a section title and an info line must have gone to the console.
+        assert mock_console.section_title.called
+        assert mock_console.info.called
+    @mock.patch("hud.cli.dev.hud_console")
+    def test_show_dev_server_info_stdio(self, mock_console):
+        """stdio transport: a cursor deeplink is still returned."""
+        from hud.cli.dev import show_dev_server_info
+        options = {
+            "server_name": "test-server",
+            "port": 8000,
+            "transport": "stdio",
+            "inspector": False,
+            "interactive": False,
+        }
+        deeplink = show_dev_server_info(**options)
+        # The return value is the Cursor deeplink for the server.
+        assert deeplink.startswith("cursor://")
+    @mock.patch("hud.cli.dev.hud_console")
+    def test_show_dev_server_info_with_telemetry(self, mock_console):
+        """Passing telemetry URLs does not change the deeplink contract."""
+        from hud.cli.dev import show_dev_server_info
+        telemetry_urls = {
+            "live_url": "https://hud.ai/trace/123",
+            "vnc_url": "http://localhost:5900",
+        }
+        deeplink = show_dev_server_info(
+            server_name="browser-env",
+            port=8000,
+            transport="http",
+            inspector=False,
+            interactive=False,
+            telemetry=telemetry_urls,
+        )
+        assert deeplink.startswith("cursor://")

hud/cli/tests/test_eval.py ADDED Viewed

@@ -0,0 +1,251 @@
+"""Tests for hud.cli.eval module and run_dataset function."""
+from __future__ import annotations
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+from mcp import types
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentType, MCPToolResult, Trace
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing.
+
+    The constructor does not call ``EvalContext.__init__``; it assigns the
+    attributes below directly. Tool listing, tool calls and submission are
+    overridden with canned, in-memory behaviour.
+    """
+    def __init__(
+        self,
+        prompt: str = "Test prompt",
+        tools: list[types.Tool] | None = None,
+    ) -> None:
+        # NOTE(review): the parent initialiser is deliberately skipped here;
+        # presumably this attribute set mirrors what EvalContext expects - confirm
+        # against EvalContext when it changes.
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self.results: list[EvalContext] = []
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self._is_summary = False
+    def as_tools(self) -> list[types.Tool]:
+        """Return the tool list supplied at construction time."""
+        return self._tools
+    @property
+    def has_scenario(self) -> bool:
+        """Always False for this mock."""
+        return False
+    async def list_tools(self) -> list[types.Tool]:
+        """Return the tool list supplied at construction time."""
+        return self._tools
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        """Ignore the call and return a non-error result with the text "ok"."""
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text="ok")],
+            isError=False,
+        )
+    async def submit(self, answer: str) -> None:
+        """Record the submitted answer on ``_submitted``."""
+        self._submitted = answer
+def _create_mock_agent_cls() -> tuple[MagicMock, MagicMock]:
+    """Build a stand-in agent class whose ``create()`` yields a mock agent.
+
+    Returns:
+        ``(agent_cls, agent)``: the mock class and the mock instance it creates.
+        The instance's async ``run`` resolves to ``Trace(reward=1.0, done=True)``.
+    """
+    agent = MagicMock()
+    agent.run = AsyncMock(return_value=Trace(reward=1.0, done=True))
+    agent_cls = MagicMock()
+    agent_cls.create.return_value = agent
+    return agent_cls, agent
+class TestRunDataset:
+    """Tests for run_dataset with hud.eval and the agent class replaced by mocks.
+
+    Each test patches ``hud.datasets.runner.hud.eval`` so that entering the
+    returned async context manager yields a ``MockEvalContext``, and imports
+    ``run_dataset`` inside the patched scope.
+    """
+    @pytest.mark.asyncio
+    async def test_run_dataset_with_task_list(self) -> None:
+        """Test run_dataset with a list of tasks."""
+        from hud.eval.task import Task
+        tasks = [
+            Task(env={"name": "test"}, id="task1", scenario="test"),
+            Task(env={"name": "test"}, id="task2", scenario="test"),
+        ]
+        mock_agent_cls, mock_agent_instance = _create_mock_agent_cls()
+        # Mock hud.eval to return our mock context
+        mock_ctx = MockEvalContext()
+        with (
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.claude.ClaudeAgent", mock_agent_cls),
+        ):
+            # Set up the async context manager
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            await run_dataset(tasks, agent_type="claude", max_steps=5)
+            # Verify hud.eval was called with correct params
+            mock_eval.assert_called_once()
+            call_kwargs = mock_eval.call_args[1]
+            # group_size and max_concurrent were not passed above, so these are
+            # the values the test expects run_dataset to forward by default.
+            assert call_kwargs["group"] == 1
+            assert call_kwargs["max_concurrent"] == 30
+            # Agent should have run
+            mock_agent_instance.run.assert_called_once()
+    @pytest.mark.asyncio
+    async def test_run_dataset_with_string_source(self) -> None:
+        """Test run_dataset with a string source (loads via the patched load_tasks)."""
+        from hud.eval.task import Task
+        mock_tasks = [Task(env={"name": "test"}, id="loaded_task", scenario="loaded")]
+        mock_agent_cls, _ = _create_mock_agent_cls()
+        mock_ctx = MockEvalContext()
+        with (
+            patch("hud.datasets.loader.load_tasks", return_value=mock_tasks) as mock_load,
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.OpenAIAgent", mock_agent_cls),
+        ):
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            await run_dataset("my-tasks.json", agent_type="openai")
+            # Verify load_tasks was called with the source string
+            mock_load.assert_called_once_with("my-tasks.json")
+    @pytest.mark.asyncio
+    async def test_run_dataset_empty_tasks_raises(self) -> None:
+        """Test run_dataset raises ValueError for empty tasks."""
+        # NOTE(review): this patches load_dataset, whereas the string-source test
+        # patches load_tasks. An explicit empty list is passed below, so the loader
+        # is presumably never reached - confirm which loader name run_dataset uses.
+        with patch("hud.datasets.loader.load_dataset", return_value=[]):
+            from hud.datasets.runner import run_dataset
+            with pytest.raises(ValueError, match="No tasks to run"):
+                await run_dataset([], agent_type=AgentType.CLAUDE)
+    @pytest.mark.asyncio
+    async def test_run_dataset_with_group_size(self) -> None:
+        """Test run_dataset passes group_size to hud.eval."""
+        from hud.eval.task import Task
+        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
+        mock_agent_cls, _ = _create_mock_agent_cls()
+        mock_ctx = MockEvalContext()
+        with (
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.claude.ClaudeAgent", mock_agent_cls),
+        ):
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            await run_dataset(tasks, agent_type="claude", group_size=3)
+            # group_size is expected to reach hud.eval under the keyword "group"
+            call_kwargs = mock_eval.call_args[1]
+            assert call_kwargs["group"] == 3
+    @pytest.mark.asyncio
+    async def test_run_dataset_with_max_concurrent(self) -> None:
+        """Test run_dataset passes max_concurrent to hud.eval."""
+        from hud.eval.task import Task
+        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
+        mock_agent_cls, _ = _create_mock_agent_cls()
+        mock_ctx = MockEvalContext()
+        with (
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.claude.ClaudeAgent", mock_agent_cls),
+        ):
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            await run_dataset(tasks, agent_type="claude", max_concurrent=10)
+            call_kwargs = mock_eval.call_args[1]
+            assert call_kwargs["max_concurrent"] == 10
+    @pytest.mark.asyncio
+    async def test_run_dataset_returns_results(self) -> None:
+        """Test run_dataset returns EvalContext results."""
+        from hud.eval.task import Task
+        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
+        mock_agent_cls, _ = _create_mock_agent_cls()
+        # mock_ctx.results is left empty here, so the context itself is expected back
+        mock_ctx = MockEvalContext()
+        with (
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.claude.ClaudeAgent", mock_agent_cls),
+        ):
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            results = await run_dataset(tasks, agent_type="claude")
+            # Should return list with the context
+            assert len(results) == 1
+            assert results[0] is mock_ctx
+    @pytest.mark.asyncio
+    async def test_run_dataset_parallel_results(self) -> None:
+        """Test run_dataset returns ctx.results for parallel execution."""
+        from hud.eval.task import Task
+        tasks = [Task(env={"name": "test"}, id="task1", scenario="test")]
+        mock_agent_cls, _ = _create_mock_agent_cls()
+        # Create mock context with results (parallel execution)
+        mock_result1 = MockEvalContext(prompt="result1")
+        mock_result1.reward = 0.8
+        mock_result2 = MockEvalContext(prompt="result2")
+        mock_result2.reward = 0.9
+        mock_ctx = MockEvalContext()
+        mock_ctx.results = [mock_result1, mock_result2]
+        with (
+            patch("hud.datasets.runner.hud.eval") as mock_eval,
+            patch("hud.agents.claude.ClaudeAgent", mock_agent_cls),
+        ):
+            mock_eval.return_value.__aenter__ = AsyncMock(return_value=mock_ctx)
+            mock_eval.return_value.__aexit__ = AsyncMock(return_value=None)
+            from hud.datasets.runner import run_dataset
+            results = await run_dataset(tasks, agent_type="claude")
+            # Should return the parallel results
+            assert len(results) == 2
+            assert results[0].reward == 0.8
+            assert results[1].reward == 0.9

hud/cli/tests/test_eval_bedrock.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Tests for AWS Bedrock auto-detection in hud.cli.eval."""
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import pytest
+import typer
+from hud.cli.eval import EvalConfig
+from hud.types import AgentType
+class TestBedrockAutoDetection:
+    """Bedrock handling in EvalConfig.get_agent_kwargs() for a Claude checkpoint ARN."""
+    VALID_ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/my-profile"
+    def test_get_agent_kwargs_detects_bedrock_arn_from_config_checkpoint_name(self) -> None:
+        """Regression: ARN in [claude].checkpoint_name should trigger Bedrock client."""
+        claude_section = {"claude": {"checkpoint_name": self.VALID_ARN}}
+        # model=None stands for "no CLI --model was given".
+        eval_config = EvalConfig(
+            agent_type=AgentType.CLAUDE,
+            model=None,
+            agent_config=claude_section,
+        )
+        with (
+            patch("hud.settings.settings.aws_access_key_id", "AKIATEST"),
+            patch("hud.settings.settings.aws_secret_access_key", "secret"),
+            patch("hud.settings.settings.aws_region", "us-east-1"),
+            patch("anthropic.AsyncAnthropicBedrock", return_value=MagicMock()) as bedrock_factory,
+        ):
+            agent_kwargs = eval_config.get_agent_kwargs()
+        assert agent_kwargs.get("checkpoint_name") == self.VALID_ARN
+        assert "model_client" in agent_kwargs
+        bedrock_factory.assert_called_once()
+    def test_get_agent_kwargs_bedrock_arn_missing_aws_creds_exits(self) -> None:
+        """Should fail fast if ARN is detected but AWS creds are missing."""
+        claude_section = {"claude": {"checkpoint_name": self.VALID_ARN}}
+        eval_config = EvalConfig(
+            agent_type=AgentType.CLAUDE,
+            model=None,
+            agent_config=claude_section,
+        )
+        with (
+            patch("hud.settings.settings.aws_access_key_id", None),
+            patch("hud.settings.settings.aws_secret_access_key", None),
+            patch("hud.settings.settings.aws_region", None),
+        ):
+            # The exit must be raised while the credential patches are still active.
+            with pytest.raises(typer.Exit):
+                eval_config.get_agent_kwargs()

hud/cli/tests/test_init.py ADDED Viewed

@@ -0,0 +1,124 @@
+"""Tests for CLI init module."""
+from __future__ import annotations
+from hud.cli.init import _replace_placeholders
+class TestReplacePlaceholders:
+    """Test placeholder replacement in template files.
+
+    ``_replace_placeholders(root, env_name)`` is called on a tmp_path tree and
+    its return value is treated as a list of relative path strings for the
+    files it rewrote. Paths below the root are compared after converting
+    backslashes to forward slashes so the assertions hold on every platform.
+    """
+    def test_replace_in_pyproject(self, tmp_path):
+        """Test replacing placeholders in pyproject.toml."""
+        # Create server directory structure
+        server_dir = tmp_path / "server"
+        server_dir.mkdir()
+        pyproject = server_dir / "pyproject.toml"
+        pyproject.write_text("""
+[project]
+name = "blank"
+description = "blank environment"
+""")
+        modified = _replace_placeholders(tmp_path, "my-cool-env")
+        # Normalize paths for cross-platform comparison
+        modified_normalized = [p.replace("\\", "/") for p in modified]
+        assert "server/pyproject.toml" in modified_normalized
+        content = pyproject.read_text()
+        assert "my_cool_env" in content
+        assert "blank" not in content
+    def test_replace_in_readme(self, tmp_path):
+        """Test replacing placeholders in README.md."""
+        readme = tmp_path / "README.md"
+        readme.write_text("# blank\n\nThis is the blank environment.")
+        modified = _replace_placeholders(tmp_path, "test-env")
+        assert "README.md" in modified
+        content = readme.read_text()
+        assert "test_env" in content
+        assert "blank" not in content
+    def test_replace_in_tasks_json(self, tmp_path):
+        """Test replacing placeholders in tasks.json."""
+        tasks = tmp_path / "tasks.json"
+        tasks.write_text('{"name": "blank", "tasks": []}')
+        modified = _replace_placeholders(tmp_path, "my-tasks")
+        assert "tasks.json" in modified
+        content = tasks.read_text()
+        assert "my_tasks" in content
+    def test_no_replace_in_non_placeholder_files(self, tmp_path):
+        """Test that non-placeholder files are not modified."""
+        other_file = tmp_path / "other.py"
+        other_file.write_text("# blank comment")
+        modified = _replace_placeholders(tmp_path, "test")
+        assert "other.py" not in modified
+        content = other_file.read_text()
+        assert "blank" in content  # Should be unchanged
+    def test_skip_pycache_directories(self, tmp_path):
+        """Test that __pycache__ directories are skipped."""
+        pycache = tmp_path / "__pycache__"
+        pycache.mkdir()
+        cached_file = pycache / "module.pyc"
+        cached_file.write_text("blank")
+        modified = _replace_placeholders(tmp_path, "test")
+        # __pycache__ files should not be in modified list
+        assert not any("__pycache__" in f for f in modified)
+    def test_normalize_special_characters(self, tmp_path):
+        """Test that environment name is normalized for Python identifiers."""
+        server_dir = tmp_path / "server"
+        server_dir.mkdir()
+        pyproject = server_dir / "pyproject.toml"
+        pyproject.write_text('name = "blank"')
+        _replace_placeholders(tmp_path, "my cool-env.v2!")
+        content = pyproject.read_text()
+        # Special characters should be replaced with underscores
+        assert "my_cool_env_v2_" in content
+    def test_no_changes_when_no_placeholder(self, tmp_path):
+        """Test that files without placeholder are not modified."""
+        server_dir = tmp_path / "server"
+        server_dir.mkdir()
+        pyproject = server_dir / "pyproject.toml"
+        pyproject.write_text('name = "other-name"')
+        modified = _replace_placeholders(tmp_path, "test")
+        # Normalize paths for cross-platform comparison; without this the check
+        # can never fail where the returned paths use backslashes.
+        modified_normalized = [p.replace("\\", "/") for p in modified]
+        assert "server/pyproject.toml" not in modified_normalized
+    def test_nested_directory_structure(self, tmp_path):
+        """Test replacement in nested directory structure."""
+        # Create nested structure
+        server_dir = tmp_path / "server"
+        server_dir.mkdir()
+        (server_dir / "pyproject.toml").write_text('name = "blank"')
+        env_dir = tmp_path / "environment"
+        env_dir.mkdir()
+        (env_dir / "pyproject.toml").write_text('name = "blank"')
+        (env_dir / "README.md").write_text("# blank environment")
+        modified = _replace_placeholders(tmp_path, "nested-test")
+        # Normalize paths for cross-platform comparison
+        modified_normalized = [p.replace("\\", "/") for p in modified]
+        assert "server/pyproject.toml" in modified_normalized
+        assert "environment/pyproject.toml" in modified_normalized
+        assert "environment/README.md" in modified_normalized

hud/cli/tests/test_main_module.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
+import os
 import subprocess
 import sys
@@ -20,11 +21,16 @@ class TestMainModule:
     def test_main_module_executes(self):
         """Test that running the module as main executes correctly."""
         # Use subprocess to run the module as __main__ and check it doesn't crash
-        # We expect it to show help/error since we're not providing arguments
+        # Use --version flag for a quick, deterministic test that doesn't require user input
+        env = {**os.environ, "HUD_SKIP_VERSION_CHECK": "1"}
         result = subprocess.run(
-            [sys.executable, "-m", "hud.cli"], capture_output=True, text=True, timeout=10
+            [sys.executable, "-m", "hud.cli", "--version"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+            env=env,
         )
-        # Should exit with an error code but not crash
-        # (The actual main function will show help or error for missing args)
-        assert result.returncode != 0  # CLI should exit with error for no args
+        # Should exit successfully with version info
+        assert result.returncode == 0
+        assert "version" in result.stdout.lower() or "hud" in result.stdout.lower()

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl