PyPI - hud-python - Versions diffs - 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl - Mend

hud-python 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (27)

hud/agents/base.py +40 -34
hud/agents/grounded_openai.py +1 -1
hud/cli/__init__.py +78 -213
hud/cli/build.py +105 -45
hud/cli/dev.py +614 -743
hud/cli/flows/tasks.py +98 -17
hud/cli/init.py +18 -14
hud/cli/push.py +27 -9
hud/cli/rl/local_runner.py +3 -3
hud/cli/tests/test_eval.py +168 -119
hud/cli/tests/test_mcp_server.py +6 -95
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/source_hash.py +1 -1
hud/server/__init__.py +2 -1
hud/server/router.py +160 -0
hud/server/server.py +246 -79
hud/tools/base.py +9 -1
hud/tools/bash.py +2 -2
hud/tools/edit.py +3 -7
hud/utils/hud_console.py +43 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/METADATA +1 -1
{hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/RECORD +27 -26
{hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/WHEEL +0 -0
{hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/licenses/LICENSE +0 -0

hud/cli/tests/test_eval.py CHANGED Viewed

@@ -7,9 +7,13 @@ from unittest.mock import AsyncMock, MagicMock, Mock, patch
 import pytest
 from mcp import types
-from hud.cli.eval import build_agent, eval_command, get_available_models, run_full_dataset, run_single_task
+from hud.cli.eval import (
+    build_agent,
+    run_single_task,
+)
 from hud.types import Task, Trace
 class TestBuildAgent:
     """Test the build_agent function."""
@@ -20,10 +24,10 @@ class TestBuildAgent:
         with patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
             # Test with verbose=False
             result = build_agent("integration_test", verbose=False)
             mock_runner.assert_called_once_with(verbose=False)
             assert result == mock_instance
@@ -34,14 +38,11 @@ class TestBuildAgent:
         with patch("hud.agents.ClaudeAgent") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
             # Test with verbose=False
             result = build_agent("claude", verbose=False)
-            mock_runner.assert_called_once_with(
-                model="claude-sonnet-4-20250514",
-                verbose=False
-            )
+            mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
             assert result == mock_instance
     def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
@@ -51,7 +52,7 @@ class TestBuildAgent:
         with patch("hud.agents.ClaudeAgent") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
             # Test with verbose=False
             result = build_agent(
                 "claude",
@@ -59,7 +60,7 @@ class TestBuildAgent:
                 allowed_tools=["act"],
                 verbose=True,
             )
             mock_runner.assert_called_once_with(
                 model="claude-sonnet-4-20250514",
                 allowed_tools=["act"],
@@ -81,19 +82,23 @@ class TestRunSingleTask:
                 "system_prompt": "Custom instructions",
                 "allowed_tools": ["tool1", "tool2"],
                 "append_setup_output": False,
-            }
+            },
         )
         mock_agent = AsyncMock(
-            initialize=AsyncMock(),
-            run=AsyncMock(return_value=Trace(reward=1.0, done=True))
+            initialize=AsyncMock(), run=AsyncMock(return_value=Trace(reward=1.0, done=True))
         )
-        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
-             patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner", return_value=mock_agent), \
-             patch("hud.cli.eval.find_environment_dir", return_value=None), \
-             patch("hud.cli.eval.hud.trace"):
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch(
+                "hud.agents.misc.integration_test_agent.IntegrationTestRunner",
+                return_value=mock_agent,
+            ),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
             await run_single_task("test.json", agent_type="integration_test", max_steps=10)
             # Verify agent.run was called with the task containing agent_config
             mock_agent.run.assert_called_once()
             called_task = mock_agent.run.call_args[0][0]
@@ -103,17 +108,20 @@ class TestRunSingleTask:
     async def test_runs_with_group_size_greater_than_one(self) -> None:
         """Test that group_size > 1 triggers run_tasks_grouped instead of agent.run."""
         mock_task = Task(prompt="Test", mcp_config={"local": {"url": "http://localhost:8765/mcp"}})
-        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
-             patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped, \
-             patch("hud.cli.eval.display_group_statistics"), \
-             patch("hud.cli.eval.find_environment_dir", return_value=None), \
-             patch("hud.cli.eval.hud.trace"):
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped,
+            patch("hud.cli.eval.display_group_statistics"),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
             mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
-            await run_single_task("test.json", agent_type="integration_test", group_size=3, max_steps=10)
+            await run_single_task(
+                "test.json", agent_type="integration_test", group_size=3, max_steps=10
+            )
             # Verify run_tasks_grouped was called with correct group_size
             mock_grouped.assert_called_once()
             assert mock_grouped.call_args.kwargs["group_size"] == 3
@@ -145,20 +153,20 @@ class TestToolFiltering:
     ) -> list[types.Tool]:
         """Helper to create agent, initialize with tools and config, return filtered tools."""
         from hud.agents import ClaudeAgent
         mock_mcp_client.list_tools = AsyncMock(return_value=tools)
         task = Task(
             prompt="Test",
             mcp_config={"local": {"url": "http://localhost"}},
-            agent_config=agent_config or {}
+            agent_config=agent_config or {},
         )
         agent = ClaudeAgent(
             mcp_client=mock_mcp_client,
             model_client=mock_model_client,
             model="test",
-            validate_api_key=False
+            validate_api_key=False,
         )
         await agent.initialize(task)
         return agent.get_available_tools()
@@ -171,13 +179,15 @@ class TestToolFiltering:
             types.Tool(name="tool2", description="Tool 2", inputSchema={}),
             types.Tool(name="debug_tool", description="Debug", inputSchema={}),
         ]
         result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools)
         assert len(result) == 3
     @pytest.mark.asyncio
-    async def test_allowed_tools_filters_correctly(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_allowed_tools_filters_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that allowed_tools in agent_config filters to matching patterns."""
         tools = [
             types.Tool(name="screenshot_take", description="Tool 1", inputSchema={}),
@@ -185,14 +195,18 @@ class TestToolFiltering:
             types.Tool(name="click", description="Tool 3", inputSchema={}),
         ]
         agent_config = {"allowed_tools": ["screenshot_*"]}
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
         assert len(result) == 2
         assert all("screenshot" in t.name for t in result)
     @pytest.mark.asyncio
-    async def test_disallowed_tools_excludes_correctly(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_disallowed_tools_excludes_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that disallowed_tools in agent_config excludes matching patterns."""
         tools = [
             types.Tool(name="tool1", description="Tool 1", inputSchema={}),
@@ -200,27 +214,30 @@ class TestToolFiltering:
             types.Tool(name="internal_secret", description="Tool 3", inputSchema={}),
         ]
         agent_config = {"disallowed_tools": ["debug_*", "internal_*"]}
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
         assert len(result) == 1
         assert result[0].name == "tool1"
     @pytest.mark.asyncio
-    async def test_both_filters_applies_allowed_then_disallowed(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_both_filters_applies_allowed_then_disallowed(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that both filters in agent_config work together (disallowed takes precedence)."""
         tools = [
             types.Tool(name="browser_click", description="Tool 1", inputSchema={}),
             types.Tool(name="browser_debug", description="Tool 2", inputSchema={}),
             types.Tool(name="system_click", description="Tool 3", inputSchema={}),
         ]
-        agent_config = {
-            "allowed_tools": ["browser_*"],
-            "disallowed_tools": ["*_debug"]
-        }
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+        agent_config = {"allowed_tools": ["browser_*"], "disallowed_tools": ["*_debug"]}
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
         assert len(result) == 1
         assert result[0].name == "browser_click"
@@ -247,16 +264,20 @@ class TestRunDatasetToolFiltering:
     @pytest.fixture
     def mock_run_context(self, captured_agent_fixture):
         """Fixture for mocking _run_context."""
         async def _mock(self, context, max_steps=10):
             captured_agent_fixture["agent"] = self
             return Trace(reward=1.0, done=True, content="Done")
         return _mock
     @pytest.fixture
     def mock_call_tools(self):
         """Fixture for mocking call_tools."""
         async def _mock(self, tool_call=None):
             return []
         return _mock
     @pytest.fixture
@@ -271,35 +292,47 @@ class TestRunDatasetToolFiltering:
     @pytest.mark.asyncio
     async def test_agent_config_intersection_union_via_run_dataset(
-        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
     ) -> None:
-        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""
+        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""  # noqa: E501
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
         # Create a task with its own agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
-                "allowed_tools": ["browser_*", "system_screenshot"],  # Task wants browser_* and system_screenshot
-                "disallowed_tools": ["*_debug", "*_execute"],  # Task disallows *_debug and *_execute
-            }
+                "allowed_tools": [
+                    "browser_*",
+                    "system_screenshot",
+                ],  # Task wants browser_* and system_screenshot
+                "disallowed_tools": [
+                    "*_debug",
+                    "*_execute",
+                ],  # Task disallows *_debug and *_execute
+            },
         }
         # Agent config passed to __init__ via run_dataset
         agent_init_config = {
             "allowed_tools": ["browser_*", "system_*"],  # Agent init wants browser_* and system_*
             "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
             "validate_api_key": False,
         }
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_client_instance):
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -308,35 +341,42 @@ class TestRunDatasetToolFiltering:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
             # Verify agent was created and ran
             captured_agent = captured_agent_fixture["agent"]
             assert captured_agent is not None
             # Get the filtered tools
             filtered_tools = captured_agent.get_available_tools()
             filtered_names = {tool.name for tool in filtered_tools}
             # Expected behavior:
-            # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]
+            # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"] # noqa: E501
             #    Exact string intersection: only "browser_*" is in both lists
-            #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug
-            # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_debug", "*_execute"]
+            #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug # noqa: E501
+            # 2. disallowed_tools union: ["browser_debug"] U ["*_debug", "*_execute"]
             #    Result: ["browser_debug", "*_debug", "*_execute"] (all patterns included)
             # 3. Final: {browser_click, browser_type, browser_debug} - {browser_debug}
             #    Result: browser_click, browser_type
             expected_tools = {"browser_click", "browser_type"}
-            assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+            assert filtered_names == expected_tools, (
+                f"Expected {expected_tools}, got {filtered_names}"
+            )
     @pytest.mark.asyncio
     async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
-        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
     ) -> None:
-        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""
+        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""  # noqa: E501
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
         # Create a task with its own agent_config (no allowed_tools)
         task_dict = {
             "prompt": "Test task",
@@ -344,22 +384,23 @@ class TestRunDatasetToolFiltering:
             "agent_config": {
                 # No allowed_tools set - should allow all tools
                 "disallowed_tools": ["*_execute"],  # Task disallows *_execute
-            }
+            },
         }
         # Agent config passed to __init__ via run_dataset (no allowed_tools)
         agent_init_config = {
             # No allowed_tools set - should allow all tools
             "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
             "validate_api_key": False,
         }
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_client_instance):
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -368,25 +409,27 @@ class TestRunDatasetToolFiltering:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
             # Verify agent was created and ran
             captured_agent = captured_agent_fixture["agent"]
             assert captured_agent is not None
             # Get the filtered tools
             filtered_tools = captured_agent.get_available_tools()
             filtered_names = {tool.name for tool in filtered_tools}
             # Expected behavior:
             # 1. allowed_tools: None (no allowed_tools set in either init or task)
             #    Result: All tools are initially allowed
-            # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_execute"]
+            # 2. disallowed_tools union: ["browser_debug"] U ["*_execute"]
             #    Result: ["browser_debug", "*_execute"] (all patterns included)
             # 3. Final: {all tools} - {browser_debug, system_execute}
             #    Result: browser_click, browser_type, system_screenshot
             expected_tools = {"browser_click", "browser_type", "system_screenshot"}
-            assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+            assert filtered_names == expected_tools, (
+                f"Expected {expected_tools}, got {filtered_names}"
+            )
 class TestSystemPromptHandling:
@@ -410,16 +453,20 @@ class TestSystemPromptHandling:
     @pytest.fixture
     def mock_run_context(self, captured_agent_fixture):
         """Fixture for mocking _run_context to capture agent."""
         async def _mock(self, context, max_steps=10):
             captured_agent_fixture["agent"] = self
             return Trace(reward=1.0, done=True, content="Done")
         return _mock
     @pytest.fixture
     def mock_call_tools(self):
         """Fixture for mocking call_tools."""
         async def _mock(self, tool_call=None):
             return []
         return _mock
     @pytest.mark.asyncio
@@ -430,29 +477,30 @@ class TestSystemPromptHandling:
         from hud.agents import ClaudeAgent
         from hud.agents.base import GLOBAL_SYSTEM_PROMPT
         from hud.datasets.runner import run_dataset
         task_system_prompt = "Task prompt"
         # Create a task with its own system_prompt in agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
                 "system_prompt": task_system_prompt,
-            }
+            },
         }
         # Agent config with no custom system_prompt (will use default)
         agent_init_config = {
             "validate_api_key": False,
         }
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -461,11 +509,11 @@ class TestSystemPromptHandling:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
             # Verify agent was created and ran
             captured_agent = captured_agent_fixture["agent"]
             assert captured_agent is not None
             # Verify the task system prompt was appended
             assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
             # Verify it starts with the base global system prompt
@@ -478,31 +526,32 @@ class TestSystemPromptHandling:
         """Test that both agent init and task system prompts are present when both are set."""
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
         agent_custom_prompt = "Agent init prompt"
         task_system_prompt = "Task prompt"
         # Create a task with its own system_prompt in agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
                 "system_prompt": task_system_prompt,
-            }
+            },
         }
         # Agent config WITH custom system_prompt
         agent_init_config = {
             "system_prompt": agent_custom_prompt,
             "validate_api_key": False,
         }
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -511,11 +560,11 @@ class TestSystemPromptHandling:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
             # Verify agent was created and ran
             captured_agent = captured_agent_fixture["agent"]
             assert captured_agent is not None
             # Verify the task system prompt was appended at the end
             assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
             # Verify it starts with the agent custom prompt

hud-python 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl