hud-python 0.4.46__py3-none-any.whl → 0.4.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/agents/base.py +49 -142
- hud/agents/claude.py +5 -6
- hud/agents/misc/integration_test_agent.py +2 -0
- hud/agents/tests/test_base.py +2 -5
- hud/cli/__init__.py +2 -2
- hud/cli/eval.py +14 -9
- hud/cli/flows/tasks.py +2 -4
- hud/cli/rl/local_runner.py +25 -13
- hud/cli/rl/vllm.py +2 -0
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_eval.py +525 -0
- hud/cli/tests/test_utils.py +1 -1
- hud/datasets/parallel.py +0 -12
- hud/datasets/runner.py +1 -4
- hud/rl/actor.py +4 -2
- hud/rl/distributed.py +1 -1
- hud/rl/learner.py +2 -1
- hud/rl/train.py +1 -1
- hud/telemetry/trace.py +1 -1
- hud/tools/base.py +11 -9
- hud/tools/computer/__init__.py +2 -0
- hud/tools/computer/qwen.py +431 -0
- hud/tools/computer/settings.py +16 -0
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/playwright.py +1 -1
- hud/types.py +2 -3
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/METADATA +1 -1
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/RECORD +33 -31
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/WHEEL +0 -0
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/licenses/LICENSE +0 -0
hud/cli/tests/test_eval.py
ADDED
@@ -0,0 +1,525 @@
+"""Tests for hud.cli.eval module."""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+from mcp import types
+
+from hud.cli.eval import build_agent, eval_command, get_available_models, run_full_dataset, run_single_task
+from hud.types import Task, Trace
+
+class TestBuildAgent:
+    """Test the build_agent function."""
+
+    def test_builds_integration_test_agent(self) -> None:
+        """
+        Test building an integration test agent.
+        """
+        with patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with verbose=False
+            result = build_agent("integration_test", verbose=False)
+
+            mock_runner.assert_called_once_with(verbose=False)
+            assert result == mock_instance
+
+    def test_builds_claude_agent(self) -> None:
+        """
+        Test building a Claude agent with default model.
+        """
+        with patch("hud.agents.ClaudeAgent") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with verbose=False
+            result = build_agent("claude", verbose=False)
+
+            mock_runner.assert_called_once_with(
+                model="claude-sonnet-4-20250514",
+                verbose=False
+            )
+            assert result == mock_instance
+
+    def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
+        """
+        Test building a Claude agent with custom model name and allowed tools.
+        """
+        with patch("hud.agents.ClaudeAgent") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with custom model, allowed tools, and verbose=True
+            result = build_agent(
+                "claude",
+                model="claude-sonnet-4-20250514",
+                allowed_tools=["act"],
+                verbose=True,
+            )
+
+            mock_runner.assert_called_once_with(
+                model="claude-sonnet-4-20250514",
+                allowed_tools=["act"],
+                verbose=True,
+            )
+            assert result == mock_instance
+
+
+class TestRunSingleTask:
+    """Test the run_single_task function."""
+
+    @pytest.mark.asyncio
+    async def test_applies_agent_config_from_task(self) -> None:
+        """Test that task.agent_config is applied during agent initialization."""
+        mock_task = Task(
+            prompt="Test",
+            mcp_config={"local": {"url": "http://localhost:8765/mcp"}},
+            agent_config={
+                "system_prompt": "Custom instructions",
+                "allowed_tools": ["tool1", "tool2"],
+                "append_setup_output": False,
+            }
+        )
+        mock_agent = AsyncMock(
+            initialize=AsyncMock(),
+            run=AsyncMock(return_value=Trace(reward=1.0, done=True))
+        )
+
+        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
+             patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner", return_value=mock_agent), \
+             patch("hud.cli.eval.find_environment_dir", return_value=None), \
+             patch("hud.cli.eval.hud.trace"):
+            await run_single_task("test.json", agent_type="integration_test", max_steps=10)
+
+        # Verify agent.run was called with the task containing agent_config
+        mock_agent.run.assert_called_once()
+        called_task = mock_agent.run.call_args[0][0]
+        assert called_task.agent_config == mock_task.agent_config
+
+    @pytest.mark.asyncio
+    async def test_runs_with_group_size_greater_than_one(self) -> None:
+        """Test that group_size > 1 triggers run_tasks_grouped instead of agent.run."""
+        mock_task = Task(prompt="Test", mcp_config={"local": {"url": "http://localhost:8765/mcp"}})
+
+        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
+             patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped, \
+             patch("hud.cli.eval.display_group_statistics"), \
+             patch("hud.cli.eval.find_environment_dir", return_value=None), \
+             patch("hud.cli.eval.hud.trace"):
+
+            mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
+
+            await run_single_task("test.json", agent_type="integration_test", group_size=3, max_steps=10)
+
+        # Verify run_tasks_grouped was called with correct group_size
+        mock_grouped.assert_called_once()
+        assert mock_grouped.call_args.kwargs["group_size"] == 3
+        assert mock_grouped.call_args.kwargs["max_steps"] == 10
+
+
+class TestToolFiltering:
+    """Test wildcard tool filtering via agent_config in tasks."""
+
+    @pytest.fixture
+    def mock_mcp_client(self):
+        """Fixture for mock MCP client."""
+        client = MagicMock()
+        client.initialize = AsyncMock()
+        client.mcp_config = {"local": {"url": "http://localhost"}}
+        return client
+
+    @pytest.fixture
+    def mock_model_client(self):
+        """Fixture for mock Anthropic client."""
+        return MagicMock()
+
+    async def _run_agent_with_tools(
+        self,
+        mock_mcp_client: MagicMock,
+        mock_model_client: MagicMock,
+        tools: list[types.Tool],
+        agent_config: dict | None = None,
+    ) -> list[types.Tool]:
+        """Helper to create agent, initialize with tools and config, return filtered tools."""
+        from hud.agents import ClaudeAgent
+
+        mock_mcp_client.list_tools = AsyncMock(return_value=tools)
+
+        task = Task(
+            prompt="Test",
+            mcp_config={"local": {"url": "http://localhost"}},
+            agent_config=agent_config or {}
+        )
+
+        agent = ClaudeAgent(
+            mcp_client=mock_mcp_client,
+            model_client=mock_model_client,
+            model="test",
+            validate_api_key=False
+        )
+        await agent.initialize(task)
+        return agent.get_available_tools()
+
+    @pytest.mark.asyncio
+    async def test_no_filters_returns_all_tools(self, mock_mcp_client, mock_model_client) -> None:
+        """Test that no filters in agent_config returns all tools."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+            types.Tool(name="debug_tool", description="Debug", inputSchema={}),
+        ]
+
+        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools)
+
+        assert len(result) == 3
+
+    @pytest.mark.asyncio
+    async def test_allowed_tools_filters_correctly(self, mock_mcp_client, mock_model_client) -> None:
+        """Test that allowed_tools in agent_config filters to matching patterns."""
+        tools = [
+            types.Tool(name="screenshot_take", description="Tool 1", inputSchema={}),
+            types.Tool(name="screenshot_full", description="Tool 2", inputSchema={}),
+            types.Tool(name="click", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {"allowed_tools": ["screenshot_*"]}
+
+        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+
+        assert len(result) == 2
+        assert all("screenshot" in t.name for t in result)
+
+    @pytest.mark.asyncio
+    async def test_disallowed_tools_excludes_correctly(self, mock_mcp_client, mock_model_client) -> None:
+        """Test that disallowed_tools in agent_config excludes matching patterns."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="debug_tool", description="Tool 2", inputSchema={}),
+            types.Tool(name="internal_secret", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {"disallowed_tools": ["debug_*", "internal_*"]}
+
+        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+
+        assert len(result) == 1
+        assert result[0].name == "tool1"
+
+    @pytest.mark.asyncio
+    async def test_both_filters_applies_allowed_then_disallowed(self, mock_mcp_client, mock_model_client) -> None:
+        """Test that both filters in agent_config work together (disallowed takes precedence)."""
+        tools = [
+            types.Tool(name="browser_click", description="Tool 1", inputSchema={}),
+            types.Tool(name="browser_debug", description="Tool 2", inputSchema={}),
+            types.Tool(name="system_click", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {
+            "allowed_tools": ["browser_*"],
+            "disallowed_tools": ["*_debug"]
+        }
+
+        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
+
+        assert len(result) == 1
+        assert result[0].name == "browser_click"
+
+
+class TestRunDatasetToolFiltering:
+    """Test tool filtering via run_dataset with agent_config in both init and task."""
+
+    @pytest.fixture
+    def all_tools(self):
+        """Fixture for a standard set of tools."""
+        return [
+            types.Tool(name="browser_click", description="Click", inputSchema={}),
+            types.Tool(name="browser_type", description="Type", inputSchema={}),
+            types.Tool(name="browser_debug", description="Debug", inputSchema={}),
+            types.Tool(name="system_screenshot", description="Screenshot", inputSchema={}),
+            types.Tool(name="system_execute", description="Execute", inputSchema={}),
+        ]
+
+    @pytest.fixture
+    def captured_agent_fixture(self):
+        """Fixture that returns a dictionary to capture the agent instance."""
+        return {"agent": None}
+
+    @pytest.fixture
+    def mock_run_context(self, captured_agent_fixture):
+        """Fixture for mocking _run_context."""
+        async def _mock(self, context, max_steps=10):
+            captured_agent_fixture["agent"] = self
+            return Trace(reward=1.0, done=True, content="Done")
+        return _mock
+
+    @pytest.fixture
+    def mock_call_tools(self):
+        """Fixture for mocking call_tools."""
+        async def _mock(self, tool_call=None):
+            return []
+        return _mock
+
+    @pytest.fixture
+    def mock_client_instance(self, all_tools):
+        """Fixture for mock MCP client instance."""
+        mock_client = MagicMock()
+        mock_client.initialize = AsyncMock()
+        mock_client.list_tools = AsyncMock(return_value=all_tools)
+        mock_client.shutdown = AsyncMock()
+        mock_client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
+        return mock_client
+
+    @pytest.mark.asyncio
+    async def test_agent_config_intersection_union_via_run_dataset(
+        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+    ) -> None:
+        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        # Create a task with its own agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "allowed_tools": ["browser_*", "system_screenshot"],  # Task wants browser_* and system_screenshot
+                "disallowed_tools": ["*_debug", "*_execute"],  # Task disallows *_debug and *_execute
+            }
+        }
+
+        # Agent config passed to __init__ via run_dataset
+        agent_init_config = {
+            "allowed_tools": ["browser_*", "system_*"],  # Agent init wants browser_* and system_*
+            "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
+            "validate_api_key": False,
+        }
+
+        with patch("hud.job"), \
+             patch("hud.trace"), \
+             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
+             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
+             patch("hud.clients.MCPClient", return_value=mock_client_instance):
+
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Get the filtered tools
+        filtered_tools = captured_agent.get_available_tools()
+        filtered_names = {tool.name for tool in filtered_tools}
+
+        # Expected behavior:
+        # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]
+        #    Exact string intersection: only "browser_*" is in both lists
+        #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug
+        # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_debug", "*_execute"]
+        #    Result: ["browser_debug", "*_debug", "*_execute"] (all patterns included)
+        # 3. Final: {browser_click, browser_type, browser_debug} - {browser_debug}
+        #    Result: browser_click, browser_type
+
+        expected_tools = {"browser_click", "browser_type"}
+        assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+
+    @pytest.mark.asyncio
+    async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
+        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+    ) -> None:
+        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        # Create a task with its own agent_config (no allowed_tools)
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                # No allowed_tools set - should allow all tools
+                "disallowed_tools": ["*_execute"],  # Task disallows *_execute
+            }
+        }
+
+        # Agent config passed to __init__ via run_dataset (no allowed_tools)
+        agent_init_config = {
+            # No allowed_tools set - should allow all tools
+            "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
+            "validate_api_key": False,
+        }
+
+        with patch("hud.job"), \
+             patch("hud.trace"), \
+             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
+             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
+             patch("hud.clients.MCPClient", return_value=mock_client_instance):
+
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Get the filtered tools
+        filtered_tools = captured_agent.get_available_tools()
+        filtered_names = {tool.name for tool in filtered_tools}
+
+        # Expected behavior:
+        # 1. allowed_tools: None (no allowed_tools set in either init or task)
+        #    Result: All tools are initially allowed
+        # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_execute"]
+        #    Result: ["browser_debug", "*_execute"] (all patterns included)
+        # 3. Final: {all tools} - {browser_debug, system_execute}
+        #    Result: browser_click, browser_type, system_screenshot
+
+        expected_tools = {"browser_click", "browser_type", "system_screenshot"}
+        assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+
+
+class TestSystemPromptHandling:
+    """Test system prompt handling through run_dataset flow."""
+
+    @pytest.fixture
+    def mock_mcp_client(self):
+        """Fixture for mock MCP client."""
+        client = MagicMock()
+        client.initialize = AsyncMock()
+        client.list_tools = AsyncMock(return_value=[])
+        client.shutdown = AsyncMock()
+        client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
+        return client
+
+    @pytest.fixture
+    def captured_agent_fixture(self):
+        """Fixture that returns a dictionary to capture the agent instance."""
+        return {"agent": None}
+
+    @pytest.fixture
+    def mock_run_context(self, captured_agent_fixture):
+        """Fixture for mocking _run_context to capture agent."""
+        async def _mock(self, context, max_steps=10):
+            captured_agent_fixture["agent"] = self
+            return Trace(reward=1.0, done=True, content="Done")
+        return _mock
+
+    @pytest.fixture
+    def mock_call_tools(self):
+        """Fixture for mocking call_tools."""
+        async def _mock(self, tool_call=None):
+            return []
+        return _mock
+
+    @pytest.mark.asyncio
+    async def test_task_system_prompt_only(
+        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
+    ) -> None:
+        """Test that task system_prompt is appended when agent has default system prompt."""
+        from hud.agents import ClaudeAgent
+        from hud.agents.base import GLOBAL_SYSTEM_PROMPT
+        from hud.datasets.runner import run_dataset
+
+        task_system_prompt = "Task prompt"
+
+        # Create a task with its own system_prompt in agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "system_prompt": task_system_prompt,
+            }
+        }
+
+        # Agent config with no custom system_prompt (will use default)
+        agent_init_config = {
+            "validate_api_key": False,
+        }
+
+        with patch("hud.job"), \
+             patch("hud.trace"), \
+             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
+             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
+             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
+
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Verify the task system prompt was appended
+        assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
+        # Verify it starts with the base global system prompt
+        assert captured_agent.system_prompt.startswith(GLOBAL_SYSTEM_PROMPT)
+
+    @pytest.mark.asyncio
+    async def test_both_agent_and_task_system_prompts(
+        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
+    ) -> None:
+        """Test that both agent init and task system prompts are present when both are set."""
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        agent_custom_prompt = "Agent init prompt"
+        task_system_prompt = "Task prompt"
+
+        # Create a task with its own system_prompt in agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "system_prompt": task_system_prompt,
+            }
+        }
+
+        # Agent config WITH custom system_prompt
+        agent_init_config = {
+            "system_prompt": agent_custom_prompt,
+            "validate_api_key": False,
+        }
+
+        with patch("hud.job"), \
+             patch("hud.trace"), \
+             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
+             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
+             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
+
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Verify the task system prompt was appended at the end
+        assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
+        # Verify it starts with the agent custom prompt
+        assert captured_agent.system_prompt.startswith(agent_custom_prompt)
+        # Verify both prompts are present
+        assert agent_custom_prompt in captured_agent.system_prompt
+        assert task_system_prompt in captured_agent.system_prompt
hud/cli/tests/test_utils.py
CHANGED
@@ -22,7 +22,7 @@ class TestColors:
         assert Colors.YELLOW == "\033[93m"
         assert Colors.GOLD == "\033[33m"
         assert Colors.RED == "\033[91m"
-        assert Colors.GRAY == "\033[
+        assert Colors.GRAY == "\033[37m"
         assert Colors.ENDC == "\033[0m"
         assert Colors.BOLD == "\033[1m"

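Note: the removed line is truncated in this rendering, so only the corrected constant is fully visible. For reference, this is standard ANSI SGR behavior rather than anything hud-specific: code 37 selects the white foreground (usually rendered as light gray), and 0 resets.

# Standard ANSI SGR codes, not hud-specific: 37 = white/light-gray
# foreground, 0 = reset all attributes.
print("\033[37m" + "gray sample" + "\033[0m")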
hud/datasets/parallel.py
CHANGED
@@ -261,7 +261,6 @@ async def run_dataset_parallel_manual(
     max_steps: int = 10,
     split: str = "train",
     auto_respond: bool = False,
-    custom_system_prompt: str | None = None,
 ) -> list[Any]:
     """
     Run all tasks in a dataset using process-based parallelism with manual configuration.
@@ -282,7 +281,6 @@ async def run_dataset_parallel_manual(
         max_steps: Maximum steps per task
         split: Dataset split when loading from string
         auto_respond: Whether to use ResponseAgent
-        custom_system_prompt: Override system prompt for all tasks

     Returns:
         List of results in the same order as the input dataset
@@ -349,14 +347,6 @@ async def run_dataset_parallel_manual(
     else:
         raise ValueError(f"Dataset must be string, Dataset, or list, got {type(dataset)}")

-    # Apply custom system prompt if provided
-    if custom_system_prompt:
-        for task_dict in task_dicts:
-            if "system_prompt" not in task_dict:
-                task_dict["system_prompt"] = custom_system_prompt
-            else:
-                task_dict["system_prompt"] += "\n" + custom_system_prompt
-
     # Prepare job metadata
     job_metadata = metadata or {}
     job_metadata.update(
@@ -380,8 +370,6 @@ async def run_dataset_parallel_manual(
     except Exception:
         logger.warning("Failed to extract dataset verification info")

-    # task_dicts = task_dicts[:10]
-
     # Create job context
     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
         # Prepare agent class info for pickling
hud/datasets/runner.py
CHANGED
@@ -27,7 +27,6 @@ async def run_dataset(
     max_steps: int = 10,
     split: str = "train",
     auto_respond: bool = False,
-    custom_system_prompt: str | None = None,
 ) -> list[Any]:
     """
     Run all tasks in a dataset with automatic job tracking.
@@ -43,7 +42,6 @@ async def run_dataset(
         max_steps: Maximum steps per task
         split: Dataset split to use when loading from string (default: "train")
         auto_respond: Whether to use auto-response agent
-        custom_system_prompt: Override system prompt for all tasks

     Returns:
         List of results from agent.run() in dataset order
@@ -102,8 +100,7 @@ async def run_dataset(
         async with sem:
             # Create trace for this task
             task_name = task_dict.get("prompt") or f"Task {index}"
-
-            task_dict["system_prompt"] = custom_system_prompt
+
             # Ensure task_id is a string for baggage propagation
             raw_task_id = task_dict.get("id")
             safe_task_id = str(raw_task_id) if raw_task_id is not None else None
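Note: together with the matching removal in hud/datasets/parallel.py above, this drops the custom_system_prompt parameter entirely; the removed line here also overwrote task_dict["system_prompt"] unconditionally, even when the override was None. Judging from the new tests in hud/cli/tests/test_eval.py, the prompt now travels on each task's agent_config. A hedged migration sketch follows; the dict shape is taken from those tests and is not documented API.

# Before (<= 0.4.46, now removed):
#     await run_dataset(..., custom_system_prompt="Custom instructions")
# After (0.4.48), per the new tests: carry the prompt on each task dict.
task_dict = {
    "prompt": "Test task",
    "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
    "agent_config": {"system_prompt": "Custom instructions"},
}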
hud/rl/actor.py
CHANGED
@@ -37,7 +37,7 @@ class Actor:
         # Match connection limits to parallel_episodes to avoid bottlenecks
         # Use shorter per-request timeout and keep retries modest to avoid long blocking
         http_client = create_retry_httpx_client(
-            timeout=httpx.Timeout(
+            timeout=httpx.Timeout(60.0),
         )
         return AsyncOpenAI(
             base_url=base_url,
@@ -151,7 +151,9 @@ if __name__ == "__main__":
             "name": "evaluate",
             "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
         },
-        "
+        "agent_config": {
+            "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.",  # noqa: E501
+        },
     }

     task = Task(**task_data)
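Note: the removed lines in both hunks are truncated in this rendering, so only the new code is fully visible. The first hunk settles on a single 60-second budget; as standard httpx behavior (not specific to create_retry_httpx_client), one positional value applies the same limit to the connect, read, write, and pool phases, and individual phases can still be overridden:

import httpx

# One number = 60s each for connect, read, write, and pool acquisition.
client = httpx.AsyncClient(timeout=httpx.Timeout(60.0))
# Per-phase override remains possible, e.g. a tighter connect budget:
strict = httpx.Timeout(60.0, connect=5.0)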
hud/rl/distributed.py
CHANGED
@@ -81,7 +81,7 @@ def broadcast_object(obj: Any, src: int = 0) -> Any:
         return obj

     obj_list = [obj] if dist.get_rank() == src else [None]
-    dist.broadcast_object_list(obj_list, src=src
+    dist.broadcast_object_list(obj_list, src=src)
     return obj_list[0]

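Note: the change restores the closing parenthesis lost on the old line. For context, the surrounding lines show the standard torch.distributed pattern this helper wraps; a minimal self-contained sketch, where the initialization guard is an assumption added for illustration:

from typing import Any

import torch.distributed as dist

def broadcast_object(obj: Any, src: int = 0) -> Any:
    # Nothing to synchronize outside an initialized process group.
    if not (dist.is_available() and dist.is_initialized()):
        return obj
    # broadcast_object_list fills the list in place on non-source ranks.
    obj_list = [obj] if dist.get_rank() == src else [None]
    dist.broadcast_object_list(obj_list, src=src)
    return obj_list[0]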
hud/rl/learner.py
CHANGED
@@ -148,11 +148,12 @@ class GRPOLearner:

         # Add LoRA adapters or load existing adapter
         policy.config.use_cache = False
-
+
         if model_cfg.adapter_path:
             # Load existing adapter as baseline
             self.log(f"Loading existing LoRA adapter from: {model_cfg.adapter_path}")
             from peft import PeftModel
+
             policy = PeftModel.from_pretrained(policy, model_cfg.adapter_path)
             # Enable adapter training
             policy.train()
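Note: this hunk is whitespace-only, but its context lines show the adapter-loading pattern. A general peft sketch of that pattern, where the model name and adapter path are placeholders and is_trainable=True is an addition of this sketch (peft otherwise loads adapters frozen):

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Placeholders for illustration only; not values from hud-python.
base = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
policy = PeftModel.from_pretrained(base, "./existing-adapter", is_trainable=True)
policy.train()  # training mode, as the learner does after loading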
hud/rl/train.py
CHANGED
@@ -95,7 +95,7 @@ async def train(config: Config, tasks: list[Task]) -> None:
         if is_main_process()
         else None
     )
-
+
     # Load initial adapter if provided
     if is_main_process() and config.model.adapter_path and vllm:
         hud_console.info(f"Loading baseline adapter from: {config.model.adapter_path}")
hud/telemetry/trace.py
CHANGED
@@ -139,7 +139,7 @@ def trace(
     else:
         # Use a placeholder for custom backends
         logger.warning(
-            "HUD API key is not set, using a placeholder for the task run ID. If this looks wrong, check your API key."
+            "HUD API key is not set, using a placeholder for the task run ID. If this looks wrong, check your API key."  # noqa: E501
         )
         task_run_id = str(uuid.uuid4())
