PyPI - hud-python - Versions diffs - 0.4.47__tar.gz → 0.4.48__tar.gz - Mend

hud-python 0.4.47__tar.gz → 0.4.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (249) hide show

{hud_python-0.4.47 → hud_python-0.4.48}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.47
+Version: 0.4.48
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.47 → hud_python-0.4.48}/hud/agents/base.py RENAMED Viewed

@@ -3,10 +3,11 @@
 from __future__ import annotations
 import asyncio
+import fnmatch
 import json
 import logging
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, ClassVar, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, List, Literal
 import mcp.types as types
@@ -96,12 +97,9 @@ class MCPAgent(ABC):
             self.console.set_verbose(True)
         # User filtering
-        self.allowed_tools = allowed_tools
-        self.disallowed_tools = disallowed_tools or []
-        # Task filtering
-        self.agent_tools = None
-        self.lifecycle_tools = []
+        self.allowed_tools: List[str] | None = allowed_tools
+        self.disallowed_tools: List[str] | None = disallowed_tools
+        self._available_tools: List[types.Tool] | None = None
         # Messages
         self.system_prompt = system_prompt
@@ -109,7 +107,6 @@ class MCPAgent(ABC):
         self.initial_screenshot = initial_screenshot
         # Initialize these here so methods can be called before initialize()
-        self._available_tools: list[types.Tool] = []
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
         self.response_tool_name = None
@@ -146,37 +143,48 @@ class MCPAgent(ABC):
         except Exception as e:
             self._handle_connection_error(e)
-        # If task is provided, add lifecycle tools
+        # If task is provided, apply agent_config and add lifecycle tools
         if isinstance(task, Task):
-            if task.agent_tools:
-                self.agent_tools = task.agent_tools
-            if task.setup_tool:
-                if isinstance(task.setup_tool, list):
-                    for tool in task.setup_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.setup_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.setup_tool.name)
-            if task.evaluate_tool:
-                if isinstance(task.evaluate_tool, list):
-                    for tool in task.evaluate_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.evaluate_tool.name)
-            if task.system_prompt:
-                self.system_prompt += "\n\n" + task.system_prompt
-        # Re-apply filtering with updated lifecycle tools
-        await self._filter_tools()
+            # Apply agent_config if present
+            if task.agent_config:
+                if "system_prompt" in task.agent_config and task.agent_config["system_prompt"]:
+                    self.system_prompt += "\n\n" + task.agent_config["system_prompt"]
+                if "append_setup_output" in task.agent_config:
+                    self.append_setup_output = task.agent_config["append_setup_output"]
+                if "initial_screenshot" in task.agent_config:
+                    self.initial_screenshot = task.agent_config["initial_screenshot"]
+                if "allowed_tools" in task.agent_config:
+                    # If allowed_tools has already been set, we take the intersection of the two
+                    # If the list had been empty, we were allowing all tools, so we overwrite in this
+                    if isinstance(self.allowed_tools, list) and len(self.allowed_tools) > 0:
+                        self.allowed_tools = [tool for tool in self.allowed_tools if tool in task.agent_config["allowed_tools"]]
+                    else:  # If allowed_tools is None, we overwrite it
+                        self.allowed_tools = task.agent_config["allowed_tools"]
+                if "disallowed_tools" in task.agent_config:
+                    # If disallowed_tools has already been set, we take the union of the two
+                    if isinstance(self.disallowed_tools, list):
+                        self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
+                    else:  # If disallowed_tools is None, we overwrite it
+                        self.disallowed_tools = task.agent_config["disallowed_tools"]
+        all_tools = await self.mcp_client.list_tools()
+        self._available_tools = []
+        # Filter tools based on allowed and disallowed patterns
+        # No allowed tools and no disallowed tools -> we accept all tools
+        # No allowed tools and disallowed tools -> we accept all tools except the disallowed ones
+        for tool in all_tools:
+            if self.allowed_tools is not None:
+                if not any(fnmatch.fnmatch(tool.name, pattern) for pattern in self.allowed_tools):
+                    continue
+            if self.disallowed_tools is not None:
+                if any(fnmatch.fnmatch(tool.name, pattern) for pattern in self.disallowed_tools):
+                    continue
+            self._available_tools.append(tool)
+        self.console.info(
+            f"Agent initialized with {len(self.get_available_tools())} tools: {', '.join([t.name for t in self.get_available_tools()])}"  # noqa: E501
+        )
     async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
         """
@@ -575,108 +583,6 @@ class MCPAgent(ABC):
         return await self.format_blocks(blocks)
-    async def _filter_tools(self) -> None:
-        """Apply tool filtering based on allowed/disallowed lists."""
-        # Get all tools from client
-        if self.mcp_client is None:
-            raise ValueError("MCP client is not initialized")
-        all_tools = await self.mcp_client.list_tools()
-        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-        for tool in all_tools:
-            if "response" in tool.name or tool.name == "response":
-                self.console.debug(f"Found response tool: '{tool.name}'")
-                # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                if "_" in tool.name:
-                    server_name = tool.name.split("_", 1)[0]
-                    response_tools_by_server[server_name] = tool.name
-                else:
-                    response_tools_by_server["_default"] = tool.name
-        # Add response tool to lifecycle tools BEFORE filtering
-        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-            # Get server names in order from mcp_config
-            server_names = list(self.mcp_client.mcp_config.keys())
-            self.console.debug(f"Server names: {server_names}")
-            # Try to find response tool from last server first
-            response_tool_name = None
-            for server_name in reversed(server_names):
-                if server_name in response_tools_by_server:
-                    response_tool_name = response_tools_by_server[server_name]
-                    self.console.debug(
-                        f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                    )
-                    break
-            # Fallback to any response tool
-            if not response_tool_name and response_tools_by_server:
-                response_tool_name = next(iter(response_tools_by_server.values()))
-                self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-            # Add to lifecycle tools if found
-            if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                self.response_tool_name = response_tool_name
-                self.lifecycle_tools.append(response_tool_name)
-            elif response_tool_name:
-                self.console.debug(
-                    f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                )
-                self.response_tool_name = response_tool_name
-        else:
-            self.console.debug("No response tools found or no mcp_config")
-        # Filter tools
-        self._available_tools = []
-        self._tool_map = {}
-        self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-        self.console.debug(f"Allowed tools: {self.allowed_tools}")
-        self.console.debug(f"Agent tools: {self.agent_tools}")
-        self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-        self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-        for tool in all_tools:
-            # Lifecycle tools (setup, evaluate, response) should always be included
-            is_lifecycle = tool.name in self.lifecycle_tools
-            # Check if tool should be included
-            if not is_lifecycle:
-                if self.allowed_tools and tool.name not in self.allowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                    continue
-                if self.agent_tools and tool.name not in self.agent_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                    continue
-                if tool.name in self.disallowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                    continue
-            self.console.debug(
-                f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-            )
-            self._available_tools.append(tool)
-            self._tool_map[tool.name] = tool
-        # Check if all required tools are available
-        if self.required_tools:
-            available_tool_names = {tool.name for tool in self._available_tools}
-            missing_tools = [
-                tool for tool in self.required_tools if tool not in available_tool_names
-            ]
-            if missing_tools:
-                raise ValueError(
-                    f"Required tools not available: {missing_tools}. "
-                    f"Available tools: {list(available_tool_names)}"
-                )
-        available_tools = self.get_available_tools()
-        self.console.info(
-            f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-        )
     async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
         """Submit response through lifecycle tool if available.
@@ -715,8 +621,9 @@ class MCPAgent(ABC):
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
-        lifecycle_tool_names = self.lifecycle_tools
-        return [tool for tool in self._available_tools if tool.name not in lifecycle_tool_names]
+        if self._available_tools is None:
+            raise RuntimeError("Tools have not been initialized. Call initialize() before accessing available tools.")
+        return self._available_tools
     def get_tool_schemas(self) -> list[dict]:
         """Get tool schemas in a format suitable for the model."""

{hud_python-0.4.47 → hud_python-0.4.48}/hud/agents/claude.py RENAMED Viewed

@@ -326,7 +326,7 @@ class ClaudeAgent(MCPAgent):
         selected_computer_tool = None
         for priority_name in computer_tool_priority:
-            for tool in self._available_tools:
+            for tool in self.get_available_tools():
                 # Check both exact match and suffix match (for prefixed tools)
                 if tool.name == priority_name or tool.name.endswith(f"_{priority_name}"):
                     selected_computer_tool = tool
@@ -350,13 +350,12 @@ class ClaudeAgent(MCPAgent):
             )
         # Add other non-computer tools
-        for tool in self._available_tools:
-            # Skip computer tools (already handled) and lifecycle tools
-            is_computer_tool = any(
+        for tool in self.get_available_tools():
+            # Skip computer tools (already handled)
+            if any(
                 tool.name == priority_name or tool.name.endswith(f"_{priority_name}")
                 for priority_name in computer_tool_priority
-            )
-            if is_computer_tool or tool.name in self.lifecycle_tools:
+            ):
                 continue
             claude_tool = {

{hud_python-0.4.47 → hud_python-0.4.48}/hud/agents/misc/integration_test_agent.py RENAMED Viewed

@@ -17,6 +17,8 @@ class IntegrationTestRunner(MCPAgent):
             # Initialize using base to set up client and telemetry correctly
             await self.initialize(task)
+            self.console.info(f"Full system prompt: {self.system_prompt}")
             # Validate task shape
             if not getattr(task, "integration_test_tool", None):
                 raise ValueError(

{hud_python-0.4.47 → hud_python-0.4.48}/hud/agents/tests/test_base.py RENAMED Viewed

@@ -326,9 +326,6 @@ class TestBaseMCPAgent:
         """Test getting tool schemas."""
         agent = MockMCPAgent()
-        # Add setup to lifecycle tools to test filtering
-        agent.lifecycle_tools = ["setup"]
         agent._available_tools = [
             types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
             types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
@@ -598,7 +595,7 @@ class TestMCPAgentExtended:
         agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
         await agent.initialize("test")
-        available_names = [tool.name for tool in agent._available_tools]
+        available_names = [tool.name for tool in agent.get_available_tools()]
         assert "tool1" in available_names
         assert "tool3" in available_names
         assert "tool2" not in available_names
@@ -617,7 +614,7 @@ class TestMCPAgentExtended:
         agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
         await agent.initialize("test")
-        available_names = [tool.name for tool in agent._available_tools]
+        available_names = [tool.name for tool in agent.get_available_tools()]
         assert "tool1" in available_names
         assert "tool3" in available_names
         assert "tool2" not in available_names

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/__init__.py RENAMED Viewed

@@ -935,8 +935,8 @@ def eval(
         "--max-concurrent",
         help="Max concurrent tasks (prevents rate limits in both asyncio and parallel modes)",
     ),
-    max_steps: int = typer.Option(
-        30,
+    max_steps: int | None = typer.Option(
+        None,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/eval.py RENAMED Viewed

@@ -199,6 +199,8 @@ async def run_single_task(
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
+    # Provide early feedback to user
+    hud_console.info("🔧 Initializing evaluation...")
     # Import Task and run_dataset lazily
     try:
         from hud.utils.tasks import load_tasks
@@ -318,7 +320,10 @@ async def run_single_task(
         )
         display_group_statistics(stats, show_details=True)
     else:
-        # Original single-run logic
+        # Enable agent step logging for single task mode
+        logging.getLogger("hud.agents").setLevel(logging.INFO)
+        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
         with hud.trace(name=task_prompt):
             agent = build_agent(
                 agent_type,
@@ -352,6 +357,9 @@ async def run_full_dataset(
     Uses either asyncio-based run_dataset or process-based parallel execution
     depending on the parallel flag."""
+    # Provide early feedback to user
+    hud_console.info("🔧 Initializing evaluation...")
     # Import run_dataset lazily
     try:
         from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
@@ -367,7 +375,7 @@ async def run_full_dataset(
     hud_console.info(f"📊 Loading tasks from: {source}…")
     tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
-    if not tasks:
+    if len(tasks) == 0:
         hud_console.error(f"No tasks found in: {source}")
         raise typer.Exit(1)
@@ -646,10 +654,10 @@ def eval_command(
         hud eval hud-evals/SheetBench-50 --full --agent claude
         # Run large dataset with PARALLEL execution (auto-optimized)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
+        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel
         # Parallel mode with manual configuration (16 workers, 25 tasks each)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
+        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel --max-workers 16
         # Limit total concurrent tasks to prevent rate limits
         hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
@@ -674,6 +682,8 @@ def eval_command(
     """
     from hud.settings import settings
+    # Always configure basic logging so agent steps can be logged
+    # Set to INFO by default for consistency with run_evaluation.py
     if very_verbose:
         logging.basicConfig(
             level=logging.DEBUG,
@@ -683,11 +693,6 @@ def eval_command(
         logging.getLogger("hud.agents").setLevel(logging.DEBUG)
         logging.getLogger("hud.agents.base").setLevel(logging.DEBUG)
     elif verbose:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
         logging.getLogger("hud.agents").setLevel(logging.INFO)
         logging.getLogger("hud.agents.base").setLevel(logging.INFO)

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/flows/tasks.py RENAMED Viewed

@@ -364,10 +364,8 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
             item["setup_tool"] = _simplify_tool_call(t.setup_tool)
         if t.evaluate_tool is not None:
             item["evaluate_tool"] = _simplify_tool_call(t.evaluate_tool)
-        if t.agent_tools is not None:
-            item["agent_tools"] = t.agent_tools
-        if t.system_prompt is not None:
-            item["system_prompt"] = t.system_prompt
+        if t.agent_config is not None:
+            item["agent_config"] = t.agent_config
         if t.metadata:
             item["metadata"] = t.metadata
         if t.id is not None:

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/rl/local_runner.py RENAMED Viewed

@@ -230,19 +230,33 @@ def run_local_training(
                 console.print("Enter the model name (HuggingFace ID):")
                 model = input().strip()
-    # Validate model is a VL model (whether provided via CLI or selected)
-    if model:
+    # try to get model from config file
+    if config_file:
+        console.print(f"\n[cyan]Loading configuration from: {config_file}[/cyan]")
+        config = load_config(config_file)
+        if hasattr(config, "model") and hasattr(config.model, "base_model"):
+            if model is None:
+                model = config.model.base_model
+            else:
+                console.print(
+                    f"[yellow]Model already set to {model}, using that instead "
+                    f"of {config.model.base_model}[/yellow] (override)"
+                )
+    if model is None:
+        console.print("[red]❌ No model specified either through CLI or config file[/red]")
         try:
-            validate_vl_model(model)
-        except ValueError as e:
-            console.print(f"\n[red]❌ {e}[/red]")
-            try:
-                import typer
+            import typer
-                raise typer.Exit(1)
-            except Exception:
-                return
-    else:
+            raise typer.Exit(1)
+        except Exception:
+            return
+    # Validate model is a VL model (whether provided via CLI or selected)
+    try:
+        validate_vl_model(model)
+    except ValueError as e:
+        console.print(f"\n[red]❌ {e}[/red]")
         try:
             import typer
@@ -488,7 +502,6 @@ def run_local_training(
         from .vllm import start_vllm_server, wait_for_vllm_server
         start_vllm_server(config.model.base_model, vllm_gpu_idx, restart=restart)
         server_ready = asyncio.run(wait_for_vllm_server())
         if not server_ready:
             console.print("[red]❌ Failed to start vLLM server[/red]")
@@ -507,7 +520,6 @@ def run_local_training(
             f"\n[bold green]🎯 Starting DDP training on {len(training_gpus)} GPUs...[/bold green]\n"
         )
         launch_ddp_training(training_gpus, tasks_file, temp_config_path, verbose)
-        console.print("\n[green]✅ Training completed successfully![/green]")
     else:
         console.print("\n[bold green]🎯 Starting single-GPU training...[/bold green]\n")
         try:

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/rl/vllm.py RENAMED Viewed

@@ -165,6 +165,8 @@ async def wait_for_vllm_server(timeout: int = 360) -> bool:  # noqa: ASYNC109
                 if response.status_code == 200:
                     console.print("[green]✅ vLLM server is ready![/green]")
                     return True
+            except httpx.ConnectError:
+                pass
             except Exception as e:
                 hud_console.error(f"Failed to connect to vLLM server: {e}")

{hud_python-0.4.47 → hud_python-0.4.48}/hud/cli/tests/test_analyze_metadata.py RENAMED Viewed

@@ -214,6 +214,7 @@ class TestAnalyzeFromMetadata:
     @mock.patch("hud.cli.utils.metadata.check_local_cache")
     @mock.patch("hud.cli.utils.metadata.fetch_lock_from_registry")
+    @mock.patch("hud.cli.utils.metadata.hud_console")
     @mock.patch("hud.cli.utils.metadata.console")
     async def test_analyze_not_found(self, mock_console, mock_hud_console, mock_fetch, mock_check):
         """Test when environment not found anywhere."""
@@ -222,9 +223,9 @@ class TestAnalyzeFromMetadata:
         await analyze_from_metadata("test/notfound:latest", "json", verbose=False)
-        # Should show error
+        # Should show error via hud_console
         mock_hud_console.error.assert_called_with("Environment metadata not found")
-        # Should print suggestions
+        # Should print suggestions via console
         mock_console.print.assert_called()
     @mock.patch("hud.cli.utils.metadata.check_local_cache")

hud-python 0.4.47__tar.gz → 0.4.48__tar.gz

Potentially problematic release.

hud-python 0.4.47__tar.gz → 0.4.48__tar.gz