PyPI - hud-python - Versions diffs - 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl - Mend

hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (54) hide show

hud/__init__.py +7 -0
hud/agents/base.py +42 -10
hud/agents/claude.py +24 -14
hud/agents/grounded_openai.py +280 -0
hud/agents/tests/test_client.py +11 -27
hud/agents/tests/test_grounded_openai_agent.py +155 -0
hud/cli/__init__.py +50 -20
hud/cli/build.py +3 -44
hud/cli/eval.py +25 -6
hud/cli/init.py +4 -4
hud/cli/push.py +3 -1
hud/cli/tests/test_push.py +6 -6
hud/cli/utils/interactive.py +1 -1
hud/clients/__init__.py +3 -2
hud/clients/base.py +20 -9
hud/clients/mcp_use.py +44 -22
hud/datasets/task.py +6 -2
hud/native/__init__.py +6 -0
hud/native/comparator.py +546 -0
hud/native/tests/__init__.py +1 -0
hud/native/tests/test_comparator.py +539 -0
hud/native/tests/test_native_init.py +79 -0
hud/otel/instrumentation.py +0 -2
hud/server/server.py +9 -2
hud/settings.py +6 -0
hud/shared/exceptions.py +204 -31
hud/shared/hints.py +177 -0
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +385 -144
hud/tools/__init__.py +2 -0
hud/tools/executors/tests/test_base_executor.py +1 -1
hud/tools/executors/xdo.py +1 -1
hud/tools/grounding/__init__.py +13 -0
hud/tools/grounding/config.py +54 -0
hud/tools/grounding/grounded_tool.py +314 -0
hud/tools/grounding/grounder.py +301 -0
hud/tools/grounding/tests/__init__.py +1 -0
hud/tools/grounding/tests/test_grounded_tool.py +196 -0
hud/tools/submit.py +66 -0
hud/tools/tests/test_playwright_tool.py +1 -1
hud/tools/tests/test_tools_init.py +1 -1
hud/tools/tests/test_utils.py +2 -2
hud/types.py +33 -5
hud/utils/agent_factories.py +86 -0
hud/utils/design.py +57 -0
hud/utils/mcp.py +6 -0
hud/utils/pretty_errors.py +68 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/METADATA +2 -4
{hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/RECORD +54 -37
{hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
{hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0

hud/cli/__init__.py CHANGED Viewed

@@ -43,6 +43,12 @@ app = typer.Typer(
 console = Console()
+# Standard support hint appended to error outputs
+SUPPORT_HINT = (
+    "If this looks like an issue with the sdk, please make a github issue at "
+    "https://github.com/hud-evals/hud-python/issues"
+)
 # Capture IMAGE and any following Docker args as a single variadic argument list.
 @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
@@ -818,6 +824,11 @@ def eval(
         "--max-concurrent-per-worker",
         help="Maximum concurrent tasks per worker in parallel mode",
     ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        help="Enable verbose output from the agent",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents."""
     from hud.utils.design import HUDDesign
@@ -912,6 +923,7 @@ def eval(
         parallel=parallel,
         max_workers=max_workers,
         max_concurrent_per_worker=max_concurrent_per_worker,
+        verbose=verbose,
     )
@@ -950,27 +962,45 @@ def hf(
 def main() -> None:
     """Main entry point for the CLI."""
-    # Show header for main help
-    if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"]):
-        console.print(
-            Panel.fit(
-                "[bold cyan]🚀 HUD CLI[/bold cyan]\nMCP Environment Analysis & Debugging",
-                border_style="cyan",
+    try:
+        # Show header for main help
+        if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"]):
+            console.print(
+                Panel.fit(
+                    "[bold cyan]🚀 HUD CLI[/bold cyan]\nMCP Environment Analysis & Debugging",
+                    border_style="cyan",
+                )
             )
-        )
-        console.print("\n[yellow]Quick Start:[/yellow]")
-        console.print("  1. Create a new environment: [cyan]hud init my-env && cd my-env[/cyan]")
-        console.print("  2. Develop with hot-reload: [cyan]hud dev --interactive[/cyan]")
-        console.print("  3. Build for production: [cyan]hud build[/cyan]")
-        console.print("  4. Share your environment: [cyan]hud push[/cyan]")
-        console.print("  5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
-        console.print("  6. Run and test: [cyan]hud run <image>[/cyan]")
-        console.print("\n[yellow]RL Training:[/yellow]")
-        console.print("  1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
-        console.print("  2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]")
-        console.print("  3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
-    app()
+            console.print("\n[yellow]Quick Start:[/yellow]")
+            console.print(
+                "  1. Create a new environment: [cyan]hud init my-env && cd my-env[/cyan]"
+            )
+            console.print("  2. Develop with hot-reload: [cyan]hud dev --interactive[/cyan]")
+            console.print("  3. Build for production: [cyan]hud build[/cyan]")
+            console.print("  4. Share your environment: [cyan]hud push[/cyan]")
+            console.print("  5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
+            console.print("  6. Run and test: [cyan]hud run <image>[/cyan]")
+            console.print("\n[yellow]RL Training:[/yellow]")
+            console.print("  1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
+            console.print(
+                "  2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]"
+            )
+            console.print("  3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
+        app()
+    except typer.Exit as e:
+        # Append SDK support hint for non-zero exits
+        try:
+            exit_code = getattr(e, "exit_code", 0)
+        except Exception:
+            exit_code = 1
+        if exit_code != 0:
+            from hud.utils.design import design
+            design.info(SUPPORT_HINT)
+        raise
+    except Exception:
+        raise
 if __name__ == "__main__":

hud/cli/build.py CHANGED Viewed

@@ -204,30 +204,10 @@ async def analyze_mcp_environment(
             "success": True,
         }
     except Exception as e:
-        import traceback
+        from hud.shared.exceptions import HudException
-        error_msg = str(e)
-        if verbose:
-            design.error(f"Failed to analyze environment: {error_msg}")
-            design.error(f"Traceback:\n{traceback.format_exc()}")
-        # Common issues
-        if "Connection reset" in error_msg or "EOF" in error_msg:
-            design.warning(
-                "The MCP server may have crashed on startup. Check your server.py for errors."
-            )
-        elif "timeout" in error_msg:
-            design.warning(
-                "The MCP server took too long to initialize. It might need more startup time."
-            )
-        return {
-            "initializeMs": 0,
-            "toolCount": 0,
-            "tools": [],
-            "success": False,
-            "error": error_msg,
-        }
+        # Convert to HudException for better error messages and hints
+        raise HudException from e
     finally:
         # Only shutdown if we successfully initialized
         if initialized:
@@ -340,27 +320,6 @@ def build_environment(
     finally:
         loop.close()
-    if not analysis["success"]:
-        design.error("Failed to analyze MCP environment")
-        if "error" in analysis:
-            design.error(f"Error: {analysis['error']}")
-        # Provide helpful debugging tips
-        design.section_title("Debugging Tips")
-        design.info("1. Debug your environment build:")
-        design.command_example("hud debug . --build")
-        design.dim_info("   This will", "test MCP server connection and show detailed logs")
-        design.info("")
-        design.info("2. Check for common issues:")
-        design.info("   - Server crashes on startup")
-        design.info("   - Missing dependencies")
-        design.info("   - Syntax errors in server.py")
-        design.info("")
-        design.info("3. Run with verbose mode:")
-        design.command_example("hud build . --verbose")
-        raise typer.Exit(1)
     design.success(f"Analyzed environment: {analysis['toolCount']} tools found")
     # Extract environment variables from Dockerfile

hud/cli/eval.py CHANGED Viewed

@@ -22,6 +22,7 @@ def build_agent(
     *,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
+    verbose: bool = False,
 ) -> Any:
     """Create and return the requested agent type."""
@@ -39,9 +40,10 @@ def build_agent(
         if allowed_tools:
             return OperatorAgent(
                 allowed_tools=allowed_tools,
+                verbose=verbose,
             )
         else:
-            return OperatorAgent()
+            return OperatorAgent(verbose=verbose)
     # Fallback Claude agent (Anthropic)
     try:
@@ -59,10 +61,12 @@ def build_agent(
         return ClaudeAgent(
             model=model,
             allowed_tools=allowed_tools,
+            verbose=verbose,
         )
     else:
         return ClaudeAgent(
             model=model,
+            verbose=verbose,
         )
@@ -73,6 +77,7 @@ async def run_single_task(
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,
+    verbose: bool = False,
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
@@ -82,7 +87,7 @@ async def run_single_task(
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+            "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
         )
         raise typer.Exit(1) from e
@@ -106,11 +111,11 @@ async def run_single_task(
                 except ImportError as e:
                     design.error(
                         "OpenAI agent dependencies are not installed. "
-                        "Please install with: pip install 'hud-python[agent]'"
+                        "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
                     )
                     raise typer.Exit(1) from e
-                agent_config: dict[str, Any] = {}
+                agent_config: dict[str, Any] = {"verbose": verbose}
                 if allowed_tools:
                     agent_config["allowed_tools"] = allowed_tools
@@ -128,6 +133,7 @@ async def run_single_task(
                 agent_config = {
                     "model": model or "claude-sonnet-4-20250514",
+                    "verbose": verbose,
                 }
                 if allowed_tools:
                     agent_config["allowed_tools"] = allowed_tools
@@ -182,6 +188,7 @@ async def run_single_task(
             agent_type,
             model=model,
             allowed_tools=allowed_tools,
+            verbose=verbose,
         )
         design.info(task.prompt)
         result = await agent.run(task, max_steps=max_steps)
@@ -199,6 +206,7 @@ async def run_full_dataset(
     parallel: bool = False,
     max_workers: int | None = None,
     max_concurrent_per_worker: int = 25,
+    verbose: bool = False,
 ) -> list[Any]:
     """Run evaluation across the entire dataset.
@@ -211,7 +219,7 @@ async def run_full_dataset(
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+            "Please install with: pip install 'hud-python[[agent]]'"
         )
         raise typer.Exit(1) from e
@@ -245,7 +253,7 @@ async def run_full_dataset(
             )
             raise typer.Exit(1) from e
-        agent_config: dict[str, Any] = {}
+        agent_config: dict[str, Any] = {"verbose": verbose}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -263,6 +271,7 @@ async def run_full_dataset(
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
+            "verbose": verbose,
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -360,6 +369,11 @@ def eval_command(
         "--max-concurrent-per-worker",
         help="Maximum concurrent tasks per worker in parallel mode",
     ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        help="Enable verbose output from the agent",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
@@ -387,6 +401,9 @@ def eval_command(
         # Run with OpenAI Operator agent
         hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+        # Run with verbose output for debugging
+        hud eval task.json --verbose
     """
     from hud.settings import settings
@@ -428,6 +445,7 @@ def eval_command(
                 parallel=parallel,
                 max_workers=max_workers,
                 max_concurrent_per_worker=max_concurrent_per_worker,
+                verbose=verbose,
             )
         )
     else:
@@ -438,5 +456,6 @@ def eval_command(
                 model=model,
                 allowed_tools=allowed_tools_list,
                 max_steps=max_steps,
+                verbose=verbose,
             )
         )

hud/cli/init.py CHANGED Viewed

@@ -182,15 +182,15 @@ async def run_task(task_data: dict):
         await client.initialize()
         result = await client.call_tool(task.setup_tool) # type: ignore
-        print(f"✅ Setup: {{result.content}}")
+        print(f"✅ Setup: {result.content}")
         print("\\n🔄 Performing actions:")
         for _ in range(10):
-            result = await client.call_tool(name="act", arguments={{}})
-            print(f"  {{result.content}}")
+            result = await client.call_tool(name="act", arguments={})
+            print(f"  {result.content}")
         result = await client.call_tool(task.evaluate_tool) # type: ignore
-        print(f"\\n📊 Evaluation: {{result.content}}")
+        print(f"\\n📊 Evaluation: {result.content}")
         return result.content
     except Exception as e:

hud/cli/push.py CHANGED Viewed

@@ -11,7 +11,6 @@ import requests
 import typer
 import yaml
-from hud.settings import settings
 from hud.utils.design import HUDDesign
@@ -127,6 +126,9 @@ def push_environment(
     design = HUDDesign()
     design.header("HUD Environment Push")
+    # Import settings lazily after any environment setup
+    from hud.settings import settings
     # Find hud.lock.yaml in specified directory
     env_dir = Path(directory)
     lock_path = env_dir / "hud.lock.yaml"

hud/cli/tests/test_push.py CHANGED Viewed

@@ -123,7 +123,7 @@ class TestPushEnvironment:
         mock_design.error.assert_called()
     @mock.patch("hud.cli.push.HUDDesign")
-    @mock.patch("hud.cli.push.settings")
+    @mock.patch("hud.settings.settings")
     def test_push_no_api_key(self, mock_settings, mock_design_class, tmp_path):
         """Test pushing without API key."""
         mock_design = mock.Mock()
@@ -143,7 +143,7 @@ class TestPushEnvironment:
     @mock.patch("subprocess.Popen")
     @mock.patch("subprocess.run")
     @mock.patch("hud.cli.push.get_docker_username")
-    @mock.patch("hud.cli.push.settings")
+    @mock.patch("hud.settings.settings")
     @mock.patch("hud.cli.push.HUDDesign")
     def test_push_auto_detect_username(
         self,
@@ -205,7 +205,7 @@ class TestPushEnvironment:
         assert "testuser/image%3A0.1.0" in call_args[0][0]
     @mock.patch("subprocess.run")
-    @mock.patch("hud.cli.push.settings")
+    @mock.patch("hud.settings.settings")
     @mock.patch("hud.cli.push.HUDDesign")
     def test_push_explicit_image(self, mock_design_class, mock_settings, mock_run, tmp_path):
         """Test pushing with explicit image name."""
@@ -226,7 +226,7 @@ class TestPushEnvironment:
     @mock.patch("subprocess.Popen")
     @mock.patch("subprocess.run")
-    @mock.patch("hud.cli.push.settings")
+    @mock.patch("hud.settings.settings")
     @mock.patch("hud.cli.push.HUDDesign")
     def test_push_with_tag(self, mock_design_class, mock_settings, mock_run, mock_popen, tmp_path):
         """Test pushing with explicit tag."""
@@ -282,7 +282,7 @@ class TestPushEnvironment:
         mock_process.returncode = 1
         mock_popen.return_value = mock_process
-        with mock.patch("hud.cli.push.settings") as mock_settings:
+        with mock.patch("hud.settings.settings") as mock_settings:
             mock_settings.api_key = "test-key"
             with (
                 mock.patch("subprocess.run"),
@@ -292,7 +292,7 @@ class TestPushEnvironment:
     @mock.patch("hud.cli.push.get_docker_image_labels")
     @mock.patch("subprocess.run")
-    @mock.patch("hud.cli.push.settings")
+    @mock.patch("hud.settings.settings")
     @mock.patch("hud.cli.push.HUDDesign")
     def test_push_with_labels(
         self, mock_design_class, mock_settings, mock_run, mock_get_labels, tmp_path

hud/cli/utils/interactive.py CHANGED Viewed

@@ -74,7 +74,7 @@ class InteractiveMCPTester:
         for tool in self.tools:
             if "/" in tool.name:
-                hub, name = tool.name.split("/", 1)
+                hub, _ = tool.name.split("/", 1)
                 if hub not in hub_tools:
                     hub_tools[hub] = []
                 hub_tools[hub].append(tool)

hud/clients/__init__.py CHANGED Viewed

@@ -4,9 +4,10 @@ from __future__ import annotations
 from .base import AgentMCPClient, BaseHUDClient
 from .fastmcp import FastMCPHUDClient
+from .mcp_use import MCPUseHUDClient
-# Default to FastMCP for new features
-MCPClient = FastMCPHUDClient
+# Default to MCP-use for new features
+MCPClient = MCPUseHUDClient
 __all__ = [
     "AgentMCPClient",

hud/clients/base.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, Protocol, overload, runtime_checkable
 from mcp.types import Implementation
+from hud.shared.exceptions import HudAuthenticationError, HudException
 from hud.types import MCPToolCall, MCPToolResult
 from hud.utils.mcp import setup_hud_telemetry
 from hud.version import __version__ as hud_version
@@ -120,8 +121,10 @@ class BaseHUDClient(AgentMCPClient):
         self._mcp_config = mcp_config or self._mcp_config
         if self._mcp_config is None:
-            raise ValueError(
-                "An MCP server configuration is required"
+            from hud.shared.exceptions import HudConfigError
+            raise HudConfigError(
+                "An MCP server configuration is required. "
                 "Either pass it to the constructor or call initialize with a configuration"
             )
@@ -135,14 +138,18 @@ class BaseHUDClient(AgentMCPClient):
                 url = server_config.get("url", "")
                 headers = server_config.get("headers", {})
                 if "mcp.hud.so" in url and len(headers.get("Authorization", "")) < 10:
-                    raise RuntimeError(
-                        "Please ensure your HUD_API_KEY environment variable is set correctly."
-                        "You can get an API key at https://app.hud.so"
+                    raise HudAuthenticationError(
+                        f'Sending authorization "{headers.get("Authorization", "")}", which may'
+                        " be incomplete. Ensure HUD_API_KEY environment variable is set or send it"
+                        " as a header. You can get an API key at https://app.hud.so"
                     )
             # Subclasses implement connection
             await self._connect(self._mcp_config)
+        except HudException:
+            raise
         except Exception as e:
-            raise e
+            # Auto-converts to appropriate HUD exception type with hints
+            raise HudException from e
         # Common hud behavior - fetch telemetry
         await self._fetch_telemetry()
@@ -168,7 +175,7 @@ class BaseHUDClient(AgentMCPClient):
             self._initialized = False
             logger.info("Client disconnected")
         else:
-            logger.warning("Client is not running, cannot disconnect")
+            logger.debug("Client was not initialized, skipping disconnect")
     @overload
     async def call_tool(self, tool_call: MCPToolCall, /) -> MCPToolResult: ...
@@ -236,7 +243,9 @@ class BaseHUDClient(AgentMCPClient):
     def mcp_config(self) -> dict[str, dict[str, Any]]:
         """Get the MCP config."""
         if self._mcp_config is None:
-            raise ValueError("Please initialize the client with a valid MCP config")
+            from hud.shared.exceptions import HudConfigError
+            raise HudConfigError("Please initialize the client with a valid MCP config")
         return self._mcp_config
     async def __aenter__(self: Any) -> Any:
@@ -305,7 +314,9 @@ class BaseHUDClient(AgentMCPClient):
             - metadata: Environment metadata
         """
         if not self._initialized:
-            raise ValueError("Client must be initialized before analyzing the environment")
+            from hud.shared.exceptions import HudClientError
+            raise HudClientError("Client must be initialized before analyzing the environment")
         analysis: dict[str, Any] = {
             "tools": [],

hud/clients/mcp_use.py CHANGED Viewed

@@ -3,10 +3,12 @@
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any
+from typing import Any
-from mcp import Implementation
+from mcp import Implementation, types
 from mcp.shared.exceptions import McpError
+from mcp_use.client import MCPClient as MCPUseClient
+from mcp_use.session import MCPSession as MCPUseSession
 from pydantic import AnyUrl
 from hud.types import MCPToolCall, MCPToolResult
@@ -14,18 +16,6 @@ from hud.version import __version__ as hud_version
 from .base import BaseHUDClient
-if TYPE_CHECKING:
-    from mcp import types
-    from mcp_use.client import MCPClient as MCPUseClient  # type: ignore[attr-defined]
-    from mcp_use.session import MCPSession as MCPUseSession  # type: ignore[attr-defined]
-try:
-    from mcp_use.client import MCPClient as MCPUseClient  # type: ignore[attr-defined]
-    from mcp_use.session import MCPSession as MCPUseSession  # type: ignore[attr-defined]
-except ImportError:
-    MCPUseClient = None  # type: ignore[misc, assignment]
-    MCPUseSession = None  # type: ignore[misc, assignment]
 logger = logging.getLogger(__name__)
@@ -53,7 +43,9 @@ class MCPUseHUDClient(BaseHUDClient):
             )
         self._sessions: dict[str, Any] = {}  # Will be MCPUseSession when available
-        self._tool_map: dict[str, tuple[str, types.Tool]] = {}
+        self._tool_map: dict[
+            str, tuple[str, types.Tool, types.Tool]
+        ] = {}  # server_name, original_tool, prefixed_tool
         self._client: Any | None = None  # Will be MCPUseClient when available
     async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
@@ -106,14 +98,23 @@ class MCPUseHUDClient(BaseHUDClient):
                 logger.info("Check that the MCP server is running and accessible")
             raise
+        # Populate tool map during initialization
+        await self.list_tools()
     async def list_tools(self) -> list[types.Tool]:
         """List all available tools from all sessions."""
         if self._client is None or not self._sessions:
             raise ValueError("Client is not connected, call initialize() first")
+        if self._tool_map:
+            return [tool[2] for tool in self._tool_map.values()]
         all_tools = []
         self._tool_map = {}
+        # Check if we need to prefix (more than one server)
+        use_prefix = len(self._sessions) > 1
         for server_name, session in self._sessions.items():
             try:
                 # Ensure session is initialized
@@ -136,10 +137,26 @@ class MCPUseHUDClient(BaseHUDClient):
                     [tool.name for tool in tools_result.tools],
                 )
-                # Add to collections
+                # Add to collections with optional prefix
                 for tool in tools_result.tools:
-                    all_tools.append(tool)
-                    self._tool_map[tool.name] = (server_name, tool)
+                    if use_prefix:
+                        # Create a new tool with prefixed name
+                        prefixed_name = f"{server_name}_{tool.name}"
+                        # Create a new tool instance with prefixed name
+                        from mcp import types as mcp_types
+                        prefixed_tool = mcp_types.Tool(
+                            name=prefixed_name,
+                            description=tool.description,
+                            inputSchema=tool.inputSchema,
+                        )
+                        all_tools.append(prefixed_tool)
+                        # Map prefixed name to (server_name, original_tool)
+                        self._tool_map[prefixed_name] = (server_name, tool, prefixed_tool)
+                    else:
+                        # Single server - no prefix needed
+                        all_tools.append(tool)
+                        self._tool_map[tool.name] = (server_name, tool, tool)
                 # Log detailed tool info in verbose mode
                 if self.verbose:
@@ -164,15 +181,20 @@ class MCPUseHUDClient(BaseHUDClient):
             raise ValueError("Client is not connected, call initialize() first")
         if tool_call.name not in self._tool_map:
-            raise ValueError(f"Tool '{tool_call.name}' not found")
+            return MCPToolResult(
+                content=[types.TextContent(type="text", text=f"Tool '{tool_call.name}' not found")],
+                isError=True,
+                structuredContent=None,
+            )
-        server_name, _ = self._tool_map[tool_call.name]
+        server_name, original_tool, _ = self._tool_map[tool_call.name]
         session = self._sessions[server_name]
         if self.verbose:
             logger.debug(
-                "Calling tool '%s' on server '%s' with arguments: %s",
+                "Calling tool '%s' (original: '%s') on server '%s' with arguments: %s",
                 tool_call.name,
+                original_tool.name,
                 server_name,
                 tool_call.arguments,
             )
@@ -181,7 +203,7 @@ class MCPUseHUDClient(BaseHUDClient):
             raise ValueError(f"Client session not initialized for {server_name}")
         result = await session.connector.client_session.call_tool(
-            name=tool_call.name,
+            name=original_tool.name,  # Use original tool name, not prefixed
             arguments=tool_call.arguments or {},
         )

hud/datasets/task.py CHANGED Viewed

@@ -51,7 +51,9 @@ class Task(BaseModel):
             try:
                 return json.loads(v)
             except json.JSONDecodeError as e:
-                raise ValueError(f"Invalid JSON string: {e}") from e
+                from hud.shared.exceptions import HudConfigError
+                raise HudConfigError(f"Invalid JSON string: {e}") from e
         return v
     @field_validator("setup_tool", "evaluate_tool", mode="before")
@@ -66,7 +68,9 @@ class Task(BaseModel):
             try:
                 v = json.loads(v)
             except json.JSONDecodeError as e:
-                raise ValueError(f"Invalid JSON string: {e}") from e
+                from hud.shared.exceptions import HudConfigError
+                raise HudConfigError(f"Invalid JSON string: {e}") from e
         if isinstance(v, dict):
             return MCPToolCall(**v)

hud/native/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Native Python MCP servers for HUD.
+These servers run as pure Python processes without containerization.
+They can be run standalone or mounted into other servers, providing
+lightweight evaluation and comparison capabilities.
+"""

hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl