PyPI - hud-python - Versions diffs - 0.5.31__tar.gz → 0.5.33__tar.gz - Mend

hud-python 0.5.31__tar.gz → 0.5.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (335) hide show

{hud_python-0.5.31 → hud_python-0.5.33}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.5.31
+Version: 0.5.33
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.5.31 → hud_python-0.5.33}/hud/agents/claude.py RENAMED Viewed

@@ -445,7 +445,9 @@ class ClaudeAgent(MCPAgent):
                                 text_document_block(content.text, title=tool_call.name)
                             )
                     elif isinstance(content, types.ImageContent):
-                        claude_blocks.append(base64_to_content_block(content.data))
+                        claude_blocks.append(
+                            base64_to_content_block(content.data, content.mimeType)
+                        )
                     elif isinstance(content, types.EmbeddedResource):
                         resource = content.resource
                         if (
@@ -683,13 +685,19 @@ class ClaudeAgent(MCPAgent):
         return messages_cached
-def base64_to_content_block(base64: str) -> BetaImageBlockParam:
+def base64_to_content_block(
+    base64: str,
+    media_type: str = "image/png",
+) -> BetaImageBlockParam:
     """Convert base64 image to Claude content block."""
     return BetaImageBlockParam(
         type="image",
         source=BetaBase64ImageSourceParam(
             type="base64",
-            media_type="image/png",
+            media_type=cast(
+                "Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']",
+                media_type,
+            ),
             data=base64,
         ),
     )

{hud_python-0.5.31 → hud_python-0.5.33}/hud/cli/tests/test_build.py RENAMED Viewed

@@ -61,12 +61,12 @@ class TestIncrementVersion:
     def test_increment_minor(self):
         """Test incrementing minor version."""
         assert increment_version("1.2.3", "minor") == "1.3.0"
-        assert increment_version("0.5.31", "minor") == "0.6.0"
+        assert increment_version("0.5.33", "minor") == "0.6.0"
     def test_increment_major(self):
         """Test incrementing major version."""
         assert increment_version("1.2.3", "major") == "2.0.0"
-        assert increment_version("0.5.31", "major") == "1.0.0"
+        assert increment_version("0.5.33", "major") == "1.0.0"
     def test_increment_with_v_prefix(self):
         """Test incrementing version with v prefix."""

{hud_python-0.5.31 → hud_python-0.5.33}/hud/datasets/loader.py RENAMED Viewed

@@ -70,15 +70,15 @@ def _load_from_file(path: Path) -> list[Task]:
     return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
-def resolve_taskset_id(slug: str) -> str:
-    """Resolve a taskset slug/name to its UUID via the HUD API."""
+def resolve_taskset_id(name: str) -> str:
+    """Resolve a taskset name to its UUID via the HUD API."""
     headers = {}
     if settings.api_key:
         headers["Authorization"] = f"Bearer {settings.api_key}"
     with httpx.Client() as client:
         response = client.get(
-            f"{settings.hud_api_url}/tasks/evalset/{slug}",
+            f"{settings.hud_api_url}/tasks/evalset/{name}",
             headers=headers,
         )
         response.raise_for_status()
@@ -86,7 +86,7 @@ def resolve_taskset_id(slug: str) -> str:
     evalset_id = data.get("evalset_id")
     if not evalset_id:
-        raise ValueError(f"Could not resolve taskset '{slug}' — not found or no access")
+        raise ValueError(f"Could not resolve taskset '{name}' — not found or no access")
     return evalset_id
@@ -146,14 +146,14 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
     Supports multiple sources with auto-detection:
     - Local file path (JSON or JSONL)
-    - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+    - HUD API evalset name (e.g., "SheetBench-50")
     Automatically detects and converts v4 LegacyTask format to v5 Task.
     Args:
         source: Task source. Can be:
             - Path to a local JSON/JSONL file
-            - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+            - HUD API evalset name (e.g., "SheetBench-50")
         raw: If True, return raw dicts without validation or env var substitution.
             Useful for preserving template strings like "${HUD_API_KEY}".
@@ -193,8 +193,7 @@ def save_tasks(
     Creates or updates a taskset with the given tasks.
     Args:
-        name: Taskset name/slug (e.g., "my-evals/benchmark-v1").
-            If no org prefix, uses user's default org.
+        name: Evalset name (e.g., "benchmark-v1").
         tasks: List of Task objects (v5 format) to save.
     Returns:
@@ -214,10 +213,10 @@ def save_tasks(
         ]
         # Save to HUD API
-        taskset_id = save_tasks("my-evals/benchmark-v1", tasks)
+        taskset_id = save_tasks("benchmark-v1", tasks)
         # Later, load them back
-        loaded = load_tasks("my-evals/benchmark-v1")
+        loaded = load_tasks("benchmark-v1")
         ```
     Raises:

{hud_python-0.5.31 → hud_python-0.5.33}/hud/datasets/tests/test_loader.py RENAMED Viewed

@@ -50,7 +50,7 @@ class TestLoadTasks:
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
-        tasks = load_tasks("test-org/test-dataset")
+        tasks = load_tasks("test-dataset")
         assert len(tasks) == 2
         # Tasks are keyed by ID in dict, order may vary
@@ -61,7 +61,7 @@ class TestLoadTasks:
         # Platform IDs are internal and should not be inferred from dict keys
         assert all(t.id is None for t in tasks)
         mock_client.get.assert_called_once_with(
-            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
+            "https://api.hud.ai/tasks/evalset/test-dataset",
             headers={"Authorization": "Bearer test_key"},
             params={"all": "true"},
         )
@@ -96,7 +96,7 @@ class TestLoadTasks:
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
-        tasks = load_tasks("test-org/test-dataset")
+        tasks = load_tasks("test-dataset")
         assert len(tasks) == 1
         assert tasks[0].scenario == "checkout"
@@ -126,11 +126,11 @@ class TestLoadTasks:
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
-        tasks = load_tasks("test-org/test-dataset")
+        tasks = load_tasks("test-dataset")
         assert len(tasks) == 0
         mock_client.get.assert_called_once_with(
-            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
+            "https://api.hud.ai/tasks/evalset/test-dataset",
             headers={},
             params={"all": "true"},
         )
@@ -198,7 +198,7 @@ class TestLoadTasks:
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
-        tasks = load_tasks("test-org/test-dataset")
+        tasks = load_tasks("test-dataset")
         assert len(tasks) == 0
@@ -223,7 +223,7 @@ class TestLoadTasks:
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
-        tasks = load_tasks("test-org/test-dataset")
+        tasks = load_tasks("test-dataset")
         assert len(tasks) == 1
         assert tasks[0].scenario == "test"
@@ -259,7 +259,7 @@ class TestSaveTasks:
         mock_client_class.return_value = mock_client
         taskset_id = save_tasks(
-            "test-org/test-dataset",
+            "test-dataset",
             [
                 Task(
                     env={"name": "test-env"},
@@ -276,6 +276,6 @@ class TestSaveTasks:
         call_args = mock_client.post.call_args
         assert call_args.args[0] == "https://api.hud.ai/tasks/upload"
         payload = call_args.kwargs["json"]
-        assert payload["name"] == "test-org/test-dataset"
+        assert payload["name"] == "test-dataset"
         assert payload["tasks"][0]["slug"] == "checkout-smoke"
         assert "id" not in payload["tasks"][0]

{hud_python-0.5.31 → hud_python-0.5.33}/hud/environment/connection.py RENAMED Viewed

@@ -159,6 +159,9 @@ class Connector:
             "transport": self._transport,
             "auth": self._auth,
         }
+        client_timeout = getattr(self._transport, "_hud_client_timeout", None)
+        if client_timeout is not None:
+            client_kwargs["timeout"] = client_timeout
         if self._elicitation_handler is not None:
             client_kwargs["elicitation_handler"] = self._elicitation_handler

{hud_python-0.5.31 → hud_python-0.5.33}/hud/environment/connectors/mcp_config.py RENAMED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 from hud.environment.connectors.base import BaseConnectorMixin
@@ -66,8 +66,7 @@ class MCPConfigConnectorMixin(BaseConnectorMixin):
                 if settings.client_timeout > 0
                 else min(request_timeout, settings.__class__.model_fields["client_timeout"].default)
             )
-            server_config.setdefault("sse_read_timeout", timeout)
-            transport = _build_transport(server_config)
+            transport = _build_transport(server_config, timeout=timeout)
         return self._add_connection(
             name,
@@ -121,17 +120,29 @@ class MCPConfigConnectorMixin(BaseConnectorMixin):
         return self
-def _build_transport(server_config: dict[str, Any]) -> Any:
+def _build_transport(server_config: dict[str, Any], *, timeout: float | None = None) -> Any:
     from fastmcp.client.transports import SSETransport, StreamableHttpTransport
     from fastmcp.mcp_config import infer_transport_type_from_url
     url = server_config["url"]
     transport_type = server_config.get("transport") or infer_transport_type_from_url(url)
-    transport_cls = SSETransport if transport_type == "sse" else StreamableHttpTransport
-    return transport_cls(
-        url=url,
-        headers=server_config.get("headers"),
-        auth=server_config.get("auth"),
-        sse_read_timeout=server_config.get("sse_read_timeout"),
-    )
+    transport_timeout = timeout if timeout is not None else server_config.get("sse_read_timeout")
+    transport_kwargs = {
+        "url": url,
+        "headers": server_config.get("headers"),
+        "auth": server_config.get("auth"),
+        "httpx_client_factory": server_config.get("httpx_client_factory"),
+    }
+    if transport_type == "sse":
+        return SSETransport(
+            **transport_kwargs,
+            sse_read_timeout=transport_timeout,
+        )
+    transport = StreamableHttpTransport(**transport_kwargs)
+    if transport_timeout is not None:
+        # FastMCP 3.x wants streamable HTTP timeouts on the client/session,
+        # not on the transport constructor.
+        cast("Any", transport)._hud_client_timeout = transport_timeout
+    return transport

{hud_python-0.5.31 → hud_python-0.5.33}/hud/environment/tests/test_connection.py RENAMED Viewed

@@ -140,6 +140,35 @@ class TestConnector:
             # Client is now set
             assert connector.client is mock_client
+    @pytest.mark.asyncio
+    async def test_connect_passes_transport_timeout_to_client(self) -> None:
+        """connect() forwards transport timeout to FastMCP client session kwargs."""
+        class Transport:
+            _hud_client_timeout = 300
+        transport = Transport()
+        connector = Connector(
+            transport=transport,
+            config=ConnectionConfig(),
+            name="test",
+            connection_type=ConnectionType.REMOTE,
+            auth="test-token",
+        )
+        mock_client = MagicMock()
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.is_connected = MagicMock(return_value=True)
+        with patch("fastmcp.client.Client", return_value=mock_client) as mock_cls:
+            await connector.connect()
+            mock_cls.assert_called_once_with(
+                transport=transport,
+                auth="test-token",
+                timeout=300,
+            )
     @pytest.mark.asyncio
     async def test_disconnect_clears_client(self) -> None:
         """disconnect() closes client and clears state."""

{hud_python-0.5.31 → hud_python-0.5.33}/hud/environment/tests/test_connectors.py RENAMED Viewed

@@ -197,7 +197,7 @@ class TestRemoteConnectorMixin:
         env = TestEnv()
         with patch("hud.settings.settings", spec=Settings) as mock_settings:
             mock_settings.hud_mcp_url = "https://mcp.hud.ai"
-            mock_settings.client_timeout = 300  # Used in connect_mcp for sse_read_timeout
+            mock_settings.client_timeout = 300  # Used in connect_mcp transport timeout logic
             env.connect_hub("browser")
@@ -205,3 +205,45 @@ class TestRemoteConnectorMixin:
         assert "hud" in env._connections
         # Verify hub config is stored for serialization
         assert env._hub_config == {"name": "browser"}
+    def test_connect_mcp_streamable_transport_uses_client_timeout(self) -> None:
+        """Streamable HTTP uses FastMCP client timeout instead of deprecated transport arg."""
+        from fastmcp.client.transports import StreamableHttpTransport
+        from hud.environment.connectors.mcp_config import MCPConfigConnectorMixin
+        from hud.settings import Settings
+        class TestEnv(MCPConfigConnectorMixin):
+            def __init__(self) -> None:
+                self._connections: dict[str, Connector] = {}
+        env = TestEnv()
+        with patch("hud.settings.settings", spec=Settings) as mock_settings:
+            mock_settings.client_timeout = 300
+            env.connect_mcp({"browser": {"url": "https://mcp.hud.ai/browser"}})
+        transport = env._connections["browser"]._transport
+        assert isinstance(transport, StreamableHttpTransport)
+        assert transport.sse_read_timeout is None
+        assert getattr(transport, "_hud_client_timeout", None) == 300
+    def test_connect_mcp_sse_transport_keeps_sse_timeout(self) -> None:
+        """SSE transports should continue to receive sse_read_timeout directly."""
+        from fastmcp.client.transports import SSETransport
+        from hud.environment.connectors.mcp_config import MCPConfigConnectorMixin
+        from hud.settings import Settings
+        class TestEnv(MCPConfigConnectorMixin):
+            def __init__(self) -> None:
+                self._connections: dict[str, Connector] = {}
+        env = TestEnv()
+        with patch("hud.settings.settings", spec=Settings) as mock_settings:
+            mock_settings.client_timeout = 300
+            env.connect_mcp({"browser": {"url": "https://mcp.hud.ai/browser", "transport": "sse"}})
+        transport = env._connections["browser"]._transport
+        assert isinstance(transport, SSETransport)
+        assert transport.sse_read_timeout is not None
+        assert transport.sse_read_timeout.total_seconds() == 300

{hud_python-0.5.31 → hud_python-0.5.33}/hud/eval/context.py RENAMED Viewed

@@ -723,21 +723,18 @@ class EvalContext(Environment):
         return False
     # =========================================================================
-    # Tool Call Instrumentation
+    # MCP Telemetry Instrumentation
     # =========================================================================
-    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
-        """Execute a tool with automatic telemetry recording.
+    def _should_instrument(self) -> bool:
+        """Whether local MCP instrumentation should be applied.
-        Overrides Environment._execute_tool to record MCP spans for the eval context.
-        Instrumentation is disabled when connected to a remote HUD server (telemetry is
-        recorded server-side in that case).
+        Returns False when telemetry is handled server-side (remote hub or HUD MCP).
         """
-        # Skip instrumentation when connected to a remote hub - telemetry is handled server-side
+        if not self._trace_enabled:
+            return False
         if self._hub_config is not None:
-            return await super()._execute_tool(name, arguments)
-        # Skip instrumentation for v4 tasks with HUD MCP config (remote server)
+            return False
         if self._mcp_config is not None:
             from hud.utils.mcp import _is_hud_server
@@ -745,18 +742,47 @@ class EvalContext(Environment):
                 if isinstance(server_cfg, dict):
                     url = server_cfg.get("url", "")
                     if url and _is_hud_server(url):
-                        return await super()._execute_tool(name, arguments)
+                        return False
+        return True
-        # For local environments, record MCP spans
+    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+        if not self._should_instrument():
+            return await super()._execute_tool(name, arguments)
         return await self._execute_tool_instrumented(name, arguments)
-    @instrument(category="mcp")
+    @instrument(method="tools/call")
     async def _execute_tool_instrumented(
         self, name: str, arguments: dict[str, Any]
     ) -> MCPToolResult:
-        """Instrumented version of _execute_tool for local environments."""
         return await super()._execute_tool(name, arguments)
+    async def run_scenario_setup(
+        self,
+        scenario_name: str,
+        args: dict[str, Any],
+        session_id: str | None = None,
+    ) -> str | None:
+        if not self._should_instrument():
+            return await super().run_scenario_setup(scenario_name, args, session_id)
+        return await self._run_setup_instrumented(scenario_name, args)
+    @instrument(method="prompts/get")
+    async def _run_setup_instrumented(self, name: str, arguments: dict[str, Any]) -> str | None:
+        return await super().run_scenario_setup(name, arguments)
+    async def run_scenario_evaluate(
+        self,
+        scenario_name: str,
+        session_id: str | None = None,
+    ) -> EvaluationResult:
+        if not self._should_instrument():
+            return await super().run_scenario_evaluate(scenario_name, session_id)
+        return await self._run_evaluate_instrumented(scenario_name)
+    @instrument(method="resources/read")
+    async def _run_evaluate_instrumented(self, uri: str) -> EvaluationResult:
+        return await super().run_scenario_evaluate(uri)
     def __repr__(self) -> str:
         return f"EvalContext({self.trace_id[:8]}..., name={self.eval_name!r}, reward={self.reward})"

{hud_python-0.5.31 → hud_python-0.5.33}/hud/telemetry/instrument.py RENAMED Viewed

@@ -100,6 +100,7 @@ def instrument(
     name: str | None = None,
     category: str = "function",
     span_type: str | None = None,
+    method: str | None = None,
     internal_type: str | None = None,
     record_args: bool = True,
     record_result: bool = True,
@@ -113,6 +114,7 @@ def instrument(
     name: str | None = None,
     category: str = "function",
     span_type: str | None = None,
+    method: str | None = None,
     internal_type: str | None = None,
     record_args: bool = True,
     record_result: bool = True,
@@ -126,6 +128,7 @@ def instrument(
     name: str | None = None,
     category: str = "function",
     span_type: str | None = None,
+    method: str | None = None,
     internal_type: str | None = None,
     record_args: bool = True,
     record_result: bool = True,
@@ -138,6 +141,7 @@ def instrument(
     name: str | None = None,
     category: str = "function",
     span_type: str | None = None,
+    method: str | None = None,
     internal_type: str | None = None,
     record_args: bool = True,
     record_result: bool = True,
@@ -151,6 +155,10 @@ def instrument(
         name: Custom span name (defaults to module.function)
         category: Span category (e.g., "agent", "tool", "function", "mcp")
         span_type: Alias for category (deprecated, use category instead)
+        method: MCP method name (e.g., "tools/call", "resources/read").
+            When set, produces MCP spans: name becomes "{method}.mcp",
+            type becomes "SERVER", and request is structured as
+            {"method": ..., "params": ...}.
         internal_type: Internal span type (e.g., "user-message")
         record_args: Whether to record function arguments
         record_result: Whether to record function result
@@ -168,6 +176,7 @@ def instrument(
             return await model.generate(messages)
     """
     effective_category = span_type if span_type is not None else category
+    effective_method = method
     def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
         if hasattr(func, "_hud_instrumented"):
@@ -193,13 +202,19 @@ def instrument(
             error: str | None = None,
         ) -> dict[str, Any]:
             """Build a HudSpan-compatible span record."""
-            # Build attributes using TraceStep
+            is_mcp = effective_method is not None
+            extra_attrs: dict[str, Any] = {}
+            if is_mcp:
+                extra_attrs["method_name"] = effective_method
             attributes = TraceStep(
                 task_run_id=task_run_id,
-                category=effective_category,
-                type="CLIENT",
+                category="mcp" if is_mcp else effective_category,
+                type="SERVER" if is_mcp else "CLIENT",
                 start_timestamp=start_time,
                 end_timestamp=end_time,
+                **extra_attrs,
             )
             # Record arguments as request
@@ -213,21 +228,50 @@ def instrument(
                         if k not in ("self", "cls")
                     }
                     if args_dict:
-                        attributes.request = args_dict
+                        if is_mcp:
+                            attributes.request = {
+                                "method": effective_method,
+                                "params": args_dict,
+                            }
+                        else:
+                            attributes.request = args_dict
                 except Exception as e:
                     logger.debug("Failed to serialize args: %s", e)
             # Record result
             if record_result and result is not None and error is None:
                 try:
-                    attributes.result = _serialize_value(result)
+                    serialized = _serialize_value(result)
+                    if is_mcp and effective_method == "prompts/get":
+                        if isinstance(serialized, str):
+                            serialized = {
+                                "messages": [
+                                    {
+                                        "role": "user",
+                                        "content": {
+                                            "type": "text",
+                                            "text": serialized,
+                                        },
+                                    }
+                                ]
+                            }
+                    elif is_mcp and effective_method == "resources/read":
+                        if isinstance(serialized, list):
+                            serialized = {"contents": serialized}
+                        elif isinstance(serialized, dict) and "reward" in serialized:
+                            uri = args_dict.get("uri", "") if args_dict else ""
+                            serialized = {
+                                "contents": [{"uri": uri, "text": json.dumps(serialized)}]
+                            }
+                    attributes.result = serialized
                 except Exception as e:
                     logger.debug("Failed to serialize result: %s", e)
             # Build span
             span_id = uuid.uuid4().hex[:16]
+            effective_name = f"{effective_method}.mcp" if is_mcp else span_name
             span: dict[str, Any] = {
-                "name": span_name,
+                "name": effective_name,
                 "trace_id": _normalize_trace_id(task_run_id),
                 "span_id": span_id,
                 "parent_span_id": None,

hud-python 0.5.31__tar.gz → 0.5.33__tar.gz

hud-python 0.5.31__tar.gz → 0.5.33__tar.gz