PyPI - hud-python - Versions diffs - 0.5.25__tar.gz → 0.5.27__tar.gz - Mend

hud-python 0.5.25__tar.gz → 0.5.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (347) hide show

{hud_python-0.5.25 → hud_python-0.5.27}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.5.25
+Version: 0.5.27
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/claude.py RENAMED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import copy
+import json
 import logging
 from inspect import cleandoc
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
@@ -85,7 +86,12 @@ class ClaudeAgent(MCPAgent):
             logger.debug("Legacy fallback: detected %s as computer tool", tool.name)
             model_lower = (self.model or "").lower()
             if any(
-                fnmatch.fnmatch(model_lower, p) for p in ("claude-opus-4-5*", "claude-opus-4-6*")
+                fnmatch.fnmatch(model_lower, p)
+                for p in (
+                    "claude-opus-4-5*",
+                    "claude-opus-4-6*",
+                    "claude-sonnet-4-6*",
+                )
             ):
                 return NativeToolSpec(
                     api_type="computer_20251124",
@@ -149,15 +155,15 @@ class ClaudeAgent(MCPAgent):
         # these will be initialized in _convert_tools_for_claude
         self.has_computer_tool = False
-        self.tool_mapping: dict[str, str] = {}
-        self.claude_tools: list[BetaToolUnionParam] = []
-        self._required_betas: set[str] = set()
+        self.tool_mapping = {}
+        self.claude_tools = []
+        self._required_betas = set()
     def _on_tools_ready(self) -> None:
         """Build Claude-specific tool mappings after tools are discovered."""
         self._convert_tools_for_claude()
-    async def get_system_messages(self) -> list[BetaMessageParam]:
+    async def get_system_messages(self) -> list[types.ContentBlock]:
         """No system messages for Claude because applied in get_response"""
         return []
@@ -195,10 +201,42 @@ class ClaudeAgent(MCPAgent):
         return [BetaMessageParam(role="user", content=anthropic_blocks)]
+    @staticmethod
+    def _extract_invalid_tool_json(exc: Exception) -> str | None:
+        """Extract malformed tool JSON payload from Anthropic stream errors.
+        Returns None when the exception is unrelated to tool JSON parsing.
+        """
+        message = str(exc)
+        parse_error_prefix = "Unable to parse tool parameter JSON from model."
+        if parse_error_prefix not in message:
+            return None
+        marker = "JSON: "
+        marker_index = message.find(marker)
+        if marker_index == -1:
+            return ""
+        return message[marker_index + len(marker) :].strip()
+    @staticmethod
+    def _build_invalid_tool_json_retry_message(invalid_json: str) -> BetaMessageParam:
+        """Build a user message prompting the model to re-emit valid tool JSON."""
+        wrapped = json.dumps({"INVALID_JSON": invalid_json}, ensure_ascii=True)
+        retry_text = (
+            "Your previous tool-call arguments were invalid JSON and could not be parsed.\n"
+            "Retry the same intended tool call once with valid JSON arguments only.\n"
+            "Ensure all strings are quoted and all arrays/objects are valid JSON.\n"
+            f"Malformed payload (wrapped): {wrapped}"
+        )
+        return BetaMessageParam(
+            role="user",
+            content=[text_to_content_block(retry_text)],
+        )
     async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
         """Get response from Claude including any tool calls."""
         messages_cached = self._add_prompt_caching(messages)
         # betas to use - collected during tool conversion based on native specs
         # Only pass betas when non-empty; an empty list can produce an empty
         # anthropic-beta header which the API rejects.
@@ -223,21 +261,58 @@ class ClaudeAgent(MCPAgent):
                 ) from None
         else:
             # Regular Anthropic client supports .stream()
-            async with self.anthropic_client.beta.messages.stream(
-                model=self.config.model,
-                system=self.system_prompt if self.system_prompt is not None else Omit(),
-                max_tokens=self.max_tokens,
-                messages=messages_cached,
-                tools=self.claude_tools,
-                tool_choice={"type": "auto", "disable_parallel_tool_use": True},
-                betas=betas,
-            ) as stream:
-                # allow backend to accumulate message content
-                async for _ in stream:
-                    pass
-                # get final message
-                response = await stream.get_final_message()
-                messages.append(BetaMessageParam(role="assistant", content=response.content))
+            response = None
+            invalid_json_failures = 0
+            for _ in range(3):
+                messages_cached = self._add_prompt_caching(messages)
+                try:
+                    async with self.anthropic_client.beta.messages.stream(
+                        model=self.config.model,
+                        system=self.system_prompt if self.system_prompt is not None else Omit(),
+                        max_tokens=self.max_tokens,
+                        messages=messages_cached,
+                        tools=self.claude_tools,
+                        tool_choice={"type": "auto", "disable_parallel_tool_use": True},
+                        betas=betas,
+                    ) as stream:
+                        # allow backend to accumulate message content
+                        async for _ in stream:
+                            pass
+                        # get final message
+                        response = await stream.get_final_message()
+                        messages.append(
+                            BetaMessageParam(
+                                role="assistant",
+                                content=response.content,
+                            )
+                        )
+                        break
+                except ValueError as exc:
+                    invalid_json = self._extract_invalid_tool_json(exc)
+                    is_retryable = invalid_json is not None
+                    if not is_retryable:
+                        raise
+                    invalid_json_failures += 1
+                    if invalid_json_failures == 1:
+                        logger.warning(
+                            "Claude returned invalid streamed tool JSON; "
+                            "retrying same generation once"
+                        )
+                        continue
+                    if invalid_json_failures == 2:
+                        logger.warning(
+                            "Claude returned invalid streamed tool JSON twice; "
+                            "retrying once with INVALID_JSON guidance"
+                        )
+                        messages.append(self._build_invalid_tool_json_retry_message(invalid_json))
+                        continue
+                    raise
+            if response is None:
+                raise ValueError("Claude response missing after stream retries")
         # Process response
         result = AgentResponse(content="", tool_calls=[], done=True)

{hud_python-0.5.25 → hud_python-0.5.27}/hud/agents/tests/test_claude.py RENAMED Viewed

@@ -99,6 +99,30 @@ class MockStreamContextManager:
         return self.response
+class MockErrorStreamContextManager:
+    """Mock stream context manager that raises a fixed error while streaming."""
+    def __init__(self, error: Exception) -> None:
+        self.error = error
+    async def __aenter__(self) -> MockErrorStreamContextManager:
+        return self
+    async def __aexit__(
+        self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
+    ) -> bool:
+        return False
+    def __aiter__(self) -> MockErrorStreamContextManager:
+        return self
+    async def __anext__(self) -> None:
+        raise self.error
+    async def get_final_message(self) -> MagicMock:
+        raise AssertionError("get_final_message should not be called when stream iteration fails")
 class TestClaudeHelperFunctions:
     """Test helper functions for Claude message formatting."""
@@ -410,6 +434,120 @@ class TestClaudeAgent:
         assert response.tool_calls[0].name == "my_tool"
         assert response.tool_calls[0].arguments == {"x": "value"}
+    @pytest.mark.asyncio
+    async def test_get_response_retries_same_generation_once_on_invalid_streamed_tool_json(
+        self, mock_anthropic: AsyncAnthropic
+    ) -> None:
+        """First invalid streamed tool JSON should retry without adding guidance."""
+        invalid_json_error = ValueError(
+            "Unable to parse tool parameter JSON from model. Please retry your request or "
+            "adjust your "
+            'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
+        )
+        first_stream = MockErrorStreamContextManager(invalid_json_error)
+        mock_response = MagicMock()
+        mock_response.content = [MagicMock(type="text", text="Recovered")]
+        second_stream = MockStreamContextManager(mock_response)
+        mock_anthropic.beta.messages.stream = MagicMock(side_effect=[first_stream, second_stream])
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.claude_tools = []
+        agent.tool_mapping = {}
+        agent.has_computer_tool = False
+        agent._initialized = True
+        messages: list[BetaMessageParam] = [
+            cast(
+                "BetaMessageParam",
+                {"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
+            )
+        ]
+        response = await agent.get_response(messages)
+        assert response.content == "Recovered"
+        assert mock_anthropic.beta.messages.stream.call_count == 2
+        # Original user message + assistant response (no guidance message needed)
+        assert len(messages) == 2
+        assert messages[1]["role"] == "assistant"
+    @pytest.mark.asyncio
+    async def test_get_response_adds_invalid_json_guidance_after_second_failure(
+        self, mock_anthropic: AsyncAnthropic
+    ) -> None:
+        """Second consecutive invalid JSON failure should add INVALID_JSON guidance."""
+        invalid_json_error = ValueError(
+            "Unable to parse tool parameter JSON from model. Please retry your request or "
+            "adjust your "
+            'prompt. Error: expected value at line 1 column 10. JSON: {"labels": bug}'
+        )
+        first_stream = MockErrorStreamContextManager(invalid_json_error)
+        second_stream = MockErrorStreamContextManager(invalid_json_error)
+        mock_response = MagicMock()
+        mock_response.content = [MagicMock(type="text", text="Recovered after guidance")]
+        third_stream = MockStreamContextManager(mock_response)
+        mock_anthropic.beta.messages.stream = MagicMock(
+            side_effect=[first_stream, second_stream, third_stream]
+        )
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.claude_tools = []
+        agent.tool_mapping = {}
+        agent.has_computer_tool = False
+        agent._initialized = True
+        messages: list[BetaMessageParam] = [
+            cast(
+                "BetaMessageParam",
+                {"role": "user", "content": [{"type": "text", "text": "Create a Linear ticket"}]},
+            )
+        ]
+        response = await agent.get_response(messages)
+        assert response.content == "Recovered after guidance"
+        assert mock_anthropic.beta.messages.stream.call_count == 3
+        # Original user message + INVALID_JSON guidance + assistant response
+        assert len(messages) == 3
+        retry_message = messages[1]
+        assert retry_message["role"] == "user"
+        retry_content = cast("list[dict[str, Any]]", retry_message["content"])
+        assert "INVALID_JSON" in retry_content[0]["text"]
+    @pytest.mark.asyncio
+    async def test_get_response_does_not_retry_unrelated_value_error(
+        self, mock_anthropic: AsyncAnthropic
+    ) -> None:
+        """Non-tool-json ValueErrors should propagate immediately."""
+        unrelated_error = ValueError("stream exploded for unrelated reason")
+        mock_anthropic.beta.messages.stream = MagicMock(
+            return_value=MockErrorStreamContextManager(unrelated_error)
+        )
+        agent = ClaudeAgent.create(
+            model_client=mock_anthropic,
+            validate_api_key=False,
+        )
+        agent.claude_tools = []
+        agent.tool_mapping = {}
+        agent.has_computer_tool = False
+        agent._initialized = True
+        with pytest.raises(ValueError, match="unrelated reason"):
+            await agent.get_response([])
+        assert mock_anthropic.beta.messages.stream.call_count == 1
 class TestClaudeAgentBedrock:
     """Test ClaudeAgent class with Bedrock."""

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/dev.py RENAMED Viewed

@@ -50,42 +50,41 @@ def show_dev_server_info(
     # Server section
     hud_console.section_title("Server")
-    hud_console.print(f"{hud_console.sym.ITEM} {escape(server_name)}")
+    hud_console.console.print(f"{hud_console.sym.ITEM} {escape(server_name)}", highlight=False)
+    _print = lambda msg: hud_console.console.print(msg, highlight=False)
     if transport == "http":
-        hud_console.print(f"{hud_console.sym.ITEM} http://localhost:{port}/mcp")
+        _print(f"{hud_console.sym.ITEM} http://localhost:{port}/mcp")
     else:
-        hud_console.print(f"{hud_console.sym.ITEM} (stdio)")
+        _print(f"{hud_console.sym.ITEM} (stdio)")
     # Quick Links (only for HTTP mode)
     if transport == "http":
         hud_console.section_title("Quick Links")
-        hud_console.print(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
-        hud_console.print(f"{hud_console.sym.ITEM} Cursor:")
+        _print(f"{hud_console.sym.ITEM} Docs: http://localhost:{port}/docs")
+        _print(f"{hud_console.sym.ITEM} Cursor:")
         # Display the Cursor link on its own line to prevent wrapping
         hud_console.link(cursor_deeplink)
         # Show eval endpoint if in Docker mode
         if docker_mode:
-            hud_console.print(
-                f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)"
-            )
+            _print(f"{hud_console.sym.ITEM} Eval API: http://localhost:{port}/eval (POST)")
         # Show debugging URLs from telemetry
         if telemetry:
             if "live_url" in telemetry:
                 url = escape(telemetry["live_url"])
-                hud_console.print(f"{hud_console.sym.ITEM} Live URL: {url}")
+                _print(f"{hud_console.sym.ITEM} Live URL: {url}")
             if "vnc_url" in telemetry:
-                hud_console.print(f"{hud_console.sym.ITEM} VNC URL: {escape(telemetry['vnc_url'])}")
+                _print(f"{hud_console.sym.ITEM} VNC URL: {escape(telemetry['vnc_url'])}")
             if "cdp_url" in telemetry:
-                hud_console.print(f"{hud_console.sym.ITEM} CDP URL: {escape(telemetry['cdp_url'])}")
+                _print(f"{hud_console.sym.ITEM} CDP URL: {escape(telemetry['cdp_url'])}")
         # Check for VNC (browser environment)
         if env_dir and (env_dir / "environment" / "server.py").exists():
             try:
                 content = (env_dir / "environment" / "server.py").read_text()
                 if "x11vnc" in content.lower() or "vnc" in content.lower():
-                    hud_console.print(f"{hud_console.sym.ITEM} VNC: http://localhost:8080/vnc.html")
+                    _print(f"{hud_console.sym.ITEM} VNC: http://localhost:8080/vnc.html")
             except Exception:  # noqa: S110
                 pass

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/eval.py RENAMED Viewed

@@ -96,7 +96,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
 # max_steps = 10
 # group_size = 1
 # byok = false  # Remote only; use encrypted env vars on the platform.
-# task_ids = ["task_1", "task_2"]
+# task_ids = ["checkout-smoke", "0"]  # slugs or 0-based indices
 # verbose = true
 # very_verbose = true
 # auto_respond = true
@@ -627,15 +627,18 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
-    # Filter by task IDs if provided
+    # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
-        id_set = set(cfg.task_ids)
-        # Match by task.id or index
-        filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
+        selector_set = set(cfg.task_ids)
+        filtered = []
+        for i, task in enumerate(tasks):
+            task_slug = getattr(task, "slug", None)
+            if (isinstance(task_slug, str) and task_slug in selector_set) or str(i) in selector_set:
+                filtered.append(task)
         if not filtered:
-            hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
+            hud_console.error(f"No tasks found matching slugs/indices: {', '.join(cfg.task_ids)}")
             raise typer.Exit(1)
-        hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
+        hud_console.info(f"Filtered to {len(filtered)} task(s) by slug/index")
         tasks = filtered
     elif not cfg.all:
         # Single task mode (no --all, --full, or --task-ids)
@@ -687,33 +690,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
                         sanitized[agent_name] = agent_settings
                 eval_cfg_dict["agent_config"] = sanitized
-        tasks_to_create = [t for t in tasks if cfg.taskset and not t.id]
-        tasks_data = (
-            [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
-            if tasks_to_create
-            else None
-        )
-        ids = await _send_job_enter(
+        await _send_job_enter(
             job_id=job_id,
             name=f"eval ({cfg.source})" if cfg.source else "eval",
             variants=None,
             group=cfg.group_size,
             api_key=None,
             taskset=cfg.taskset,
-            tasks=tasks_data,
             hud_eval_config=eval_cfg_dict,
         )
-        if cfg.taskset and ids:
-            if len(ids) != len(tasks_to_create):
-                hud_console.warning(
-                    f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
-                    f"received {len(ids)} IDs. Some tasks may not be linked."
-                )
-            for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
-                task_obj.id = task_version_id
         trace_ids = await submit_rollouts(
             tasks=tasks,
             job_id=job_id,
@@ -809,7 +795,11 @@ def eval_command(
         help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
     group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
-    task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
+    task_ids: str | None = typer.Option(
+        None,
+        "--task-ids",
+        help="Comma-separated task slugs (or 0-based indices) to run",
+    ),
     yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
     remote: bool = typer.Option(
         False, "--remote", help="Submit tasks to platform for remote execution"

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/flows/dev.py RENAMED Viewed

@@ -138,8 +138,9 @@ def show_dev_ui(
     # Show other info below
     label = "Base image" if is_docker else "Server"
     hud_console.info("")
-    hud_console.print(f"{hud_console.sym.ITEM} {escape(label)}: {escape(server_name)}")
-    hud_console.print(f"{hud_console.sym.ITEM} Cursor:")
+    _print = lambda msg: hud_console.console.print(msg, highlight=False)
+    _print(f"{hud_console.sym.ITEM} {escape(label)}: {escape(server_name)}")
+    _print(f"{hud_console.sym.ITEM} Cursor:")
     # Display the Cursor link on its own line to prevent wrapping
     hud_console.link(cursor_deeplink)
     hud_console.info("")

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/remove.py RENAMED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import shutil
 import typer
+from rich.markup import escape
 from hud.utils.hud_console import HUDConsole
@@ -91,8 +92,8 @@ def remove_environment(
             if image:
                 hud_console.info("")
                 hud_console.info("Note: The Docker image may still exist locally.")
-                hud_console.info(
-                    f"To remove it, run: [cyan]docker rmi {image.split('@')[0]}[/cyan]"
+                hud_console.print(
+                    f"To remove it, run: [cyan]docker rmi {escape(image.split('@')[0])}[/cyan]"
                 )
     except Exception as e:
         hud_console.error(f"Failed to remove environment: {e}")

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/tests/test_build.py RENAMED Viewed

@@ -60,12 +60,12 @@ class TestIncrementVersion:
     def test_increment_minor(self):
         """Test incrementing minor version."""
         assert increment_version("1.2.3", "minor") == "1.3.0"
-        assert increment_version("0.5.25", "minor") == "0.6.0"
+        assert increment_version("0.5.27", "minor") == "0.6.0"
     def test_increment_major(self):
         """Test incrementing major version."""
         assert increment_version("1.2.3", "major") == "2.0.0"
-        assert increment_version("0.5.25", "major") == "1.0.0"
+        assert increment_version("0.5.27", "major") == "1.0.0"
     def test_increment_with_v_prefix(self):
         """Test incrementing version with v prefix."""

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/interactive.py RENAMED Viewed

@@ -433,7 +433,7 @@ class InteractiveMCPTester:
             # Show next steps tutorial
             self.console.section_title("Next Steps")
             self.console.info("🏗️  Ready to test with real agents? Run:")
-            self.console.info("    [cyan]hud build[/cyan]")
+            self.console.print("    [cyan]hud build[/cyan]")
             self.console.info("")
             self.console.info("This will:")
             self.console.info("  1. Build your environment image")
@@ -441,8 +441,10 @@ class InteractiveMCPTester:
             self.console.info("  3. Prepare it for testing with agents")
             self.console.info("")
             self.console.info("Then you can:")
-            self.console.info("  • Test locally: [cyan]hud run <image>[/cyan]")
-            self.console.info("  • Push to registry: [cyan]hud push --image <registry/name>[/cyan]")
+            self.console.print("  • Test locally: [cyan]hud run <image>[/cyan]")
+            self.console.print(
+                "  • Push to registry: [cyan]hud push --image <registry/name>[/cyan]"
+            )
             self.console.info("  • Use with agents via the lock file")
             console.print("\n[dim]Happy testing! 🎉[/dim]")

{hud_python-0.5.25 → hud_python-0.5.27}/hud/cli/utils/version_check.py RENAMED Viewed

@@ -26,6 +26,7 @@ from typing import NamedTuple
 import httpx
 from packaging import version
+from rich.markup import escape
 from hud.utils.hud_console import HUDConsole
@@ -241,16 +242,12 @@ def display_update_prompt(console: HUDConsole | None = None) -> None:
             else:
                 upgrade_cmd = "uv tool upgrade hud-python"
-            # Create update message
-            update_msg = (
-                f"🆕 A new version of hud-python is available: "
-                f"[bold cyan]{info.latest}[/bold cyan] "
-                f"(current: [dim]{info.current}[/dim])\n"
-                f"   Run: [bold yellow]{upgrade_cmd}[/bold yellow] to update"
+            console.print(
+                f"[yellow]🆕 A new version of hud-python is available: "
+                f"[bold cyan]{escape(info.latest)}[/bold cyan] "
+                f"(current: [dim]{escape(info.current)}[/dim])\n"
+                f"   Run: [bold yellow]{escape(upgrade_cmd)}[/bold yellow] to update[/yellow]"
             )
-            # Display using console info
-            console.info(f"[yellow]{update_msg}[/yellow]")
     except Exception:  # noqa: S110
         # Never let version checking disrupt the user's workflow
         pass

{hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/loader.py RENAMED Viewed

@@ -110,6 +110,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
 def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
     """Load raw task dicts from HUD API."""
+    from hud.datasets.utils import _normalize_task_dict
     headers = {}
     if settings.api_key:
         headers["Authorization"] = f"Bearer {settings.api_key}"
@@ -126,13 +128,11 @@ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
         # Extract tasks dict from response
         tasks_dict = data.get("tasks", {})
-        raw_items: list[dict[str, Any]] = []
-        for task_id, task_data in tasks_dict.items():
-            if task_data.get("id") is None:
-                task_data["id"] = task_id
-            raw_items.append(task_data)
-        return raw_items
+        return [
+            _normalize_task_dict(task_data)
+            for task_data in tasks_dict.values()
+            if isinstance(task_data, dict)
+        ]
 def _load_from_api(dataset_name: str) -> list[Task]:
@@ -282,8 +282,13 @@ def save_tasks(
                 "Use Task.from_v4(legacy_task) to convert from LegacyTask."
             )
-    # Convert tasks to dicts (Task is a Pydantic model)
-    task_dicts = [task.model_dump(mode="json", exclude_none=True) for task in tasks]
+    # Convert tasks to dicts (Task is a Pydantic model).
+    # id is internal/platform-assigned; uploads should identify via slug.
+    task_dicts: list[dict[str, Any]] = []
+    for task in tasks:
+        task_data = task.model_dump(mode="json", exclude_none=True)
+        task_data.pop("id", None)
+        task_dicts.append(task_data)
     # Build request payload
     payload: dict[str, Any] = {
@@ -296,7 +301,7 @@ def save_tasks(
     try:
         with httpx.Client(timeout=60) as client:
             response = client.post(
-                f"{settings.hud_api_url}/tasks/evalset",
+                f"{settings.hud_api_url}/tasks/upload",
                 json=payload,
                 headers=headers,
             )

{hud_python-0.5.25 → hud_python-0.5.27}/hud/datasets/runner.py RENAMED Viewed

@@ -187,7 +187,7 @@ async def run_single_task(
         ```
     """
     # Determine trace name
-    effective_trace_name = trace_name or task_id or task.id or "single_task"
+    effective_trace_name = trace_name or task_id or task.slug or "single_task"
     # Run with explicit eval context parameters
     async with hud.eval(

hud-python 0.5.25__tar.gz → 0.5.27__tar.gz

hud-python 0.5.25__tar.gz → 0.5.27__tar.gz