PyPI - hud-python - Versions diffs - 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl - Mend

hud-python 0.4.31 (py3-none-any.whl) → 0.4.33 (py3-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (17)

hud/cli/flows/tasks.py +83 -14
hud/cli/push.py +1 -0
hud/cli/rl/remote_runner.py +75 -62
hud/clients/utils/mcp_use_retry.py +3 -3
hud/rl/buffer.py +108 -77
hud/samples/__init__.py +7 -0
hud/samples/browser.py +33 -0
hud/types.py +19 -6
hud/utils/mcp.py +6 -1
hud/utils/tests/test_version.py +1 -1
hud/utils/tool_shorthand.py +59 -0
hud/version.py +1 -1
{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/METADATA +1 -1
{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/RECORD +17 -14
{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/WHEEL +0 -0
{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/licenses/LICENSE +0 -0

hud/cli/flows/tasks.py CHANGED Viewed

@@ -27,9 +27,28 @@ def _is_remote_url(url: str) -> bool:
 def _validate_tasks(tasks: list[Task]) -> bool:
-    """Validate the tasks file."""
+    """Validate the tasks file: return True if tasks already reference a remote MCP URL.
+    A task is considered remote if any "url" field anywhere inside mcp_config
+    is a valid remote URL (e.g., https://mcp.hud.so/v3/mcp).
+    """
+    def _has_remote_url(obj: Any) -> bool:
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                if k == "url" and isinstance(v, str) and _is_remote_url(v):
+                    return True
+                if _has_remote_url(v):
+                    return True
+        elif isinstance(obj, list):
+            for item in obj:
+                if _has_remote_url(item):
+                    return True
+        return False
     for task in tasks:
-        if not task.mcp_config or (not _is_remote_url(task.mcp_config.get("url", ""))):
+        cfg = task.mcp_config or {}
+        if not _has_remote_url(cfg):
             return False
     return True
@@ -100,7 +119,7 @@ def _ensure_pushed(env_dir: Path, lock_data: dict[str, Any]) -> dict[str, Any]:
         require_docker_running()
         # If Docker or login is not configured, the push function will fail and halt.
-        push_environment(str(env_dir))
+        push_environment(str(env_dir), yes=True)
         # Reload lock after push
         lock_path = env_dir / "hud.lock.yaml"
@@ -111,10 +130,24 @@ def _ensure_pushed(env_dir: Path, lock_data: dict[str, Any]) -> dict[str, Any]:
 def _derive_remote_image(lock_data: dict[str, Any]) -> str:
-    """Derive org/name:tag from lock file image field for MCP header."""
+    """Derive org/name:tag from lock file for MCP header.
+    Preference order:
+    1) lock_data["push"]["image_with_tag"] if present
+    2) Derive from lock_data["image"] (may be a digest; falls back to latest)
+    """
+    push_info = lock_data.get("push", {}) if isinstance(lock_data, dict) else {}
+    # 1) Exact image_with_tag if present
+    pushed_with_tag = str(push_info.get("image_with_tag", "")).strip()
+    if pushed_with_tag:
+        name, tag = extract_name_and_tag(pushed_with_tag)
+        return f"{name}:{tag}"
+    # Base name always comes from lock_data.image to preserve org/repo
     image_ref = str(lock_data.get("image", "")).strip()
     if not image_ref:
-        raise typer.Exit("Lock file missing image reference")
+        raise typer.Exit(1)
     name, tag = extract_name_and_tag(image_ref)
     return f"{name}:{tag}"
@@ -157,19 +190,55 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
     # Derive remote image name org/name:tag
     remote_image = _derive_remote_image(lock_data)
+    # Helper to strip extra fields from tool calls
+    def _simplify_tool_call(tool: Any) -> Any:
+        def _one(x: Any) -> dict[str, Any]:
+            try:
+                data = x.model_dump() if hasattr(x, "model_dump") else dict(x)
+            except Exception:
+                try:
+                    data = dict(x)
+                except Exception:
+                    return {}
+            # Keep only name and arguments
+            name = data.get("name")
+            arguments = data.get("arguments", {})
+            return {"name": name, "arguments": arguments}
+        if tool is None:
+            return None
+        if isinstance(tool, list):
+            return [_one(x) for x in tool]
+        return _one(tool)
     # Convert to list[dict]
     tasks_payload: list[dict[str, Any]] = []
     for t in tasks:
-        item = t.model_dump()
-        item["mcp_config"] = {
-            "hud": {
-                "url": "https://mcp.hud.so/v3/mcp",
-                "headers": {
-                    "Authorization": "Bearer ${HUD_API_KEY}",
-                    "Mcp-Image": remote_image,
-                },
-            }
+        item: dict[str, Any] = {
+            "prompt": t.prompt,
+            "mcp_config": {
+                "hud": {
+                    "url": "https://mcp.hud.so/v3/mcp",
+                    "headers": {
+                        "Authorization": "Bearer ${HUD_API_KEY}",
+                        "Mcp-Image": remote_image,
+                    },
+                }
+            },
         }
+        # Optional fields, omit Nones
+        if t.setup_tool is not None:
+            item["setup_tool"] = _simplify_tool_call(t.setup_tool)
+        if t.evaluate_tool is not None:
+            item["evaluate_tool"] = _simplify_tool_call(t.evaluate_tool)
+        if t.agent_tools is not None:
+            item["agent_tools"] = t.agent_tools
+        if t.system_prompt is not None:
+            item["system_prompt"] = t.system_prompt
+        if t.metadata:
+            item["metadata"] = t.metadata
         tasks_payload.append(item)
     # Write new file: remote_<name>.json (always JSON array)

hud/cli/push.py CHANGED Viewed

@@ -332,6 +332,7 @@ def push_environment(
         "source": local_image,
         "pushedAt": datetime.now(UTC).isoformat().replace("+00:00", "Z"),
         "registry": pushed_digest.split("/")[0] if "/" in pushed_digest else "docker.io",
+        "image_with_tag": image,
     }
     # Save updated lock file

hud/cli/rl/remote_runner.py CHANGED Viewed

@@ -9,6 +9,7 @@ from __future__ import annotations
 import os
 import subprocess
 import time
+import uuid
 from pathlib import Path
 from rich.console import Console
@@ -29,6 +30,41 @@ GPU_PRICING = {
 }
+def ensure_vllm_deployed(model_name: str, gpu_type: str = "A100", timeout: int = 600) -> None:
+    """Deploy vLLM for a model if needed and wait until it's ready.
+    Args:
+        model_name: The name of the model to deploy vLLM for
+        gpu_type: GPU type to use for deployment (e.g., A100, H100)
+        timeout: Max seconds to wait for vLLM to be ready
+    """
+    # Check current model status
+    info = rl_api.get_model(model_name)
+    if info.vllm_url:
+        hud_console.success("vLLM server already running")
+        return
+    hud_console.info(f"Deploying vLLM server for {model_name}...")
+    rl_api.deploy_vllm(model_name, gpu_type=gpu_type)
+    hud_console.success("vLLM deployment started")
+    hud_console.info("Waiting for vLLM server to be ready...")
+    start_time = time.time()
+    with hud_console.progress() as progress:
+        progress.update("Checking deployment status (see live status on https://app.hud.so/models)")
+        while True:
+            if time.time() - start_time > timeout:
+                hud_console.error("Timeout waiting for vLLM deployment")
+                raise ValueError("vLLM deployment timeout")
+            info = rl_api.get_model(model_name)
+            if info.vllm_url or info.status == "ready":
+                hud_console.success(
+                    f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
+                )
+                break
+            time.sleep(5)
 def run_remote_training(
     tasks_file: str | None,
     model: str | None,
@@ -128,49 +164,55 @@ def run_remote_training(
             from rich.prompt import Prompt
             # Ask for model name
-            default_name = model_type.split("/")[-1].lower()
+            base_default = model_type.split("/")[-1].lower()
+            default_name = base_default
+            existing_names = {m.name for m in active_models}
+            suffix = 1
+            while default_name in existing_names:
+                default_name = f"{base_default}-{suffix}"
+                suffix += 1
             hud_console.info(f"Enter model name (default: {default_name}):")
             model_name = Prompt.ask("Model name", default=default_name)
             model_name = model_name.replace("/", "-").lower()
-            # Create the model
+            # Create the model with retry on name conflict
             hud_console.info(f"Creating model: {model_name}")
             try:
                 rl_api.create_model(model_name, model_type)
                 hud_console.success(f"Created model: {model_name}")
+                ensure_vllm_deployed(model_name, gpu_type="A100")
-                # Deploy vLLM automatically
-                hud_console.info(f"Deploying vLLM server for {model_name}...")
-                rl_api.deploy_vllm(model_name, gpu_type="A100")
-                hud_console.success("vLLM deployment started")
-                # Wait for deployment
-                hud_console.info("Waiting for vLLM server to be ready...")
-                max_wait = 600  # 10 minutes
-                start_time = time.time()
-                with hud_console.progress() as progress:
-                    progress.update(
-                        "Checking deployment status (see live status on https://app.hud.so/models)"
-                    )
+            except Exception as e:
+                # If the name already exists, suggest a new name and prompt once
+                message = str(e)
+                if "already exists" in message or "409" in message:
+                    alt_name = f"{model_name}-1"
+                    i = 1
                     while True:
-                        if time.time() - start_time > max_wait:
-                            hud_console.error("Timeout waiting for vLLM deployment")
-                            raise ValueError("vLLM deployment timeout")
-                        model_info = rl_api.get_model(model_name)
-                        if model_info.status == "ready":
-                            hud_console.success(
-                                f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
-                            )
+                        candidate = f"{model_name}-{str(uuid.uuid4())[:4]}"
+                        if candidate not in existing_names:
+                            alt_name = candidate
                             break
-                        time.sleep(5)
-            except Exception as e:
-                hud_console.error(f"Failed to create model: {e}")
-                raise
+                        i += 1
+                    hud_console.warning(
+                        f"Model '{model_name}' exists. Suggesting '{alt_name}' instead."
+                    )
+                    try:
+                        from rich.prompt import Prompt as _Prompt
+                        chosen = _Prompt.ask("Use different name", default=alt_name)
+                        chosen = chosen.replace("/", "-").lower()
+                        rl_api.create_model(chosen, model_type)
+                        hud_console.success(f"Created model: {chosen}")
+                        model_name = chosen
+                        ensure_vllm_deployed(model_name, gpu_type="A100")
+                    except Exception as e2:
+                        hud_console.error(f"Failed to create model: {e2}")
+                        raise
+                else:
+                    hud_console.error(f"Failed to create model: {e}")
+                    raise
         else:
             # Existing model selected
@@ -194,36 +236,7 @@ def run_remote_training(
                     return
             # Ensure vLLM is deployed
-            if not model_info.vllm_url:
-                hud_console.info(f"Deploying vLLM server for {model_name}...")
-                rl_api.deploy_vllm(model_name, gpu_type="A100")
-                hud_console.success("vLLM deployment started")
-                # Wait for deployment
-                hud_console.info("Waiting for vLLM server to be ready...")
-                max_wait = 600  # 10 minutes
-                start_time = time.time()
-                with hud_console.progress() as progress:
-                    progress.update(
-                        "Checking deployment status (see live status on https://app.hud.so/models)"
-                    )
-                    while True:
-                        if time.time() - start_time > max_wait:
-                            hud_console.error("Timeout waiting for vLLM deployment")
-                            raise ValueError("vLLM deployment timeout")
-                        model_info = rl_api.get_model(model_name)
-                        if model_info.vllm_url:
-                            hud_console.success(
-                                f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
-                            )
-                            break
-                        time.sleep(5)
-            else:
-                hud_console.success("vLLM server already running")
+            ensure_vllm_deployed(model_name, gpu_type="A100")
     except KeyboardInterrupt:
         hud_console.dim_info("Training cancelled", "")
         return

hud/clients/utils/mcp_use_retry.py CHANGED Viewed

@@ -10,13 +10,13 @@ import asyncio
 import logging
 from typing import TYPE_CHECKING, Any, TypeVar
-if TYPE_CHECKING:
-    from collections.abc import Callable
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+if TYPE_CHECKING:
+    from collections.abc import Callable
 logger = logging.getLogger(__name__)
 T = TypeVar("T")

hud/rl/buffer.py CHANGED Viewed

@@ -219,12 +219,93 @@ class ReplayBuffer(Buffer[Trace]):
         else:
             raise ValueError(f"Invalid select strategy: {self.select_strategy}")
+    def _extract_group_key(self, trace: Trace) -> tuple[str, str]:
+        """Return a stable grouping key for a trace.
+        Preference order:
+        1) task.id when present (kind='id')
+        2) task.prompt exact string (kind='prompt') when id is None
+        3) 'NA' for missing/errored entries (kind='NA')
+        """
+        if getattr(trace, "isError", False):
+            return ("NA", "NA")
+        task = getattr(trace, "task", None)
+        if task is None:
+            return ("NA", "NA")
+        tid = getattr(task, "id", None)
+        if tid is not None:
+            return ("id", str(tid))
+        prompt = getattr(task, "prompt", None)
+        if prompt:
+            return ("prompt", str(prompt))
+        return ("NA", "NA")
+    def _validate_and_split_groups(
+        self, recent_traces: list[Trace]
+    ) -> tuple[list[list[Trace]], list[tuple[str, str]]]:
+        """Validate and split recent traces into homogeneous groups by id or prompt.
+        - Uses id when present; otherwise falls back to prompt equality.
+        - Any NA/error traces are excluded and the group is filled by duplicating
+          existing valid members in that group.
+        - Always returns len == groups_per_batch groups of size == group_size.
+        """
+        from collections import Counter
+        groups_per_batch = self.batch_size // self.group_size
+        window_keys = [self._extract_group_key(t) for t in recent_traces]
+        window_counter = Counter(k for k in window_keys if k[0] != "NA")
+        validated_groups: list[list[Trace]] = []
+        selected_keys: list[tuple[str, str]] = []
+        for g_idx in range(groups_per_batch):
+            start = g_idx * self.group_size
+            end = start + self.group_size
+            chunk = recent_traces[start:end]
+            key_counts = Counter()
+            per_item_keys: list[tuple[str, str]] = []
+            for tr in chunk:
+                k = self._extract_group_key(tr)
+                per_item_keys.append(k)
+                if k[0] != "NA":
+                    key_counts[k] += 1
+            if key_counts:
+                best_key = key_counts.most_common(1)[0][0]
+            elif window_counter:
+                best_key = window_counter.most_common(1)[0][0]
+            else:
+                best_key = ("NA", "NA")
+            homogeneous = [tr for tr, k in zip(chunk, per_item_keys, strict=False) if k == best_key]
+            while len(homogeneous) < self.group_size:
+                if homogeneous:
+                    homogeneous.append(homogeneous[-1])
+                else:
+                    idx = next((i for i, wk in enumerate(window_keys) if wk[0] != "NA"), None)
+                    if idx is not None:
+                        homogeneous.append(recent_traces[idx])
+                    elif chunk:
+                        homogeneous.append(chunk[0])
+                    else:
+                        homogeneous.append(recent_traces[0])
+            validated_groups.append(homogeneous)
+            selected_keys.append(best_key)
+        return validated_groups, selected_keys
     def _sample_high_variance_traces(self) -> list[Trace]:
         from collections import Counter, defaultdict, deque
-        # Expect recent window to already be grouped by task id
-        # Build recent window and earlier lookup (short form)
         buf_list = list(self.buffer)
         if len(buf_list) < self.batch_size:
             hud_console.warning(
@@ -234,81 +315,32 @@ class ReplayBuffer(Buffer[Trace]):
                 take = min(len(buf_list) or 1, self.batch_size - len(buf_list))
                 buf_list.extend(buf_list[:take])
         recent_traces = buf_list[-self.batch_size :]
-        hud_console.info(
-            f"[group-sampler] recent-window histogram: {Counter(getattr(t.task, 'id', 'NA') for t in recent_traces)}"  # noqa: E501
-        )
+        recent_keys = [self._extract_group_key(t) for t in recent_traces]
+        hud_console.info(f"[group-sampler] recent-window histogram: {Counter(recent_keys)}")
         hud_console.info(
             f"[group-sampler] Building earlier traces lookup, buffer size: {len(buf_list)}"
         )
-        earlier_traces_by_task: dict[str, deque[Trace]] = defaultdict(deque)
+        earlier_traces_by_key: dict[tuple[str, str], deque[Trace]] = defaultdict(deque)
         for tr in buf_list[: -self.batch_size]:
-            earlier_traces_by_task[getattr(tr.task, "id", "NA")].append(tr)
+            k = self._extract_group_key(tr)
+            if k[0] != "NA":
+                earlier_traces_by_key[k].append(tr)
+        groups, group_keys = self._validate_and_split_groups(recent_traces)
-        # Chunk from the most-recent end
         final_traces: list[Trace] = []
-        groups_per_batch = self.batch_size // self.group_size
-        hud_console.info(f"[group-sampler] Processing {groups_per_batch} groups")
-        for g_idx in range(groups_per_batch):
-            start = g_idx * self.group_size
-            end = start + self.group_size
-            group = recent_traces[start:end]
-            # Assert homogeneity: every trace in a group must share the same task id
-            cnt = Counter(getattr(t.task, "id", "NA") for t in group)
-            if len(cnt) != 1:
-                raise RuntimeError(f"Group {g_idx} is not homogeneous: {dict(cnt)}")
-            target_tid = next(iter(cnt.keys()))
-            # Build homogeneous group of target_tid, filling from earlier traces to increase spread
-            homogeneous: list[Trace] = [
-                t for t in group if getattr(t.task, "id", "NA") == target_tid
-            ]
-            needed = self.group_size - len(homogeneous)
-            # Greedy fill: choose earlier traces (same task-id) farthest from current mean reward
-            def current_mean(homogeneous: list[Trace]) -> float:
-                if not homogeneous:
+        for g_idx, (homogeneous, target_key) in enumerate(zip(groups, group_keys, strict=False)):
+            def current_mean(h: list[Trace]) -> float:
+                if not h:
                     return 0.0
-                vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in homogeneous]
+                vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in h]
                 return sum(vals) / len(vals)
-            while needed > 0:
-                pool = earlier_traces_by_task.get(target_tid, deque())
-                if pool:
-                    mu = current_mean(homogeneous)
-                    # pick element farthest from current mean
-                    best_i = None
-                    best_dist = -1.0
-                    for i, tr in enumerate(list(pool)):
-                        r = float(getattr(tr, "reward", 0.0) or 0.0)
-                        dist = abs(r - mu)
-                        if dist > best_dist:
-                            best_dist = dist
-                            best_i = i
-                    # pop selected
-                    chosen = list(pool)[best_i]  # type: ignore[index]
-                    # remove from deque efficiently by rotating
-                    left = list(pool)
-                    if best_i is not None:
-                        left.pop(best_i)  # O(n) but pool is small in practice
-                        earlier_traces_by_task[target_tid] = deque(left)
-                        homogeneous.append(chosen)
-                else:
-                    # duplicate extreme within current homogeneous set
-                    if not homogeneous:
-                        raise RuntimeError(f"Group {g_idx} has no traces for target {target_tid}")
-                    mu = current_mean(homogeneous)
-                    extreme = max(
-                        homogeneous, key=lambda t: abs(float(getattr(t, "reward", 0.0) or 0.0) - mu)
-                    )
-                    homogeneous.append(extreme)
-                needed -= 1
-            # Replacement step: swap in earlier traces to increase reward spread
-            pool = earlier_traces_by_task.get(target_tid, deque())
+            pool = earlier_traces_by_key.get(target_key, deque())
             if pool:
-                # Log pool stats
                 pool_vals = [float(getattr(tr, "reward", 0.0) or 0.0) for tr in list(pool)]
                 if pool_vals:
                     pool_mean = sum(pool_vals) / len(pool_vals)
@@ -316,16 +348,15 @@ class ReplayBuffer(Buffer[Trace]):
                         pool_vals
                     )
                     hud_console.info(
-                        f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} mean={pool_mean:.4f} std={(pool_var**0.5):.4f}"  # noqa: E501
+                        f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} "
+                        f"mean={pool_mean:.4f} std={(pool_var**0.5):.4f}"
                     )
-                # Decide how many to replace (up to 1/4 of group, at least 1)
                 replace_k = max(1, self.group_size // 4)
                 replace_k = min(replace_k, len(pool), self.group_size)
                 if replace_k > 0:
                     mu = current_mean(homogeneous)
-                    # Select replacement candidates from pool farthest from current mean
                     pool_list = list(pool)
                     pool_indices = list(range(len(pool_list)))
                     pool_indices.sort(
@@ -337,12 +368,11 @@ class ReplayBuffer(Buffer[Trace]):
                     chosen_pool_idx = set(pool_indices[:replace_k])
                     replacements = [pool_list[i] for i in pool_indices[:replace_k]]
-                    # Remove chosen from pool deque
                     remaining = [tr for i, tr in enumerate(pool_list) if i not in chosen_pool_idx]
-                    earlier_traces_by_task[target_tid] = deque(remaining)
+                    earlier_traces_by_key[target_key] = deque(remaining)
-                    # Select current group positions closest to mean to replace
                     group_indices = list(range(len(homogeneous)))
+                    mu = current_mean(homogeneous)
                     group_indices.sort(
                         key=lambda i: abs(
                             (float(getattr(homogeneous[i], "reward", 0.0) or 0.0)) - mu
@@ -353,18 +383,19 @@ class ReplayBuffer(Buffer[Trace]):
                     for pos, new_tr in zip(target_positions, replacements, strict=False):
                         homogeneous[pos] = new_tr
-            # Validate homogeneity
-            if any(getattr(t.task, "id", "NA") != target_tid for t in homogeneous):
+            if any(self._extract_group_key(t) != target_key for t in homogeneous):
                 raise RuntimeError(f"Group {g_idx} is not homogeneous after sampling")
             final_traces.extend(homogeneous)
         for i in range(0, len(final_traces), self.group_size):
             block = final_traces[i : i + self.group_size]
-            if len({getattr(t.task, "id", "NA") for t in block}) != 1:
+            keys = {self._extract_group_key(t) for t in block}
+            if len(keys) != 1:
                 raise RuntimeError(f"Homogeneity validation failed for block starting at index {i}")
         hud_console.info(
-            f"[group-sampler] final histogram: {Counter(getattr(t.task, 'id', 'NA') for t in final_traces)}"  # noqa: E501
+            f"[group-sampler] final histogram: "
+            f"{Counter(self._extract_group_key(t) for t in final_traces)}"
         )
         return final_traces

hud/samples/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Sample tasks."""
+from __future__ import annotations
+from hud.samples.browser import BrowserTask
+__all__ = ["BrowserTask"]

hud/samples/browser.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Sample browser task factory."""
+from __future__ import annotations
+from typing import Any
+from pydantic import Field
+from hud.settings import settings
+from hud.types import MCPToolCall, Task
+class BrowserTask(Task):
+    """Task subclass with browser defaults for BrowserTask(prompt=...)."""
+    prompt: str = "Open Google and be ready to search."
+    mcp_config: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "browser": {
+                "url": "https://mcp.hud.so/v3/mcp",
+                "headers": {
+                    "Authorization": f"Bearer {settings.api_key}",
+                    "Mcp-Image": "hudevals/hud-remote-browser:0.1.1",
+                },
+            }
+        }
+    )
+    setup_tool: MCPToolCall | list[MCPToolCall] | None = Field(
+        default_factory=lambda: MCPToolCall(
+            name="setup",
+            arguments={"name": "navigate_to_url", "arguments": {"url": "https://www.google.com"}},
+        )
+    )

hud/types.py CHANGED Viewed

@@ -12,6 +12,7 @@ from mcp.types import CallToolRequestParams, CallToolResult
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 from hud.settings import settings
+from hud.utils.tool_shorthand import normalize_to_tool_call_dict
 logger = logging.getLogger(__name__)
@@ -59,8 +60,18 @@ class Task(BaseModel):
     @field_validator("setup_tool", "evaluate_tool", mode="before")
     @classmethod
-    def convert_dict_to_tool_call(cls, v: Any) -> Any:
-        """Convert dict to MCPToolCall instance, parsing JSON strings first."""
+    def convert_dict_to_tool_call(cls, v: Any, info: Any) -> Any:
+        """Convert dict (with shorthands) to MCPToolCall instance.
+        Supports nested forms by walking to the deepest tool name and its arguments.
+        Examples:
+        - {"name": "navigate", "arguments": {...}} -> name=navigate
+        - {"navigate": {...}} -> name=navigate
+        - {"setup": {"navigate": {...}}} -> name=navigate
+        - {"name": "setup", "arguments": {"name": "navigate", "arguments": {...}}}
+          -> name=navigate
+        - Lists are normalized element-wise
+        """
         if v is None:
             return None
@@ -73,10 +84,12 @@ class Task(BaseModel):
                 raise HudConfigError(f"Invalid JSON string: {e}") from e
-        if isinstance(v, dict):
-            return MCPToolCall(**v)
-        if isinstance(v, list):
-            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
+        normalized = normalize_to_tool_call_dict(v)
+        if isinstance(normalized, dict):
+            return MCPToolCall(**normalized)
+        if isinstance(normalized, list):
+            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in normalized]
         return v
     @field_validator("mcp_config", mode="before")

hud/utils/mcp.py CHANGED Viewed

@@ -66,8 +66,13 @@ def setup_hud_telemetry(
     auto_trace_cm = None
     if not run_id and auto_trace:
+        # Start an auto trace and capture its ID for headers/metadata
         auto_trace_cm = trace("My Trace")
-        run_id = auto_trace_cm.__enter__()
+        _trace_obj = auto_trace_cm.__enter__()
+        try:
+            run_id = getattr(_trace_obj, "id", None) or str(_trace_obj)
+        except Exception:  # pragma: no cover - fallback shouldn't fail lint
+            run_id = None
     # Patch HUD servers with run-id (works whether auto or user trace)
     if run_id:

hud/utils/tests/test_version.py CHANGED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.31"
+    assert hud.__version__ == "0.4.33"

hud/utils/tool_shorthand.py ADDED Viewed

@@ -0,0 +1,59 @@
+from __future__ import annotations
+from typing import Any
+def _is_call_like(obj: Any) -> bool:
+    if not isinstance(obj, dict):
+        return False
+    if "name" in obj and "arguments" in obj:
+        return True
+    if len(obj) == 1:
+        _, v = next(iter(obj.items()))
+        return isinstance(v, dict)
+    return False
+def _to_call_dict(obj: Any) -> Any:
+    """Recursively convert shorthand/wrapped dicts into name/arguments templates.
+    Rules:
+    - If obj is a dict with {name, arguments}: return {name, arguments: recurse(arguments)}
+    - Else if obj is a single-key dict {k: v}: return {name: k, arguments: recurse(v)}
+    - Else: return obj unchanged (leaf arguments/value)
+    """
+    if isinstance(obj, dict):
+        if "name" in obj and "arguments" in obj:
+            args = obj.get("arguments")
+            # Only recurse into arguments if it looks like another call
+            if _is_call_like(args):
+                return {"name": obj.get("name"), "arguments": _to_call_dict(args)}
+            return {"name": obj.get("name"), "arguments": args}
+        if len(obj) == 1:
+            k, v = next(iter(obj.items()))
+            if isinstance(v, dict):
+                return {"name": k, "arguments": _to_call_dict(v)}
+            return obj
+    return obj
+def normalize_to_tool_call_dict(value: Any) -> Any:
+    """
+    Convert shorthand or nested forms into a direct tool call dict:
+    {"name": final_name, "arguments": final_arguments}
+    Lists are normalized element-wise.
+    """
+    if value is None:
+        return value
+    def _normalize_one(item: Any) -> Any:
+        call = _to_call_dict(item)
+        return call
+    if isinstance(value, list):
+        return [_normalize_one(x) for x in value]
+    if isinstance(value, dict):
+        return _normalize_one(value)
+    return value

hud/version.py CHANGED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.31"
+__version__ = "0.4.33"

{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.31
+Version: 0.4.33
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 hud/__init__.py,sha256=JMDFUE1pP0J1Xl_miBdt7ERvoffZmTzSFe8yxz512A8,552
 hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
 hud/settings.py,sha256=sMS31iW1m-5VpWk-Blhi5-obLcUA0fwxWE1GgJz-vqU,2708
-hud/types.py,sha256=Cn9suZ_ZitLnxmnknfbCYVvmLsXRWI56kJ3LXtdfI6M,10157
-hud/version.py,sha256=wQqwRzN9OMugX4H2oCdAp5bWBbjfBRrvQGeo_0_uaYs,105
+hud/types.py,sha256=RtNM2fPU1NAujTmZLOydQIU-ybk3gVRCoJ2TM2hJOlw,10752
+hud/version.py,sha256=7nCICMgtZOjBoirBGd5_5Ea-s2F7XAgLvEX_110KGAU,105
 hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
 hud/agents/base.py,sha256=_u1zR3gXzZ1RlTCUYdMcvgHqdJBC4-AB1lZt0yBx8lg,35406
 hud/agents/claude.py,sha256=wHiw8iAnjnRmZyKRKcOhagCDQMhz9Z6rlSBWqH1X--M,15781
@@ -30,10 +30,10 @@ hud/cli/get.py,sha256=sksKrdzBGZa7ZuSoQkc0haj-CvOGVSSikoVXeaUd3N4,6274
 hud/cli/init.py,sha256=McZwpxZMXD-It_PXINCUy-SwUaPiQ7jdpSU5-F-caO8,19671
 hud/cli/list_func.py,sha256=EVi2Vc3Lb3glBNJxFx4MPnZknZ4xmuJz1OFg_dc8a_E,7177
 hud/cli/pull.py,sha256=Vd1l1-IwskyACzhtC8Df1SYINUZEYmFxrLl0s9cNN6c,12151
-hud/cli/push.py,sha256=JXUxu1QGU7BPWb0erSJq42CIq0sLbaDAO42yYDcvA1g,18347
+hud/cli/push.py,sha256=dmjF-hGlMfq73tquDxsTuM9t50zrkE9PFJqW5vRmYSw,18380
 hud/cli/remove.py,sha256=8vGQyXDqgtjz85_vtusoIG8zurH4RHz6z8UMevQRYM4,6861
 hud/cli/flows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud/cli/flows/tasks.py,sha256=8r-51oon3anwMAi5cyAOgC3iB48jnqlOUO3iTWmqsyI,6372
+hud/cli/flows/tasks.py,sha256=CSdUA4vWMyLHke7pQgxsHzuPAj3CxmQuu66KjSvVai4,8822
 hud/cli/rl/__init__.py,sha256=BeqXdmzPwVBptz4j796XJRxSC5B_9tQta5aKd0jDMvo,5000
 hud/cli/rl/config.py,sha256=iNhCxotM33OEiP9gqPvn8A_AxrBVe6fcFCQTvc13xzA,2884
 hud/cli/rl/display.py,sha256=hqJVGmO9csYinladhZwjF-GMvppYWngxDHajTyIJ_gM,5214
@@ -41,7 +41,7 @@ hud/cli/rl/gpu.py,sha256=peXS-NdUF5RyuSs0aZoCzGLboneBUpCy8f9f99WMrG0,2009
 hud/cli/rl/gpu_utils.py,sha256=H5ckPwgj5EVP3yJ5eVihR5R7Y6Gp6pt8ZUfWCCwcLG4,11072
 hud/cli/rl/local_runner.py,sha256=GssmDgCxGfFsi31aFj22vwCiwa9ELllEwQjbActxSXY,21514
 hud/cli/rl/presets.py,sha256=DzOO82xL5QyzdVtlX-Do1CODMvDz9ILMPapjU92jcZg,3051
-hud/cli/rl/remote_runner.py,sha256=Umyjjbtw6ikyNNfVGjn6sY6Qnta8Uc1KC7KzMgFaJVw,13146
+hud/cli/rl/remote_runner.py,sha256=JvLOf3X-a6svz_aoOKuuaWzoei6Nrp1ShHeg2KxYk7U,13725
 hud/cli/rl/rl_api.py,sha256=INJobvSa50ccR037u_GPsDa_9WboWyNwqEaoh9hcXj0,4306
 hud/cli/rl/vllm.py,sha256=Gq_M6KsQArGz7FNIdemuM5mk16mu3xe8abpO2GCCuOE,6093
 hud/cli/tests/__init__.py,sha256=ZrGVkmH7DHXGqOvjOSNGZeMYaFIRB2K8c6hwr8FPJ-8,68
@@ -83,7 +83,7 @@ hud/clients/tests/test_fastmcp.py,sha256=4q3TzDjuieTZa89taiNJIrzbUncNkYOG4Maubyp
 hud/clients/tests/test_mcp_use_retry.py,sha256=9FxLAz4L5Vv3OTtj4wdhRY23wDYALUpE12TYWl7fbJA,13299
 hud/clients/tests/test_protocol.py,sha256=aK4CS4g3j1D5jPo83ykzZuHUvcZFAulYtIq9T9Hb_fQ,6640
 hud/clients/utils/__init__.py,sha256=-zZjcKIWGj2tXbVDOW45UgoGghhLJzFQVZ6miKenuA4,595
-hud/clients/utils/mcp_use_retry.py,sha256=sBCjtgnAXiXASjzFF_AtBEtmizay0Fi0nPL6sVoooeI,6675
+hud/clients/utils/mcp_use_retry.py,sha256=knsgOTR3YFXshmPFfPQE6K6C5GpR1ZBJe2J7ozEMikA,6675
 hud/clients/utils/retry.py,sha256=mMs2T_mAlb8AYhSqMR4AmCw7838gqCC4mdG3zjMAYM4,5744
 hud/clients/utils/retry_transport.py,sha256=Rsq25eiKKt_pM1bas78QEZvO0illK97X_3opmaS3A3w,6809
 hud/datasets/__init__.py,sha256=-g05iDy76CU4JiRHjKBBhgh3STtiIjmWhUfPqgf5hJE,697
@@ -109,7 +109,7 @@ hud/otel/tests/test_processors.py,sha256=np0R4ssd9j6LJSJykJ5bNjl0POwNYNhgb7BqOZH
 hud/rl/README.md,sha256=uFRpNFaEY8paq9k1C4miF7AGnbqHTGAsPmpcf9JIEeA,1189
 hud/rl/__init__.py,sha256=yYL7U1WV6L3mr3Hig48-4lhnryTaWj4nCXm4lG5vrYI,25
 hud/rl/actor.py,sha256=0YChXyxCz1wVBQ9lKb7vSl64_HQ24-DmYqCCxuORzJc,6747
-hud/rl/buffer.py,sha256=xz4FlvO9l945VsSS4lzRFMwH3rA9HafgbUfADSauXok,15210
+hud/rl/buffer.py,sha256=FWGivdJ0YEYZZPK0bUyvjiKparaUgiBE9GzQLZj8kcA,15372
 hud/rl/chat_template.jinja,sha256=XTdzI8oFGEcSA-exKxyHaprwRDmX5Am1KEb0VxvUc6U,4965
 hud/rl/config.py,sha256=PAKYPCsKl8yg_j3gJSE5SJUgLM7j0lFy0K_Vt4-otDM,5384
 hud/rl/distributed.py,sha256=8avhrb0lHYkhW22Z7MfkqSnlczWj5jMrUMEtkcoCf74,2473
@@ -121,6 +121,8 @@ hud/rl/vllm_adapter.py,sha256=O2_TdTGIyNr9zRGhCw18XWjOKYzEM3049wvlyL2x0sc,4751
 hud/rl/tests/__init__.py,sha256=PXmD3Gs6xOAwaYKb4HnwZERDjX05N1QF-aU6ya0dBtE,27
 hud/rl/tests/test_learner.py,sha256=qfSHFFROteRb98TjBuAKjFmZjCGfuWXPysVvTAWJ7wQ,6025
 hud/rl/utils/start_vllm_server.sh,sha256=ThPokrLK_Qm_uh916fHXXBfMlw1TC97P57-AEI5MuOc,910
+hud/samples/__init__.py,sha256=wgcN1IOLHhR4C1fFKqyvA7Yl9lJhJFf34zfKs-UMSus,128
+hud/samples/browser.py,sha256=7LkzGx2G5dA8RogZwORnxxpVsxMV2gF18D_hGJIEow8,973
 hud/server/__init__.py,sha256=8LUwgsXO8xiViWP7uImDwcOsWLu01r5F4r8U8qH3rSY,91
 hud/server/context.py,sha256=6bCdSzv1FGyItu9472HbbYef279H7QuMGJDR8EtYg5Y,3210
 hud/server/low_level.py,sha256=XYs2pOJ9kN4OcJ6ahDmXM5mWkzq5wJLpKFInUYrWEok,4701
@@ -187,21 +189,22 @@ hud/utils/agent_factories.py,sha256=cvfXByqG6gOYHtm1VGeJjCpxoLxM4aJez8rH-AerP_A,
 hud/utils/async_utils.py,sha256=5cKrJcnaHV2eJNxeyx0r7fPcdPTDBK7kM9-nLaF51X4,2409
 hud/utils/group_eval.py,sha256=oaoBqlQN6g5gRQmuY_JmqM5bpuf2sFIgu4uDZ7X-3a0,8360
 hud/utils/hud_console.py,sha256=ywTrzyNhWFoQN2PpzpDDKp_32b-ACDvfKQuWxDoF8iE,21898
-hud/utils/mcp.py,sha256=jvCWb5MXlMMObhrbYoiTlI-L9HNkEjLVx8GJ-HbdQ7U,2626
+hud/utils/mcp.py,sha256=pMadd7A0DH6Y_aWywKU8jVYu2pRHGPEndV2ZQFrrj60,2888
 hud/utils/pretty_errors.py,sha256=WGeL4CTHtlA6KgPuV_JSX5l6H4-xbuTp6Y6tw1bkiFg,2430
 hud/utils/progress.py,sha256=suikwFM8sdSfkV10nAOEaInDhG4XKgOSvFePg4jSj1A,5927
 hud/utils/tasks.py,sha256=JwFIq0cpPMpMYnICUmx_G4CF6uy9MtiCmmmN7eA6FsA,4682
 hud/utils/telemetry.py,sha256=hrVIx2rUjSGyy9IVxTZ_3Jii83PiHjyFRd5ls2whimM,1863
+hud/utils/tool_shorthand.py,sha256=nWo-Z7D4w8qF1lWKP7TkXMHZiU3vj4jAwfcBXkwrpnE,1833
 hud/utils/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hud/utils/tests/test_async_utils.py,sha256=RkdSnYErRV3Jn7dfg6CPlcE1RSUL__2B627oIqAyy1s,5945
 hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,383
 hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
 hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
 hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
-hud/utils/tests/test_version.py,sha256=qN98qLlKKeM-W-AM1q1s0Lci8phBQ7SUd_L-0yVmujA,160
+hud/utils/tests/test_version.py,sha256=Wdb1xAhmZ4EgoOIqiOcIr3iRZIxEMUCPCgee6cAlR3s,160
 hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud_python-0.4.31.dist-info/METADATA,sha256=UZMnmiRCMmHmXOD0BrkQs7Caqk5t8HcllxWPwR2SSzc,20861
-hud_python-0.4.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hud_python-0.4.31.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
-hud_python-0.4.31.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
-hud_python-0.4.31.dist-info/RECORD,,
+hud_python-0.4.33.dist-info/METADATA,sha256=EQgm-qxFqkYHk78gbjyHW0KTUu03JKnBidSwzfDG4ZY,20861
+hud_python-0.4.33.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hud_python-0.4.33.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
+hud_python-0.4.33.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
+hud_python-0.4.33.dist-info/RECORD,,

{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/WHEEL RENAMED Viewed

File without changes

{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{hud_python-0.4.31.dist-info → hud_python-0.4.33.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

hud-python 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.31py3-none-any.whl → 0.4.33py3-none-any.whl