PyPI - hud-python - Versions diffs - 0.4.32__tar.gz → 0.4.34__tar.gz - Mend

hud-python 0.4.32__tar.gz → 0.4.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (224) hide show

{hud_python-0.4.32 → hud_python-0.4.34}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.32
+Version: 0.4.34
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.32 → hud_python-0.4.34}/hud/agents/misc/response_agent.py RENAMED Viewed

@@ -16,7 +16,17 @@ class ResponseAgent:
     based on the agent's final response message.
     """
-    def __init__(self, api_key: str | None = None, model: str = "gpt-4o") -> None:
+    def __init__(
+        self, api_key: str | None = None, model: str = "gpt-4o", system_prompt: str | None = None
+    ) -> None:
+        """
+        Initialize the ResponseAgent.
+        Args:
+            api_key: The API key to use for the OpenAI client
+            model: The model to use for the OpenAI client (default: "gpt-4o")
+            system_prompt: The system prompt to use for the OpenAI client
+        """
         self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         if not self.api_key:
             raise ValueError(
@@ -26,23 +36,29 @@ class ResponseAgent:
         self.client = AsyncOpenAI(api_key=self.api_key)
         self.model = model
-        self.system_prompt = """
+        self.system_prompt = (
+            system_prompt
+            or """
         You are an assistant that helps determine the appropriate response to an agent's message.
         You will receive messages from an agent that is performing tasks for a user.
         Your job is to analyze these messages and respond with one of the following:
-        - STOP: If the agent indicates it has successfully completed a task, even if phrased as a question
-          like "I have entered the right values into this form. Would you like me to do anything else?"
-          or "Here is the website. Is there any other information you need?" or if the agent has
-          strongly determined it wants to stop the task.
+        - STOP: If the agent indicates it has successfully completed a task or is stuck,
+          struggling or says it cannot complete the task, even if phrased as a question
+          like "I have entered the right values into this form. Would you like me to do
+          anything else?" or "Here is the website. Is there any other information you
+          need?" or if the agent has strongly determined it wants to stop the task like
+          "The task is infeasible. Can I help you with something else?"
         - CONTINUE: If the agent is asking for clarification before proceeding with a task
           like "I'm about to clear cookies from this website. Would you like me to proceed?"
-          or "I've entered the right values into this form. Would you like me to continue with the rest of the task?"
+          or "I've entered the right values into this form. Would you like me to continue
+          with the rest of the task?"
         Respond ONLY with one of these two options.
-        """  # noqa: E501
+        """
+        )
     async def determine_response(self, agent_message: str) -> ResponseType:
         """

{hud_python-0.4.32 → hud_python-0.4.34}/hud/cli/__init__.py RENAMED Viewed

@@ -585,6 +585,9 @@ def build(
     ),
     no_cache: bool = typer.Option(False, "--no-cache", help="Build without Docker cache"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
+    platform: str | None = typer.Option(
+        None, "--platform", help="Set Docker target platform (e.g., linux/amd64)"
+    ),
 ) -> None:
     """🏗️ Build a HUD environment and generate lock file.
@@ -635,7 +638,7 @@ def build(
         else:
             i += 1
-    build_command(directory, tag, no_cache, verbose, env_vars)
+    build_command(directory, tag, no_cache, verbose, env_vars, platform)
 @app.command()

{hud_python-0.4.32 → hud_python-0.4.34}/hud/cli/build.py RENAMED Viewed

@@ -224,6 +224,7 @@ def build_docker_image(
     no_cache: bool = False,
     verbose: bool = False,
     build_args: dict[str, str] | None = None,
+    platform: str | None = None,
 ) -> bool:
     """Build a Docker image from a directory."""
     hud_console = HUDConsole()
@@ -236,7 +237,10 @@ def build_docker_image(
         return False
     # Build command
-    cmd = ["docker", "build", "-t", tag]
+    cmd = ["docker", "build"]
+    if platform:
+        cmd.extend(["--platform", platform])
+    cmd.extend(["-t", tag])
     if no_cache:
         cmd.append("--no-cache")
@@ -264,6 +268,7 @@ def build_environment(
     no_cache: bool = False,
     verbose: bool = False,
     env_vars: dict[str, str] | None = None,
+    platform: str | None = None,
 ) -> None:
     """Build a HUD environment and generate lock file."""
     hud_console = HUDConsole()
@@ -294,9 +299,8 @@ def build_environment(
     except Exception:
         default_image = f"{env_dir.name}:dev"
-    # Use provided tag or default
-    if not tag:
-        tag = default_image
+    # Determine final image tag to use
+    image_tag: str = tag if tag else default_image
     # Build temporary image first
     temp_tag = f"hud-build-temp:{int(time.time())}"
@@ -304,7 +308,14 @@ def build_environment(
     hud_console.progress_message(f"Building Docker image: {temp_tag}")
     # Build the image (env vars are for runtime, not build time)
-    if not build_docker_image(env_dir, temp_tag, no_cache, verbose):
+    if not build_docker_image(
+        env_dir,
+        temp_tag,
+        no_cache,
+        verbose,
+        build_args=None,
+        platform=platform,
+    ):
         hud_console.error("Docker build failed")
         raise typer.Exit(1)
@@ -422,21 +433,24 @@ def build_environment(
     # Build final image with label (uses cache from first build)
     # Also tag with version
-    base_name = tag.split(":")[0] if tag and ":" in tag else tag
+    base_name = image_tag.split(":")[0] if ":" in image_tag else image_tag
     version_tag = f"{base_name}:{new_version}"
-    label_cmd = [
-        "docker",
-        "build",
-        "--label",
-        f"org.hud.manifest.head={lock_hash}:{lock_size}",
-        "--label",
-        f"org.hud.version={new_version}",
-        "-t",
-        tag,
-        "-t",
-        version_tag,
-    ]
+    label_cmd = ["docker", "build"]
+    if platform is not None:
+        label_cmd.extend(["--platform", platform])
+    label_cmd.extend(
+        [
+            "--label",
+            f"org.hud.manifest.head={lock_hash}:{lock_size}",
+            "--label",
+            f"org.hud.version={new_version}",
+            "-t",
+            image_tag,
+            "-t",
+            version_tag,
+        ]
+    )
     label_cmd.append(str(env_dir))
@@ -457,14 +471,14 @@ def build_environment(
     hud_console.success("Built final image with lock file metadata")
     # NOW get the image ID after the final build
-    image_id = get_docker_image_id(tag)  # type: ignore
+    image_id = get_docker_image_id(image_tag)
     if image_id:
         # For local builds, store the image ID
         # Docker IDs come as sha256:hash, we want tag@sha256:hash
         if image_id.startswith("sha256:"):
-            lock_content["image"] = f"{tag}@{image_id}"
+            lock_content["image"] = f"{image_tag}@{image_id}"
         else:
-            lock_content["image"] = f"{tag}@sha256:{image_id}"
+            lock_content["image"] = f"{image_tag}@sha256:{image_id}"
         # Update the lock file with the new image reference
         with open(lock_path, "w") as f:
@@ -487,8 +501,8 @@ def build_environment(
     # Show the version tag as primary since that's what will be pushed
     hud_console.status_item("Built image", version_tag, primary=True)
-    if tag:
-        hud_console.status_item("Also tagged", tag)
+    if image_tag:
+        hud_console.status_item("Also tagged", image_tag)
     hud_console.status_item("Version", new_version)
     hud_console.status_item("Lock file", "hud.lock.yaml")
     hud_console.status_item("Tools found", str(analysis["toolCount"]))
@@ -500,7 +514,7 @@ def build_environment(
     hud_console.section_title("Next Steps")
     hud_console.info("Test locally:")
     hud_console.command_example("hud dev", "Hot-reload development")
-    hud_console.command_example(f"hud run {tag}", "Run the built image")
+    hud_console.command_example(f"hud run {image_tag}", "Run the built image")
     hud_console.info("")
     hud_console.info("Publish to registry:")
     hud_console.command_example("hud push", f"Push as {version_tag}")
@@ -517,6 +531,7 @@ def build_command(
     no_cache: bool = typer.Option(False, "--no-cache", help="Build without Docker cache"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
     env_vars: dict[str, str] | None = None,
+    platform: str | None = None,
 ) -> None:
     """Build a HUD environment and generate lock file."""
-    build_environment(directory, tag, no_cache, verbose, env_vars)
+    build_environment(directory, tag, no_cache, verbose, env_vars, platform)

{hud_python-0.4.32 → hud_python-0.4.34}/hud/cli/flows/tasks.py RENAMED Viewed

@@ -32,6 +32,7 @@ def _validate_tasks(tasks: list[Task]) -> bool:
     A task is considered remote if any "url" field anywhere inside mcp_config
     is a valid remote URL (e.g., https://mcp.hud.so/v3/mcp).
     """
     def _has_remote_url(obj: Any) -> bool:
         if isinstance(obj, dict):
             for k, v in obj.items():
@@ -99,7 +100,8 @@ def _ensure_built(env_dir: Path) -> dict[str, Any]:
         # Check Docker availability before attempting a build
         require_docker_running()
         # Run build (non-interactive). If Docker isn't running, this will raise and stop the flow.
-        build_environment(str(env_dir))
+        # Force linux/amd64 platform to ensure compatibility during RL flows.
+        build_environment(str(env_dir), platform="linux/amd64")
     # Load lock file
     with open(lock_path) as f:
@@ -146,7 +148,7 @@ def _derive_remote_image(lock_data: dict[str, Any]) -> str:
     # Base name always comes from lock_data.image to preserve org/repo
     image_ref = str(lock_data.get("image", "")).strip()
     if not image_ref:
-        raise typer.Exit("Lock file missing image reference")
+        raise typer.Exit(1)
     name, tag = extract_name_and_tag(image_ref)
     return f"{name}:{tag}"

{hud_python-0.4.32 → hud_python-0.4.34}/hud/cli/rl/remote_runner.py RENAMED Viewed

@@ -9,8 +9,8 @@ from __future__ import annotations
 import os
 import subprocess
 import time
-from pathlib import Path
 import uuid
+from pathlib import Path
 from rich.console import Console
@@ -51,9 +51,7 @@ def ensure_vllm_deployed(model_name: str, gpu_type: str = "A100", timeout: int =
     hud_console.info("Waiting for vLLM server to be ready...")
     start_time = time.time()
     with hud_console.progress() as progress:
-        progress.update(
-            "Checking deployment status (see live status on https://app.hud.so/models)"
-        )
+        progress.update("Checking deployment status (see live status on https://app.hud.so/models)")
         while True:
             if time.time() - start_time > timeout:
                 hud_console.error("Timeout waiting for vLLM deployment")

{hud_python-0.4.32 → hud_python-0.4.34}/hud/rl/actor.py RENAMED Viewed

@@ -85,18 +85,19 @@ class Actor:
                     )
                 except TimeoutError:
                     hud_console.warning_log(f"Episode timed out for task {t.id}")
-                    return Trace(isError=True, content="Episode timeout")
+                    # Attach task so buffer grouping has key
+                    return Trace(isError=True, content="Episode timeout", task=t)
             results = await asyncio.gather(
                 *[run_with_timeout(t) for t in batch],
                 return_exceptions=True,
             )
-            # Normalize exceptions to error traces
-            for res in results:
+            # Normalize exceptions to error traces and ensure task is attached
+            for t, res in zip(batch, results, strict=False):
                 if isinstance(res, Exception):
                     hud_console.warning_log(f"Episode error: {res}")
-                    traces.append(Trace(isError=True, content=str(res)))
+                    traces.append(Trace(isError=True, content=str(res), task=t))
                 else:
                     traces.append(res)
@@ -113,7 +114,8 @@ class Actor:
         except Exception:
             logger.info("GOT EXCEPTION")
-            return Trace(isError=True)
+            # Preserve task on exception for grouping
+            return Trace(isError=True, task=task)
         result.info["tool_spec"] = agent.get_tool_schemas()

{hud_python-0.4.32 → hud_python-0.4.34}/hud/rl/buffer.py RENAMED Viewed

@@ -219,12 +219,93 @@ class ReplayBuffer(Buffer[Trace]):
         else:
             raise ValueError(f"Invalid select strategy: {self.select_strategy}")
+    def _extract_group_key(self, trace: Trace) -> tuple[str, str]:
+        """Return a stable grouping key for a trace.
+        Preference order:
+        1) task.id when present (kind='id')
+        2) task.prompt exact string (kind='prompt') when id is None
+        3) 'NA' for missing/errored entries (kind='NA')
+        """
+        if getattr(trace, "isError", False):
+            return ("NA", "NA")
+        task = getattr(trace, "task", None)
+        if task is None:
+            return ("NA", "NA")
+        tid = getattr(task, "id", None)
+        if tid is not None:
+            return ("id", str(tid))
+        prompt = getattr(task, "prompt", None)
+        if prompt:
+            return ("prompt", str(prompt))
+        return ("NA", "NA")
+    def _validate_and_split_groups(
+        self, recent_traces: list[Trace]
+    ) -> tuple[list[list[Trace]], list[tuple[str, str]]]:
+        """Validate and split recent traces into homogeneous groups by id or prompt.
+        - Uses id when present; otherwise falls back to prompt equality.
+        - Any NA/error traces are excluded and the group is filled by duplicating
+          existing valid members in that group.
+        - Always returns len == groups_per_batch groups of size == group_size.
+        """
+        from collections import Counter
+        groups_per_batch = self.batch_size // self.group_size
+        window_keys = [self._extract_group_key(t) for t in recent_traces]
+        window_counter = Counter(k for k in window_keys if k[0] != "NA")
+        validated_groups: list[list[Trace]] = []
+        selected_keys: list[tuple[str, str]] = []
+        for g_idx in range(groups_per_batch):
+            start = g_idx * self.group_size
+            end = start + self.group_size
+            chunk = recent_traces[start:end]
+            key_counts = Counter()
+            per_item_keys: list[tuple[str, str]] = []
+            for tr in chunk:
+                k = self._extract_group_key(tr)
+                per_item_keys.append(k)
+                if k[0] != "NA":
+                    key_counts[k] += 1
+            if key_counts:
+                best_key = key_counts.most_common(1)[0][0]
+            elif window_counter:
+                best_key = window_counter.most_common(1)[0][0]
+            else:
+                best_key = ("NA", "NA")
+            homogeneous = [tr for tr, k in zip(chunk, per_item_keys, strict=False) if k == best_key]
+            while len(homogeneous) < self.group_size:
+                if homogeneous:
+                    homogeneous.append(homogeneous[-1])
+                else:
+                    idx = next((i for i, wk in enumerate(window_keys) if wk[0] != "NA"), None)
+                    if idx is not None:
+                        homogeneous.append(recent_traces[idx])
+                    elif chunk:
+                        homogeneous.append(chunk[0])
+                    else:
+                        homogeneous.append(recent_traces[0])
+            validated_groups.append(homogeneous)
+            selected_keys.append(best_key)
+        return validated_groups, selected_keys
     def _sample_high_variance_traces(self) -> list[Trace]:
         from collections import Counter, defaultdict, deque
-        # Expect recent window to already be grouped by task id
-        # Build recent window and earlier lookup (short form)
         buf_list = list(self.buffer)
         if len(buf_list) < self.batch_size:
             hud_console.warning(
@@ -234,81 +315,32 @@ class ReplayBuffer(Buffer[Trace]):
                 take = min(len(buf_list) or 1, self.batch_size - len(buf_list))
                 buf_list.extend(buf_list[:take])
         recent_traces = buf_list[-self.batch_size :]
-        hud_console.info(
-            f"[group-sampler] recent-window histogram: {Counter(getattr(t.task, 'id', 'NA') for t in recent_traces)}"  # noqa: E501
-        )
+        recent_keys = [self._extract_group_key(t) for t in recent_traces]
+        hud_console.info(f"[group-sampler] recent-window histogram: {Counter(recent_keys)}")
         hud_console.info(
             f"[group-sampler] Building earlier traces lookup, buffer size: {len(buf_list)}"
         )
-        earlier_traces_by_task: dict[str, deque[Trace]] = defaultdict(deque)
+        earlier_traces_by_key: dict[tuple[str, str], deque[Trace]] = defaultdict(deque)
         for tr in buf_list[: -self.batch_size]:
-            earlier_traces_by_task[getattr(tr.task, "id", "NA")].append(tr)
+            k = self._extract_group_key(tr)
+            if k[0] != "NA":
+                earlier_traces_by_key[k].append(tr)
+        groups, group_keys = self._validate_and_split_groups(recent_traces)
-        # Chunk from the most-recent end
         final_traces: list[Trace] = []
-        groups_per_batch = self.batch_size // self.group_size
-        hud_console.info(f"[group-sampler] Processing {groups_per_batch} groups")
-        for g_idx in range(groups_per_batch):
-            start = g_idx * self.group_size
-            end = start + self.group_size
-            group = recent_traces[start:end]
-            # Assert homogeneity: every trace in a group must share the same task id
-            cnt = Counter(getattr(t.task, "id", "NA") for t in group)
-            if len(cnt) != 1:
-                raise RuntimeError(f"Group {g_idx} is not homogeneous: {dict(cnt)}")
-            target_tid = next(iter(cnt.keys()))
-            # Build homogeneous group of target_tid, filling from earlier traces to increase spread
-            homogeneous: list[Trace] = [
-                t for t in group if getattr(t.task, "id", "NA") == target_tid
-            ]
-            needed = self.group_size - len(homogeneous)
-            # Greedy fill: choose earlier traces (same task-id) farthest from current mean reward
-            def current_mean(homogeneous: list[Trace]) -> float:
-                if not homogeneous:
+        for g_idx, (homogeneous, target_key) in enumerate(zip(groups, group_keys, strict=False)):
+            def current_mean(h: list[Trace]) -> float:
+                if not h:
                     return 0.0
-                vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in homogeneous]
+                vals = [float(getattr(t, "reward", 0.0) or 0.0) for t in h]
                 return sum(vals) / len(vals)
-            while needed > 0:
-                pool = earlier_traces_by_task.get(target_tid, deque())
-                if pool:
-                    mu = current_mean(homogeneous)
-                    # pick element farthest from current mean
-                    best_i = None
-                    best_dist = -1.0
-                    for i, tr in enumerate(list(pool)):
-                        r = float(getattr(tr, "reward", 0.0) or 0.0)
-                        dist = abs(r - mu)
-                        if dist > best_dist:
-                            best_dist = dist
-                            best_i = i
-                    # pop selected
-                    chosen = list(pool)[best_i]  # type: ignore[index]
-                    # remove from deque efficiently by rotating
-                    left = list(pool)
-                    if best_i is not None:
-                        left.pop(best_i)  # O(n) but pool is small in practice
-                        earlier_traces_by_task[target_tid] = deque(left)
-                        homogeneous.append(chosen)
-                else:
-                    # duplicate extreme within current homogeneous set
-                    if not homogeneous:
-                        raise RuntimeError(f"Group {g_idx} has no traces for target {target_tid}")
-                    mu = current_mean(homogeneous)
-                    extreme = max(
-                        homogeneous, key=lambda t: abs(float(getattr(t, "reward", 0.0) or 0.0) - mu)
-                    )
-                    homogeneous.append(extreme)
-                needed -= 1
-            # Replacement step: swap in earlier traces to increase reward spread
-            pool = earlier_traces_by_task.get(target_tid, deque())
+            pool = earlier_traces_by_key.get(target_key, deque())
             if pool:
-                # Log pool stats
                 pool_vals = [float(getattr(tr, "reward", 0.0) or 0.0) for tr in list(pool)]
                 if pool_vals:
                     pool_mean = sum(pool_vals) / len(pool_vals)
@@ -316,16 +348,15 @@ class ReplayBuffer(Buffer[Trace]):
                         pool_vals
                     )
                     hud_console.info(
-                        f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} mean={pool_mean:.4f} std={(pool_var**0.5):.4f}"  # noqa: E501
+                        f"[group-sampler] Group {g_idx}: earlier-pool size={len(pool_vals)} "
+                        f"mean={pool_mean:.4f} std={(pool_var**0.5):.4f}"
                     )
-                # Decide how many to replace (up to 1/4 of group, at least 1)
                 replace_k = max(1, self.group_size // 4)
                 replace_k = min(replace_k, len(pool), self.group_size)
                 if replace_k > 0:
                     mu = current_mean(homogeneous)
-                    # Select replacement candidates from pool farthest from current mean
                     pool_list = list(pool)
                     pool_indices = list(range(len(pool_list)))
                     pool_indices.sort(
@@ -337,12 +368,11 @@ class ReplayBuffer(Buffer[Trace]):
                     chosen_pool_idx = set(pool_indices[:replace_k])
                     replacements = [pool_list[i] for i in pool_indices[:replace_k]]
-                    # Remove chosen from pool deque
                     remaining = [tr for i, tr in enumerate(pool_list) if i not in chosen_pool_idx]
-                    earlier_traces_by_task[target_tid] = deque(remaining)
+                    earlier_traces_by_key[target_key] = deque(remaining)
-                    # Select current group positions closest to mean to replace
                     group_indices = list(range(len(homogeneous)))
+                    mu = current_mean(homogeneous)
                     group_indices.sort(
                         key=lambda i: abs(
                             (float(getattr(homogeneous[i], "reward", 0.0) or 0.0)) - mu
@@ -353,18 +383,19 @@ class ReplayBuffer(Buffer[Trace]):
                     for pos, new_tr in zip(target_positions, replacements, strict=False):
                         homogeneous[pos] = new_tr
-            # Validate homogeneity
-            if any(getattr(t.task, "id", "NA") != target_tid for t in homogeneous):
+            if any(self._extract_group_key(t) != target_key for t in homogeneous):
                 raise RuntimeError(f"Group {g_idx} is not homogeneous after sampling")
             final_traces.extend(homogeneous)
         for i in range(0, len(final_traces), self.group_size):
             block = final_traces[i : i + self.group_size]
-            if len({getattr(t.task, "id", "NA") for t in block}) != 1:
+            keys = {self._extract_group_key(t) for t in block}
+            if len(keys) != 1:
                 raise RuntimeError(f"Homogeneity validation failed for block starting at index {i}")
         hud_console.info(
-            f"[group-sampler] final histogram: {Counter(getattr(t.task, 'id', 'NA') for t in final_traces)}"  # noqa: E501
+            f"[group-sampler] final histogram: "
+            f"{Counter(self._extract_group_key(t) for t in final_traces)}"
         )
         return final_traces

{hud_python-0.4.32 → hud_python-0.4.34}/hud/rl/tests/test_learner.py RENAMED Viewed

@@ -38,15 +38,20 @@ def make_sample(
     ref_logp_tok: torch.Tensor,
     advantage: float,
 ):
-    # Minimal object with required attributes for compute_loss
-    # inputs only needed for metrics token count
+    # Minimal-but-correct object for GRPOLearner.compute_loss.
+    # Needs assistant_mask (T-1) and attention_mask (T) for sanity_check().
     Tm1 = pol_logp_tok.size(-1)
-    inputs = {"input_ids": torch.zeros(1, Tm1 + 1, dtype=torch.long)}
+    inputs = {
+        "input_ids": torch.zeros(1, Tm1 + 1, dtype=torch.long),
+        "attention_mask": torch.ones(1, Tm1 + 1, dtype=torch.long),
+        "assistant_mask": torch.ones(1, Tm1, dtype=torch.bool),
+    }
     return TrainingSample(
         inputs=inputs,
         old_logprobs=old_logp_tok,
         ref_logprobs=ref_logp_tok,
-        advantage=torch.tensor(advantage, dtype=torch.float32),
+        # advantage must be 1D so .view(-1,1) works in compute_loss
+        advantage=torch.tensor([advantage], dtype=torch.float32),
     )
@@ -155,6 +160,13 @@ def test_skip_update_when_zero_adv(monkeypatch, learner_stub: GRPOLearner):
     monkeypatch.setattr(GRPOLearner, "prepare_groups", _stub_prepare_groups, raising=True)
+    # Return a zero scalar loss that *depends* on params so backward works,
+    # but has zero gradients (no update signal).
+    def _zero_loss(self, sample) -> torch.Tensor:
+        return sum(p.sum() for p in self.policy.parameters()) * 0.0
+    monkeypatch.setattr(GRPOLearner, "compute_loss", _zero_loss, raising=True)
     # Count optimizer.step calls
     steps = {"n": 0}
     # orig_step = learner_stub.optimizer.step
@@ -168,4 +180,7 @@ def test_skip_update_when_zero_adv(monkeypatch, learner_stub: GRPOLearner):
     assert any(p.requires_grad for p in learner_stub.policy.parameters())
     learner_stub.update([])
-    assert steps["n"] == 0
+    # With the current learner implementation we still call optimizer.step()
+    # even if the per-minibatch "advantage" is zero (the step is a no-op
+    # because the gradients are zero). So we expect exactly one step here.
+    assert steps["n"] == 1

hud_python-0.4.34/hud/samples/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Sample tasks."""
+from __future__ import annotations
+from hud.samples.browser import BrowserTask
+__all__ = ["BrowserTask"]

hud_python-0.4.34/hud/samples/browser.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Sample browser task factory."""
+from __future__ import annotations
+from typing import Any
+from pydantic import Field
+from hud.settings import settings
+from hud.types import MCPToolCall, Task
+class BrowserTask(Task):
+    """Task subclass with browser defaults for BrowserTask(prompt=...)."""
+    prompt: str = "Open Google and be ready to search."
+    mcp_config: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "browser": {
+                "url": "https://mcp.hud.so/v3/mcp",
+                "headers": {
+                    "Authorization": f"Bearer {settings.api_key}",
+                    "Mcp-Image": "hudevals/hud-remote-browser:0.1.1",
+                },
+            }
+        }
+    )
+    setup_tool: MCPToolCall | list[MCPToolCall] | None = Field(
+        default_factory=lambda: MCPToolCall(
+            name="setup",
+            arguments={"name": "navigate_to_url", "arguments": {"url": "https://www.google.com"}},
+        )
+    )

hud-python 0.4.32__tar.gz → 0.4.34__tar.gz

Potentially problematic release.

hud-python 0.4.32__tar.gz → 0.4.34__tar.gz