PyPI - hud-python - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

hud-python 0.2.0py3-none-any.whl → 0.2.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (17) hide show

hud/__init__.py +1 -1
hud/adapters/claude/adapter.py +9 -1
hud/adapters/common/types.py +7 -0
hud/adapters/operator/adapter.py +4 -0
hud/agent/claude.py +22 -2
hud/agent/operator.py +35 -17
hud/env/docker_client.py +1 -1
hud/env/environment.py +182 -9
hud/env/local_docker_client.py +3 -1
hud/task.py +41 -30
hud/taskset.py +8 -0
hud/utils/common.py +28 -1
hud/utils/config.py +1 -92
{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/METADATA +19 -26
{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/RECORD +17 -17
{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0
{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +0 -0

hud/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ from . import agent, env, gym, settings, task, taskset, types, utils
 from .job import create_job, job, load_job
 from .taskset import load_taskset
-__version__ = "0.2.0"
+__version__ = "0.2.1"
 __all__ = [
     "agent",

hud/adapters/claude/adapter.py CHANGED Viewed

@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
     Point,
     PositionFetch,
     PressAction,
+    ResponseAction,
     ScreenshotFetch,
     ScrollAction,
     TypeAction,
@@ -21,7 +22,10 @@ from hud.adapters.common.types import (
 class ClaudeAdapter(Adapter):
-    KEY_MAP: ClassVar[dict[str, CLAKey]] = {"Return": "enter"}
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "Return": "enter",
+        "Super": "win",
+        }
     def __init__(self) -> None:
         super().__init__()
@@ -151,6 +155,10 @@ class ClaudeAdapter(Adapter):
             elif action_type == "wait":
                 assert "duration" in data
                 return WaitAction(time=data["duration"])
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
             else:
                 raise ValueError(f"Unsupported action type: {action_type}")
         except AssertionError:

hud/adapters/common/types.py CHANGED Viewed

@@ -82,6 +82,12 @@ class DragAction(CLAAction):
     hold_keys: list[CLAKey] | None = None
+# RESPONSE ACTION from agent
+class ResponseAction(CLAAction):
+    type: Literal["response"] = "response"
+    text: str # The final textual response from the agent
 # SCREENSHOT ACTION
 class ScreenshotFetch(CLAAction):
     type: Literal["screenshot"] = "screenshot"
@@ -103,6 +109,7 @@ CLA = Annotated[
     | KeyDownAction
     | KeyUpAction
     | TypeAction
+    | ResponseAction
     | ScrollAction
     | MoveAction
     | WaitAction

hud/adapters/operator/adapter.py CHANGED Viewed

@@ -10,6 +10,7 @@ from hud.adapters.common.types import (
     MoveAction,
     Point,
     PressAction,
+    ResponseAction,
     ScreenshotFetch,
     ScrollAction,
     TypeAction,
@@ -86,6 +87,9 @@ class OperatorAdapter(Adapter):
             elif action_type == "screenshot":
                 return ScreenshotFetch()
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
             else:
                 raise ValueError(f"Unsupported action type: {action_type}")

hud/agent/claude.py CHANGED Viewed

@@ -11,7 +11,7 @@ from anthropic.types.beta import (
     BetaImageBlockParam,
 )
+from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.claude import ClaudeAdapter
 from hud.env.environment import Observation
@@ -61,7 +61,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     def __init__(
         self,
         client: AsyncAnthropic | None = None,
-        adapter: ClaudeAdapter | None = None,
+        adapter: Adapter | None = None,
         model: str = "claude-3-7-sonnet-20250219",
         max_tokens: int = 4096,
         max_iterations: int = 10,
@@ -85,6 +85,8 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             # Create client
             client = AsyncAnthropic(api_key=api_key)
+        adapter = adapter or ClaudeAdapter()
         super().__init__(client=client, adapter=adapter)
@@ -184,4 +186,22 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
                 done = False
                 break
+        # If no tool use action was found, check for a final text response
+        if not actions and done:
+            final_text_response = ""
+            for block in response_content:
+                if block.type == "text":
+                    final_text_response += block.text
+            if final_text_response.strip():
+                logger.info(f"No tool use found. Using final text as response: {final_text_response}")
+                actions = [{
+                    "action": "response",
+                    "text": final_text_response.strip()
+                }]
+                # Keep done = True
+            else:
+                 logger.info("No tool use and no final text block found.")
+                 # Keep done = True, actions remains empty
         return actions, done

hud/agent/operator.py CHANGED Viewed

@@ -9,9 +9,11 @@ from openai.types.responses import (
     ResponseInputParam,
     ResponseInputItemParam,
     ResponseOutputMessage,
-    ResponseComputerToolCall
+    ResponseComputerToolCall,
+    ResponseOutputText
 )
+from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.operator import OperatorAdapter
 from hud.env.environment import Observation
@@ -32,7 +34,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
         client: OpenAI | None = None,
         model: str = "computer-use-preview",
         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
-        adapter: OperatorAdapter | None = None,
+        adapter: Adapter | None = None,
         max_iterations: int = 8
     ):
         """
@@ -54,6 +56,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             # Create synchronous client
             client = OpenAI(api_key=api_key)
+        adapter = adapter or OperatorAdapter()
         super().__init__(client=client, adapter=adapter)
@@ -74,7 +78,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
         self.last_response_id = None
         self.pending_call_id = None
         self.initial_prompt = None
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from the model based on the observation.
@@ -158,33 +162,47 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
         # Store the response ID for the next call
         self.last_response_id = response.id
-        # Process the response to extract computer calls
+        # Process the response to extract actions or final text
         actions = []
-        done = True  # Assume we're done unless we find a computer call
-        # Loop through all items in the output to find computer_call items
+        done = True  # Assume done unless a computer call is found
+        final_text_response = ""
+        # Check for computer calls first
         computer_calls = [
             item for item in response.output
             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
         ]
         if computer_calls:
-            # Extract the computer calls and mark that we're not done
+            # If computer calls exist, process them and set done=False
             done = False
-            # Process all computer calls
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 action = computer_call.action
-                actions.append(action.model_dump())
-                # Log the action
+                actions.append(action.model_dump()) # Convert Pydantic model to dict
                 logger.info(f"Computer call action: {action}")
         else:
-            # If there are no computer calls, print some debug info
-            logger.info("No computer call found in the response. Either complete or error.")
+            # No computer calls, check for a final text message
+            logger.info("No computer call found. Checking for final message.")
+            logger.info(response.output)
             for item in response.output:
                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
-                    logger.info(f"Message: {item.content}")
+                    # Extract text from content blocks within the message
+                    full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
+                    if full_text:
+                        final_text_response = full_text
+                        logger.info(f"Final text message: {final_text_response}")
+                        break # Stop after finding the first text message
+            # If we found final text, package it as a 'response' action
+            if final_text_response:
+                actions = [{
+                    "type": "response",
+                    "text": final_text_response
+                }]
+                # Keep done = True
+            else:
+                logger.info("No computer calls and no final text message found.")
+                # Keep done = True, actions remains empty
         return actions, done

hud/env/docker_client.py CHANGED Viewed

@@ -215,7 +215,7 @@ class DockerClient(Client):
                 raise ValueError("Could not find package name in pyproject.toml")
             logger.info("Installing %s in /root/controller", self._package_name)
             result = await self.execute(
-                ["bash", "-c", "cd /root/controller && pip install -e ."],
+                ["bash", "-c", "cd /root/controller && pip install -e . --break-system-packages"],
                 timeout=60,
             )
             if result["stdout"]:

hud/env/environment.py CHANGED Viewed

@@ -10,14 +10,13 @@ from pydantic import BaseModel
 from hud.env.client import Client
 from hud.env.remote_client import RemoteClient
 from hud.task import Task
-from hud.utils import HudStyleConfigs, expand_config
-from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP, HudStyleConfig, create_remote_config
-if TYPE_CHECKING:
-    from hud.adapters.common import CLA
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
+from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
 logger = logging.getLogger("hud.environment")
+if TYPE_CHECKING:
+    from hud.adapters.common import CLA
 class Observation(BaseModel):
     """
@@ -46,6 +45,9 @@ class Environment(BaseModel):
     task: Task | None = None
     build_data: dict[str, Any]
+    # final response
+    final_response: str | None = None
     async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
         # Execute each config and collect results
         configs_all = [configs] if not isinstance(configs, list) else configs
@@ -76,7 +78,7 @@ class Environment(BaseModel):
             config: The configuration to use for the setup
         """
         if isinstance(self.client, RemoteClient):
-            await self._invoke_all(create_remote_config(self.task, config, REMOTE_SETUP))
+            await self._invoke_all(create_remote_config(self, config, REMOTE_SETUP))
         else:
             if config is not None:
                 await self._invoke_all(config)
@@ -97,7 +99,7 @@ class Environment(BaseModel):
         """
         if isinstance(self.client, RemoteClient):
             results = await self._invoke_all(
-                create_remote_config(self.task, config, REMOTE_EVALUATE))
+                create_remote_config(self, config, REMOTE_EVALUATE))
         else:
             if config is not None:
                 results = await self._invoke_all(config)
@@ -143,9 +145,14 @@ class Environment(BaseModel):
         """
         if actions is None or len(actions) == 0:
             actions = []
+        args = [[action.model_dump() for action in actions]]
+        # TODO: Move this into the server side
+        if self._maybe_store_response(actions):
+            return Observation(text=self.final_response), 0, False, {}
         result, stdout, stderr = await self.client.invoke(
-            HudStyleConfig(function="step", args=[[action.model_dump() for action in actions]])
+            HudStyleConfig(function="step", args=args)
         )
         if stdout:
             logger.info("Step produced stdout: %s", stdout.decode())
@@ -156,6 +163,21 @@ class Environment(BaseModel):
         observation = Observation.model_validate(result["observation"], strict=True)
         return observation, 0, False, {}
+    def _maybe_store_response(self, actions: list[CLA]) -> bool:
+        """Store the final response into the environment.
+        Args:
+            actions: The action(s) to check
+        Returns:
+            bool: True if the response was submitted, False otherwise
+        """
+        if len(actions) > 0 and actions[-1].type == "response":
+            self.final_response = actions[-1].text
+            return True
+        return False
     async def get_urls(self) -> dict[str, Any]:
         """Get URLs for the environment.
@@ -179,3 +201,154 @@ class Environment(BaseModel):
         This should release any resources and clean up the environment.
         """
         await self.client.close()
+def create_remote_config(
+    env: Environment | None = None,
+    config: HudStyleConfigs | None = None,
+    function: str | None = None,
+) -> list[HudStyleConfig]:
+    """
+    Create a remote configuration for setup or evaluate, determining the final
+    function call structure based on the provided task or explicit config.
+    This function orchestrates how setup and evaluate steps defined in a Task
+    or passed directly are prepared for remote execution via `env._invoke_all`.
+    Args:
+        env: Environment object, potentially containing a task definition.
+             Used to access `env.task` and `env.final_response`.
+        config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
+                Can be in various HudStyleConfigs formats.
+        function: The top-level function context, typically "setup" or "evaluate".
+    Returns:
+        list[HudStyleConfig]: A list containing a single HudStyleConfig object
+                              ready for remote invocation via `client.invoke`.
+                              The specific function/arguments are chosen based on this priority:
+                              1. Explicit `config` parameter (if provided).
+                              2. Specific `task` attribute (e.g., `task.evaluate`).
+                              3. General `task.config` dictionary.
+                              4. Default private function using `task.id`
+                              (e.g., `private_evaluate(task.id)`).
+                              5. Base `function` name with minimal/default arguments.
+    Logic & Examples (Assuming `function="evaluate"` for examples):
+        1) Explicit `config` provided: The `config` is expanded and becomes the `args`
+           for the top-level `function` call. If the environment has a final_response,
+           it's appended to these args.
+           - Example Input:
+             `env` (with `final_response="Paris"`)
+             `config=("contains_text", "Paris")`
+             `function="evaluate"`
+           - Example Output:
+             `[HudStyleConfig(function='evaluate', args=[
+                HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])
+             ])]`
+        2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
+           The Task's attribute value (e.g., `task.evaluate`) is expanded and becomes the `args`
+           for the top-level `function` call. Task ID is added if present. `final_response` is
+           appended if present.
+           - Example Input:
+             `env` (`task=Task(id="t1", evaluate=("check_answer",), ...)`, `final_response="42"`)
+             `config=None`
+             `function="evaluate"`
+           - Example Output:
+             `[HudStyleConfig(function='evaluate', args=[HudStyleConfig(function='check_answer',
+                args=['42'], id='t1')])]`
+        3) No explicit `config`, no specific Task attribute, Task has `task.config`:
+           The `task.config` dictionary becomes the single argument for the top-level
+           `function` call. Task ID is added to the config dict if present. `final_response` is
+           appended if present.
+           - Example Input:
+             `env` (with `task=Task(id="t2", config={"expected": "val"}, ...)`)
+             `config=None`
+             `function="evaluate"`
+           - Example Output:
+             `[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
+        4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
+           Calls a private function (`private_<function>`) on the remote end, passing
+           the `task.id` as the only argument.
+           - Example Input:
+             `env` (with `task=Task(id="t3", ...)`)
+             `config=None`
+             `function="evaluate"`
+           - Example Output:
+             `[HudStyleConfig(function='private_evaluate', args=['t3'])]`
+        5) No explicit `config` and no relevant Task info:
+           Calls the top-level `function` with empty args.
+           - Example Input:
+             `env` (with `task=Task(...)`)
+             `config=None`
+             `function="evaluate"`
+           - Example Output:
+             `[HudStyleConfig(function='evaluate', args=[])]`
+    """
+    # If no function provided, just expand the config and return it directly
+    if function is None:
+        if config:
+            return expand_config(config)
+        raise ValueError("Either function or config must be provided")
+    # Case 1: Explicit config provided
+    if config:
+        expanded_configs = expand_config(config)
+        if env and env.final_response:
+            # Ensure args is a list before appending
+            if not isinstance(expanded_configs[0].args, list):
+                 expanded_configs[0].args = [expanded_configs[0].args]
+            expanded_configs[0].args.append(env.final_response) # for remote responses
+        return [HudStyleConfig(function=function, args=expanded_configs)]
+    # Otherwise, use the environment's task
+    task = env.task if env else None
+    # Must have a task for the remaining cases
+    if task is None:
+        raise ValueError("Either task or config must be provided")
+    # Case 2: Task has the specified function attribute
+    task_config = getattr(task, function, None)
+    if task_config:
+        expanded_configs = expand_config(task_config)
+        if task.id:
+            expanded_configs[0].id = task.id # for remote IDs
+        elif env and env.final_response:
+            # Ensure args is a list before appending
+            if not isinstance(expanded_configs[0].args, list):
+                 expanded_configs[0].args = [expanded_configs[0].args]
+            expanded_configs[0].args.append(env.final_response) # for remote responses
+        return [HudStyleConfig(function=function, args=expanded_configs)]
+    # Case 3: Check for task.config
+    if hasattr(task, "config") and task.config:
+        # Ensure task.config is a dictionary before adding id
+        final_args = task.config.copy() if isinstance(task.config, dict) else {}
+        if task.id:
+            final_args["id"] = task.id # for remote IDs
+        if env and env.final_response:
+            # Append response, ensuring args exists and is a list
+            if "args" not in final_args:
+                final_args["args"] = []
+            if not isinstance(final_args["args"], list):
+                final_args["args"] = [final_args["args"]]
+            final_args["args"].append(env.final_response)
+        return [HudStyleConfig(function=function, args=[final_args])]
+    # Case 4: Use task.id
+    if task.id:
+        args_list = [task.id]
+        if env and env.final_response:
+             args_list.append(env.final_response) # Append final response
+        return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
+    # Case 5: No valid configuration found
+    args_list = []
+    if env and env.final_response:
+        args_list.append(env.final_response)
+    return [HudStyleConfig(function=function, args=args_list)]

hud/env/local_docker_client.py CHANGED Viewed

@@ -25,7 +25,9 @@ class LocalDockerClient(DockerClient):
     """
     @classmethod
-    async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[LocalDockerClient, dict[str, Any]]:
+    async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
+            LocalDockerClient, dict[str, Any]
+        ]:
         """
         Creates a Docker environment client from a dockerfile.

hud/task.py CHANGED Viewed

@@ -5,8 +5,7 @@ from typing import TYPE_CHECKING, Any
 from pydantic import BaseModel
 from hud.types import CustomGym, Gym
-from hud.utils import HudStyleConfig
-from hud.utils.config import HudStyleConfigs
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
 if TYPE_CHECKING:
     from inspect_ai.dataset import Sample
@@ -35,7 +34,7 @@ class Task(BaseModel):
     The setup and evaluate configurations can be in several formats:
     - String (function name): "chrome.maximize"
-    - String (function with args): "chrome.activate_tab 5"
+    - Tuple (function with args): ("chrome.activate_tab", 5)
     - Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
     - List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
@@ -68,15 +67,15 @@ class Task(BaseModel):
     @classmethod
     def from_inspect_sample(cls, sample: Sample) -> Task:
         """Create a Task from an Inspect dataset sample.
-        The task's sandbox is a local ubuntu container using the standard controller.
-        Files will be copied to the user directory
+        Automatically detects if a CustomGym (docker) or QA Gym is needed based on sample.sandbox.
+        Configures evaluation using 'response_includes' or 'match_all' based on sample.target.
         Args:
             sample: An Inspect dataset Sample object
         Returns:
             Task instance
         The Inspect Sample has these fields:
         - input (str | list[ChatMessage]): The input to be submitted to the model
         - choices (list[str] | None): Optional multiple choice answer list
@@ -87,10 +86,8 @@ class Task(BaseModel):
         - files (dict[str, str] | None): Optional files that go with the sample
         - setup (str | None): Optional setup script to run for sample
         """
-        # Extract the input as prompt
         prompt = sample.input
-        if isinstance(prompt, list):  # Handle ChatMessage format
-            # Convert chat message list to a string representation
+        if isinstance(prompt, list):
             prompt_parts = []
             for message in prompt:
                 role = message.role
@@ -98,36 +95,50 @@ class Task(BaseModel):
                 prompt_parts.append(f"{role.capitalize()}: {content}")
             prompt = "\n\n".join(prompt_parts)
-        # Map sandbox from Inspect to our envspec
+        evaluate_config = None
+        if sample.target:
+            if isinstance(sample.target, str):
+                evaluate_config = ("response_includes", [sample.target])
+            elif isinstance(sample.target, list):
+                evaluate_config = ("match_all", sample.target)
+        task_gym: Gym | None = None
+        task_setup: HudStyleConfigs | None = None
         sandbox = sample.sandbox
         dockerfile = None
+        use_qa_gym = True
         if sandbox:
             if isinstance(sandbox, str):
-                if sandbox != "docker":
-                    raise ValueError("docker is the only supported sandbox")
+                if sandbox == "docker":
+                    dockerfile = UBUNTU_DOCKERFILE
+                    use_qa_gym = False
             elif isinstance(sandbox, tuple) and len(sandbox) == 2:
                 sandbox_type, sandbox_config = sandbox
-                if sandbox_type != "docker":
-                    raise ValueError("docker is the only supported sandbox")
-                dockerfile = sandbox_config
-            else:
-                raise ValueError("Invalid sandbox configuration")
-        gym = CustomGym(
-            dockerfile=dockerfile or UBUNTU_DOCKERFILE,
-            location="local",
-        )
+                if sandbox_type == "docker":
+                    dockerfile = sandbox_config
+                    use_qa_gym = False
+        if use_qa_gym:
+            task_gym = "qa"
+            task_setup = None
+        else:
+            task_gym = CustomGym(
+                dockerfile=dockerfile or UBUNTU_DOCKERFILE,
+                location="local",
+            )
+            task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
+            # TODO: Handle sample.files for CustomGym case if needed
         return cls(
-            id=str(sample.id) if sample.id else None,
+            id=None,
             prompt=prompt,
-            setup=[x for x in convert_inspect_setup(sample.setup)] if sample.setup else [],
+            setup=task_setup,
             metadata=sample.metadata,
             choices=sample.choices,
-            target=sample.target,
-            gym=gym,
+            evaluate=evaluate_config,
+            gym=task_gym,
+            # files=sample.files, # TODO: Decide how/if to handle files
         )
-    def convert_sdk01(self) -> None:
-        self.setup = [HudStyleConfig(function="reset", args=[{"task_id": self.id}])]
-        self.evaluate = [HudStyleConfig(function="evaluate", args=[])]

hud/taskset.py CHANGED Viewed

@@ -9,6 +9,8 @@ from hud.settings import settings
 from hud.task import Task
 if TYPE_CHECKING:
+    from collections.abc import Iterator
     from inspect_ai.dataset import Dataset
@@ -49,6 +51,12 @@ class TaskSet(BaseModel):
         """
         return len(self.tasks)
+    def __iter__(self) -> Iterator[Task]:
+        """
+        Returns an iterator over the tasks in the taskset.
+        """
+        return iter(self.tasks)
 async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     """

hud/utils/common.py CHANGED Viewed

@@ -3,16 +3,43 @@ from __future__ import annotations
 import io
 import logging
 import tarfile
-from typing import TYPE_CHECKING, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
+from pydantic import BaseModel
 from hud.server.requests import make_request
 from hud.settings import settings
 if TYPE_CHECKING:
+    from collections.abc import Iterator
     from pathlib import Path
 logger = logging.getLogger("hud.utils.common")
+class HudStyleConfig(BaseModel):
+    function: str  # Format: "x.y.z"
+    args: list[Any] # Must be json serializable
+    id: str | None = None # Optional id for remote execution
+    def __len__(self) -> int:
+        return len(self.args)
+    def __getitem__(self, index: int) -> Any:
+        return self.args[index]
+    def __iter__(self) -> Iterator[Any]:
+        return iter(self.args)
+    def __str__(self) -> str:
+        return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
+# Type alias for the shorthand config, which just converts to function name and args
+ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
+# Type alias for multiple config formats
+HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
 class ExecuteResult(TypedDict):
     """
     Result of an execute command.

hud/utils/config.py CHANGED Viewed

@@ -2,14 +2,8 @@ from __future__ import annotations
 import logging
 import re
-from typing import TYPE_CHECKING, Any
-from pydantic import BaseModel
-if TYPE_CHECKING:
-    from collections.abc import Iterator
-    from hud.task import Task
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
 logger = logging.getLogger("hud.utils.config")
@@ -17,30 +11,6 @@ REMOTE_FUNCTION_PREFIX = "private_"
 REMOTE_SETUP = "setup"
 REMOTE_EVALUATE = "evaluate"
-class HudStyleConfig(BaseModel):
-    function: str  # Format: "x.y.z"
-    args: list[Any] # Must be json serializable
-    id: str | None = None # Optional id for remote execution
-    def __len__(self) -> int:
-        return len(self.args)
-    def __getitem__(self, index: int) -> Any:
-        return self.args[index]
-    def __iter__(self) -> Iterator[Any]:
-        return iter(self.args)
-    def __str__(self) -> str:
-        return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
-# Type alias for the shorthand config, which just converts to function name and args
-ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
-# Type alias for multiple config formats
-HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
 def _is_valid_python_name(name: str) -> bool:
     """Check if a string is a valid Python identifier."""
     return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
@@ -122,64 +92,3 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
     error_msg = f"Unknown configuration type: {type(config)}"
     logger.error(error_msg)
     raise ValueError(error_msg)
-def create_remote_config(
-    task: Task | None = None,
-    config: HudStyleConfigs | None = None,
-    function: str | None = None,
-) -> list[HudStyleConfig]:
-    """
-    Create a configuration based on provided inputs.
-    Args:
-        task: Task object with configuration
-        config: Direct configuration (expanded or not)
-        function: Function name to use
-    Returns:
-        list[HudStyleConfig]: List of standardized configurations
-    Logic:
-        1) If explicit config: expand and return HudStyleConfig with func of the function,
-        and args of expanded config
-        2) If task has the specified function defined: use that
-        3) If no task function: check for task._config and use that
-        4) If no _config: use task.id and create private_[function]
-    """
-    # If no function provided, just expand the config and return it directly
-    if function is None:
-        if config:
-            return expand_config(config)
-        raise ValueError("Either function or config must be provided")
-    # Case 1: Explicit config provided
-    if config:
-        expanded_configs = expand_config(config)
-        return [HudStyleConfig(function=function, args=expanded_configs)]
-    # Must have a task for the remaining cases
-    if task is None:
-        raise ValueError("Either task or config must be provided")
-    # Case 2: Task has the specified function attribute
-    task_config = getattr(task, function, None)
-    if task_config and len(task_config) > 0:
-        expanded_configs = expand_config(task_config)
-        if task.id:
-            expanded_configs[0].id = task.id # for remote IDs
-        return [HudStyleConfig(function=function, args=expanded_configs)]
-    # Case 3: Check for _config
-    if hasattr(task, "config") and task.config:
-        if task.id:
-            task.config["id"] = task.id # for remote IDs
-        return [HudStyleConfig(function=function, args=[task.config])]
-    # Case 4: Use task.id
-    if task.id:
-        return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=[task.id])]
-    # No valid configuration found
-    #logger.warning("No valid configuration found for function: %s", function)
-    return [HudStyleConfig(function=function, args=[])]

{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/METADATA RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.2.0
+Version: 0.2.1
 Summary: SDK for the HUD evaluation platform.
-Project-URL: Homepage, https://github.com/Human-Data/hud-sdk
-Project-URL: Bug Tracker, https://github.com/Human-Data/hud-sdk/issues
+Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
+Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
 Project-URL: Documentation, https://hud.so
 Author-email: Human Union Data SDK <founders@hud.so>
 License: MIT License
@@ -57,7 +57,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
 Requires-Dist: ruff==0.9.8; extra == 'dev'
 Description-Content-Type: text/markdown
-# HUD SDK - Human-Agent Interaction Toolkit
+# HUD
 A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
@@ -86,21 +86,20 @@ export HUD_API_KEY=your_api_key_here
 pip install hud-python
 ```
-### Simple Browser Example with Operator
+### Simple Browser Example with Claude Computer Use
 > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
+Make sure you have defined `ANTHROPIC_API_KEY` in your environment variables to run Claude.
 ```python
-import os
 import asyncio
 from hud import gym, job
 from hud.task import Task
-from hud.utils import stream
-from hud.agent import OperatorAgent
+from hud.agent import ClaudeAgent
 @job("test-run")
 async def main():
-    # Define a simple task
     task = Task(
         prompt="Insert the text 'capybara' into the search bar",
         gym="hud-browser",
@@ -108,26 +107,20 @@ async def main():
         evaluate=("contains_text", "capybara")
     )
-    # Create environment
+    # Create environment using the gym module
     env = await gym.make(task)
-    # Get URLs and display live view (optional)
-    # urls = await env.get_urls()
-    # stream(urls["live_url"])
     # Initialize Claude agent (API key is loaded automatically)
-    agent = OperatorAgent()
+    agent = ClaudeAgent()
-    # Agent loop
-    obs, _ = env.reset()
+    # Agent loop with predict and step functions
+    obs, _ = await env.reset() # Gets first observation
     for i in range(5):
         actions, done = await agent.predict(obs)
         if done:
             break
         obs, reward, terminated, info = await env.step(actions)
-        if terminated:
-            break
     # Evaluate and close
     result = await env.evaluate()
@@ -143,26 +136,26 @@ if __name__ == "__main__":
 Explore the core concepts and features of the SDK:
-*   **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
+*   **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
 *   **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
 *   **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
 *   **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
 *   **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
 *   **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
 *   **Advanced Topics**:
+    *   **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
     *   **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
     *   **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
-    *   **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
 *   **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
 ## [Examples](examples/)
-We provide several example notebooks showing how to use the HUD SDK:
+We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
 1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
 2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
-3. [OSWorld](examples/osworld.ipynb) - Working with OS environments
+3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
 4. [Local Development](examples/local.ipynb) - Setting up local custom environments
 ## Documentation
@@ -180,9 +173,9 @@ If you use this SDK in your research, please cite it as follows:
 ```bibtex
 @software{hud2025agentevalplatform,
   author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
-  title = {{HUD: An Evaluation Platform for Computer Use Agents}},
-  date = {2025-03},
-  url = {https://github.com/Human-Data/hud-sdk},
+  title = {{HUD: An Evaluation Platform for Agents}},
+  date = {2025-04},
+  url = {https://github.com/hud-evals/hud-sdk},
   langid = {en}
 }
 ```

{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/RECORD RENAMED Viewed

@@ -1,28 +1,28 @@
-hud/__init__.py,sha256=YX9zAqOSjAFZqHbDJGUVefOsxg7PhkH1ZDflRoiSgP8,464
+hud/__init__.py,sha256=HFL1iwPhLZd7z--2QADzipur68XlekwGrOzU2vWL-Vw,464
 hud/gym.py,sha256=cKjIuJS7A0vJx4K7fctpUjIEv8TkW5x6aB_PRrODrDY,3651
 hud/job.py,sha256=E4RN1CkppRQVy46RWCUDjNIyhMa7lNlFfCgpky2vKFk,5463
 hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
-hud/task.py,sha256=q1E_urMavnfsb87x2JHkRNMBzbkkaQI1skOulkpJ5DY,5132
-hud/taskset.py,sha256=fV4QgHf8tphDoMjTdBzkyCJT7pQBLEMoGu_Uxuji2DM,2226
+hud/task.py,sha256=aNbHMlO7r1cm5DcO0QLU1SZ7EawOFw9W6DZwTNy72-4,5383
+hud/taskset.py,sha256=xDPBXeDm4AlSOwl-MM98lN0x6PmGV8t9jv7sNyS_u0c,2426
 hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
 hud/types.py,sha256=fJZnzK3j3mq7G0gO5TbqRaN92qT4xAb4jUNOXIX8ZZ0,2395
 hud/adapters/__init__.py,sha256=0RNQgrzBCkhNBq1Q7JRESN1WfUVLs_99fR5g1re3APs,207
 hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
-hud/adapters/claude/adapter.py,sha256=sgdgkCtNFjFPSSmfsUD1vx0Xz9xhG81A_it4BvRsOXE,5781
+hud/adapters/claude/adapter.py,sha256=x0qQglWsg7n8DJ_NacsymlUQBnkpqNVguUlkQRpYX-A,5955
 hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
 hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
-hud/adapters/common/types.py,sha256=ubnWlm4JMtCkTNonKZGb425p6oi8jZyIVcekp-pjTXQ,4905
+hud/adapters/common/types.py,sha256=APxGEmoePwjF7OYXAKqBTVT73PJTFV0eBmbURbaT5xk,5091
 hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
-hud/adapters/operator/adapter.py,sha256=j2bBe_bwOhdbd7Qr6UvWUEkTkUTOA-ADvWYx0B1c_TU,3159
+hud/adapters/operator/adapter.py,sha256=svHgjCdUeMyfgfGzRO3ItGWTKGkm3tmldO2zfjX_sGI,3301
 hud/agent/__init__.py,sha256=cI3bqfmG2_Lwzn2RjrxV0X9qIxCRDiffwd1UaWToct4,238
 hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
-hud/agent/claude.py,sha256=ZPoged_sun2CmPgludfkV4uv-gjak_yyIlGgCIRcWx0,6583
-hud/agent/operator.py,sha256=zJaYW5kJ7rgvRQCufrjsoNCPn2Ra9EakmFFwut_v7Hk,7335
+hud/agent/claude.py,sha256=tbDKAzGCLJPnUnHc8eV-zZmj3ZG6QQx0ukWKoO4Ekec,7445
+hud/agent/operator.py,sha256=44t19TzcCrS1N3-rnD25ZLXx5s4Io8On27LomALuugs,8185
 hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
 hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
-hud/env/docker_client.py,sha256=4G3OeFBCbIqg9zOXxreDekNvLNMhgtc2cMAjMbqB6Tk,10394
-hud/env/environment.py,sha256=h-Z7I_1Y8vXBL1oOYbC5xRIKwl28NZt0PJ4GmKcd0AM,5863
-hud/env/local_docker_client.py,sha256=9p2IHeSRmk9_lU7FRiHaCMWn0CjbtWLQjsT3x8x6qxY,7767
+hud/env/docker_client.py,sha256=56_u3Ri4NulGcBumAg-7-KilmFmBKthOwEIM5bOLOZc,10418
+hud/env/environment.py,sha256=Xyq4KQO9aWYPwZ0uESAetB5EEZgmlEnZVc7sA0DLz2c,13706
+hud/env/local_docker_client.py,sha256=TCD9z1qjafxjwAWLatAL8d587_ioMDHjs8T5cBgusr8,7789
 hud/env/remote_client.py,sha256=iJiwueuf98xOx0_Y2ltu_63BwKIKNvohhim73Goq74E,5804
 hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
 hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
@@ -34,11 +34,11 @@ hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
 hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
 hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
 hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
-hud/utils/common.py,sha256=qTAgiqQqplfrCrll06SAYYr9TyT8gnV4mwDSxsj-W1s,1842
-hud/utils/config.py,sha256=x3F9Rg2lTGEG8_FcnEyymh4Y02qD1UWmcDlOSA1Xq0U,6476
+hud/utils/common.py,sha256=XJZ-hKJkeaNmelG2QD5ybi9FpZQS1ErA40fAYzUSHVE,2742
+hud/utils/config.py,sha256=ePi3GDo8mDUnOZ5G5HyMprqGRvxrxCMfixGNuTOA8rQ,3266
 hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
 hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud_python-0.2.0.dist-info/METADATA,sha256=GbG7OHnQ8WqR3iXT6utC26PkCmgPKrOePTdCNZxuwK4,7222
-hud_python-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hud_python-0.2.0.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
-hud_python-0.2.0.dist-info/RECORD,,
+hud_python-0.2.1.dist-info/METADATA,sha256=f2lyqGmu9L7_zgCOqrhZ6ZX1JUU6Z0e92bRTfmojSqQ,7219
+hud_python-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hud_python-0.2.1.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
+hud_python-0.2.1.dist-info/RECORD,,

{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

hud-python 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.0py3-none-any.whl → 0.2.1py3-none-any.whl