PyPI - hud-python - Versions diffs - 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (58) hide show

hud/__init__.py +4 -3
hud/adapters/claude/adapter.py +5 -14
hud/adapters/common/adapter.py +3 -3
hud/adapters/common/tests/__init__.py +0 -0
hud/adapters/common/tests/test_adapter.py +277 -0
hud/adapters/common/types.py +3 -3
hud/adapters/operator/adapter.py +16 -23
hud/agent/__init__.py +8 -1
hud/agent/base.py +28 -28
hud/agent/claude.py +69 -60
hud/agent/langchain.py +32 -26
hud/agent/operator.py +75 -67
hud/env/__init__.py +5 -5
hud/env/client.py +2 -2
hud/env/docker_client.py +37 -39
hud/env/environment.py +91 -66
hud/env/local_docker_client.py +5 -7
hud/env/remote_client.py +39 -32
hud/env/remote_docker_client.py +13 -3
hud/evaluators/__init__.py +2 -3
hud/evaluators/base.py +4 -3
hud/evaluators/inspect.py +3 -8
hud/evaluators/judge.py +34 -58
hud/evaluators/match.py +42 -49
hud/evaluators/remote.py +13 -26
hud/evaluators/tests/__init__.py +0 -0
hud/evaluators/tests/test_inspect.py +12 -0
hud/evaluators/tests/test_judge.py +231 -0
hud/evaluators/tests/test_match.py +115 -0
hud/evaluators/tests/test_remote.py +98 -0
hud/exceptions.py +167 -0
hud/gym.py +9 -7
hud/job.py +179 -109
hud/server/__init__.py +2 -2
hud/server/requests.py +148 -186
hud/server/tests/__init__.py +0 -0
hud/server/tests/test_requests.py +275 -0
hud/settings.py +3 -2
hud/task.py +9 -19
hud/taskset.py +44 -11
hud/trajectory.py +6 -9
hud/types.py +12 -9
hud/utils/__init__.py +2 -2
hud/utils/common.py +36 -15
hud/utils/config.py +45 -30
hud/utils/progress.py +34 -21
hud/utils/telemetry.py +10 -11
hud/utils/tests/__init__.py +0 -0
hud/utils/tests/test_common.py +52 -0
hud/utils/tests/test_config.py +129 -0
hud/utils/tests/test_progress.py +225 -0
hud/utils/tests/test_telemetry.py +37 -0
hud/utils/tests/test_version.py +8 -0
{hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
hud_python-0.2.4.dist-info/RECORD +62 -0
hud_python-0.2.2.dist-info/RECORD +0 -46
{hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
{hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0

hud/env/__init__.py CHANGED Viewed

@@ -3,9 +3,9 @@ from __future__ import annotations
 from . import docker_client, environment, local_docker_client, remote_client, remote_docker_client
 __all__ = [
-     "docker_client",
-     "environment",
-     "local_docker_client",
-     "remote_client",
-     "remote_docker_client",
+    "docker_client",
+    "environment",
+    "local_docker_client",
+    "remote_client",
+    "remote_docker_client",
 ]

hud/env/client.py CHANGED Viewed

@@ -7,7 +7,7 @@ from pydantic import BaseModel
 if TYPE_CHECKING:
     from hud.types import EnvironmentStatus
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig
 class Client(BaseModel, ABC):
@@ -16,7 +16,7 @@ class Client(BaseModel, ABC):
     """
     @abstractmethod
-    async def invoke(self, config: HudStyleConfig) -> Any:
+    async def invoke(self, config: FunctionConfig) -> Any:
         """
         Invoke the environment with the given config.
         """

hud/env/docker_client.py CHANGED Viewed

@@ -16,7 +16,7 @@ from hud.utils.common import directory_to_tar_bytes
 if TYPE_CHECKING:
     from hud.utils import ExecuteResult
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig
 logger = logging.getLogger("hud.env.docker_client")
@@ -33,7 +33,7 @@ class InvokeError(Exception):
     """
-def invoke_template(config: HudStyleConfig, package_name: str, divider: str) -> str:
+def invoke_template(config: FunctionConfig, package_name: str, divider: str) -> str:
     """
     Return a python script to run the given config.
     """
@@ -51,16 +51,17 @@ print("{divider}")
 print(result_str)
 """
 class DockerClient(Client):
     """
     Base class for environment clients.
     Handles updating the environment when local files change.
     """
     _last_pyproject_toml_str: str | None = None
     _last_update_time: int = 0
-    _last_file_mtimes: dict[str, float] = {} # noqa: RUF012
+    _last_file_mtimes: dict[str, float] = {}  # noqa: RUF012 - Not recognized as Pydantic model
     _source_path: Path | None = None
     _package_name: str | None = None
@@ -68,47 +69,46 @@ class DockerClient(Client):
     def source_path(self) -> Path | None:
         """Get the source path."""
         return self._source_path
     @property
     def package_name(self) -> str:
         """Get the package name."""
         if not self._package_name:
             raise ValueError("Package name not set")
         return self._package_name
     def set_source_path(self, source_path: Path) -> None:
         """
         Set the source path for this environment controller.
         Can only be set once, and cannot be set if source_path is already set.
         Args:
             source_path: Path to the source code to use in the environment
         Raises:
             ValueError: If source_path has already been set
         """
         if self._source_path:
             raise ValueError("Source path has already been set")
         # Validate source path
         if not source_path.exists():
             raise FileNotFoundError(f"Source path {source_path} does not exist")
         if not source_path.is_dir():
             raise NotADirectoryError(f"Source path {source_path} is not a directory")
         # Parse pyproject.toml to get package name
         pyproject_path = source_path / "pyproject.toml"
         if not pyproject_path.exists():
             raise FileNotFoundError(f"pyproject.toml not found in {source_path}")
         pyproject_data = toml.load(pyproject_path)
         self._package_name = pyproject_data.get("project", {}).get("name")
         if not self._package_name:
             raise ValueError("Could not find package name in pyproject.toml")
         self._source_path = source_path
     @classmethod
     @abc.abstractmethod
     async def create(cls, dockerfile: str) -> DockerClient:
@@ -121,26 +121,26 @@ class DockerClient(Client):
         Returns:
             EnvClient: An instance of the environment client
         """
     @abc.abstractmethod
     async def get_status(self) -> EnvironmentStatus:
         """
         Get the current status of the environment.
         Returns:
             EnvironmentStatus: A status enum indicating the current state of the environment
         """
     def _get_all_file_mtimes(self) -> dict[str, float]:
         """
         Get modification times for all files in the source path.
         Returns:
             Dict[str, float]: Dictionary mapping file paths to modification times
         """
         if not self._source_path:
             return {}
         file_mtimes = {}
         for root, _, files in os.walk(self._source_path):
             for file in files:
@@ -151,12 +151,12 @@ class DockerClient(Client):
                     # Skip files that can't be accessed
                     continue
         return file_mtimes
     async def needs_update(self) -> bool:
         """
         Check if the environment needs an update by:
         1. Checking if any file has been modified since the last update
         Returns:
             bool: True if the environment needs an update, False otherwise.
         """
@@ -166,18 +166,18 @@ class DockerClient(Client):
         # Check if any file has been modified since the last update
         current_mtimes = self._get_all_file_mtimes()
         # If we don't have previous modification times, we need an update
         if not self._last_file_mtimes:
             return True
         # Check for new or modified files
         for file_path, mtime in current_mtimes.items():
             if file_path not in self._last_file_mtimes or mtime > self._last_file_mtimes[file_path]:
                 return True
         return False
     async def update(self) -> None:
         """
         Base update method for environment controllers.
@@ -186,22 +186,22 @@ class DockerClient(Client):
         # If no source path, nothing to update
         if not self._source_path:
             return
         logger.info("Updating environment")
         # Save current file modification times
         self._last_file_mtimes = self._get_all_file_mtimes()
         # Create tar archive of the source code and send it to the container
         tar_bytes = directory_to_tar_bytes(self._source_path)
         await self.execute(["mkdir", "-p", "/root/controller"], timeout=5)
         await self.put_archive("/root/controller", tar_bytes)
         # Check if pyproject.toml exists and parse it
         pyproject_path = self._source_path / "pyproject.toml"
         if not pyproject_path.exists():
             raise FileNotFoundError(f"pyproject.toml not found in {self._source_path}")
         # Read and parse the current content of pyproject.toml
         current_pyproject_content = pyproject_path.read_text()
         if (
@@ -224,8 +224,7 @@ class DockerClient(Client):
                 logger.warning("STDERR:\n%s", result["stderr"])
             # Save current pyproject.toml content
             self._last_pyproject_toml_str = current_pyproject_content
     @abc.abstractmethod
     async def execute(
         self,
@@ -235,20 +234,20 @@ class DockerClient(Client):
     ) -> ExecuteResult:
         """
         Execute a command in the environment. May not be supported by all environments.
         Args:
             command: The command to execute
             workdir: The working directory to execute the command in
             timeout: The timeout for the command
         Returns:
             ExecuteResult: The result of the command
         """
-    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
+    async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
         """
         Invoke a function in the environment. Supported by all environments.
         Args:
             config: The configuration to invoke
@@ -289,11 +288,11 @@ class DockerClient(Client):
         May not be supported by all environments. (notably browser environments)
         Args:
             path: The path to get the archive of
         Returns:
             bytes: The archive of the path
         """
     @abc.abstractmethod
     async def put_archive(self, path: str, data: bytes) -> bool:
         """
@@ -303,4 +302,3 @@ class DockerClient(Client):
             path: The path to put the archive at
             data: The data to put in the archive
         """

hud/env/environment.py CHANGED Viewed

@@ -10,25 +10,21 @@ from pydantic import BaseModel
 from hud.env.client import Client
 from hud.env.remote_client import RemoteClient
 from hud.task import Task
-from hud.utils.common import HudStyleConfig, HudStyleConfigs
-from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
+from hud.utils.common import FunctionConfig, FunctionConfigs, Observation
+from hud.utils.config import (
+    LOCAL_EVALUATORS,
+    REMOTE_EVALUATE,
+    REMOTE_FUNCTION_PREFIX,
+    REMOTE_SETUP,
+    expand_config,
+)
+from hud.utils.telemetry import stream
 logger = logging.getLogger("hud.environment")
 if TYPE_CHECKING:
     from hud.adapters.common import CLA
-class Observation(BaseModel):
-    """
-    Observation from the environment.
-    Attributes:
-        screenshot: Base64 encoded PNG string of the screen
-        text: Text observation, if available
-    """
-    screenshot: str | None = None  # base64 string png
-    text: str | None = None
+    from hud.agent import Agent
 class Environment(BaseModel):
@@ -48,7 +44,7 @@ class Environment(BaseModel):
     # final response
     final_response: str | None = None
-    async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
+    async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]:
         # Execute each config and collect results
         configs_all = [configs] if not isinstance(configs, list) else configs
         results = []
@@ -69,8 +65,8 @@ class Environment(BaseModel):
                         stderr.decode(),
                     )
         return results
-    async def _setup(self, config: HudStyleConfigs | None = None) -> None:
+    async def _setup(self, config: FunctionConfigs | None = None) -> None:
         """
         Setup the environment.
@@ -87,7 +83,7 @@ class Environment(BaseModel):
             else:
                 raise ValueError("No config or task provided for local environment")
-    async def evaluate(self, config: HudStyleConfigs | None = None) -> Any:
+    async def evaluate(self, config: FunctionConfigs | None = None) -> Any:
         """
         Evaluate the environment.
@@ -98,8 +94,7 @@ class Environment(BaseModel):
             Any: Result of the evaluation
         """
         if isinstance(self.client, RemoteClient):
-            results = await self._invoke_all(
-                create_remote_config(self, config, REMOTE_EVALUATE))
+            results = await self._invoke_all(create_remote_config(self, config, REMOTE_EVALUATE))
         else:
             if config is not None:
                 results = await self._invoke_all(config)
@@ -111,11 +106,10 @@ class Environment(BaseModel):
             return results[0]
         else:
             return results
-    async def reset(self, configs: HudStyleConfigs | None = None) -> tuple[
-        Observation, dict[str, Any]
-    ]:
+    async def reset(
+        self, configs: FunctionConfigs | None = None
+    ) -> tuple[Observation, dict[str, Any]]:
         """
         Reset the environment.
@@ -126,15 +120,15 @@ class Environment(BaseModel):
             Observation: The first observation from the environment
             info: Dictionary of information about the environment
         """
-        #await self._setup(configs)
+        # await self._setup(configs)
         obs, _, _, info = await self.step()
         if self.task and self.task.prompt:
             obs.text = self.task.prompt
         return obs, info
-    async def step(self, actions: list[CLA] | None = None) -> tuple[
-        Observation, float, bool, dict[str, Any]
-    ]:
+    async def step(
+        self, actions: CLA | list[CLA] | None = None
+    ) -> tuple[Observation, float, bool, dict[str, Any]]:
         """Execute a step in the environment.
         Args:
@@ -143,6 +137,8 @@ class Environment(BaseModel):
         Returns:
             Any: Result of the step execution
         """
+        if not isinstance(actions, list) and actions is not None:
+            actions = [actions]
         if actions is None or len(actions) == 0:
             actions = []
         args = [[action.model_dump() for action in actions]]
@@ -150,20 +146,19 @@ class Environment(BaseModel):
         # TODO: Move this into the server side
         if self._maybe_store_response(actions):
             return Observation(text=self.final_response), 0, False, {}
         result, stdout, stderr = await self.client.invoke(
-            HudStyleConfig(function="step", args=args)
+            FunctionConfig(function="step", args=args)
         )
         if stdout:
             logger.info("Step produced stdout: %s", stdout.decode())
         if stderr:
             logger.warning("Step produced stderr: %s", stderr.decode())
         observation = Observation.model_validate(result["observation"], strict=True)
         return observation, 0, False, {}
     def _maybe_store_response(self, actions: list[CLA]) -> bool:
         """Store the final response into the environment.
@@ -178,14 +173,13 @@ class Environment(BaseModel):
             return True
         return False
     async def get_urls(self) -> dict[str, Any]:
         """Get URLs for the environment.
         Returns:
             dict: Dictionary of URLs for accessing the environment
         """
-        data, _, _ = await self.client.invoke(HudStyleConfig(function="get_urls", args=[]))
+        data, _, _ = await self.client.invoke(FunctionConfig(function="get_urls", args=[]))
         self.url = data.get("url")
         self.live_url = data.get("live_url")
@@ -202,11 +196,43 @@ class Environment(BaseModel):
         """
         await self.client.close()
+    async def stream(self) -> str | None:
+        urls = await self.get_urls()
+        if urls["live_url"] is None:
+            logger.warning("No live URL found")
+            return None
+        # Stream the live view
+        return stream(urls["live_url"])
+    async def run(self, agent: Agent, max_steps: int = 27, verbose: bool = True) -> Any:
+        """Run an agent in the environment.
+        Args:
+            agent: The agent to run
+        """
+        if verbose:
+            logger.info("[HUD] Running agent in environment...")
+        obs, _ = await self.reset()
+        for i in range(max_steps):
+            action, done = await agent.predict(obs)
+            if verbose:
+                logger.info("[HUD] Step %d: Action: %s", i, action)
+            obs, reward, terminated, info = await self.step(action)
+            if verbose:
+                logger.info("[HUD] Step %d: Observation: %s", i, obs)
+            if done or terminated:
+                break
+        result = await self.evaluate()
+        if verbose:
+            logger.info("[HUD] Evaluation result: %s", result)
+        return result
 def create_remote_config(
     env: Environment | None = None,
-    config: HudStyleConfigs | None = None,
+    config: FunctionConfigs | None = None,
     function: str | None = None,
-) -> list[HudStyleConfig]:
+) -> list[FunctionConfig]:
     """
     Create a remote configuration for setup or evaluate, determining the final
     function call structure based on the provided task or explicit config.
@@ -218,11 +244,11 @@ def create_remote_config(
         env: Environment object, potentially containing a task definition.
              Used to access `env.task` and `env.final_response`.
         config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
-                Can be in various HudStyleConfigs formats.
+                Can be in various FunctionConfigs formats.
         function: The top-level function context, typically "setup" or "evaluate".
     Returns:
-        list[HudStyleConfig]: A list containing a single HudStyleConfig object
+        list[FunctionConfig]: A list containing a single FunctionConfig object
                               ready for remote invocation via `client.invoke`.
                               The specific function/arguments are chosen based on this priority:
                               1. Explicit `config` parameter (if provided).
@@ -242,8 +268,8 @@ def create_remote_config(
              `config=("contains_text", "Paris")`
              `function="evaluate"`
            - Example Output:
-             `[HudStyleConfig(function='evaluate', args=[
-                HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])
+             `[FunctionConfig(function='evaluate', args=[
+                FunctionConfig(function='contains_text', args=['Paris', 'Paris'])
              ])]`
         2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
@@ -255,7 +281,7 @@ def create_remote_config(
              `config=None`
              `function="evaluate"`
            - Example Output:
-             `[HudStyleConfig(function='evaluate', args=[HudStyleConfig(function='check_answer',
+             `[FunctionConfig(function='evaluate', args=[FunctionConfig(function='check_answer',
                 args=['42'], id='t1')])]`
         3) No explicit `config`, no specific Task attribute, Task has `task.config`:
@@ -267,7 +293,7 @@ def create_remote_config(
              `config=None`
              `function="evaluate"`
            - Example Output:
-             `[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
+             `[FunctionConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
         4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
            Calls a private function (`private_<function>`) on the remote end, passing
@@ -277,7 +303,7 @@ def create_remote_config(
              `config=None`
              `function="evaluate"`
            - Example Output:
-             `[HudStyleConfig(function='private_evaluate', args=['t3'])]`
+             `[FunctionConfig(function='private_evaluate', args=['t3'])]`
         5) No explicit `config` and no relevant Task info:
            Calls the top-level `function` with empty args.
@@ -286,50 +312,50 @@ def create_remote_config(
              `config=None`
              `function="evaluate"`
            - Example Output:
-             `[HudStyleConfig(function='evaluate', args=[])]`
+             `[FunctionConfig(function='evaluate', args=[])]`
     """
     # If no function provided, just expand the config and return it directly
     if function is None:
         if config:
             return expand_config(config)
         raise ValueError("Either function or config must be provided")
     # Case 1: Explicit config provided
     if config:
         expanded_configs = expand_config(config)
-        if env and env.final_response:
+        if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
             # Ensure args is a list before appending
             if not isinstance(expanded_configs[0].args, list):
-                 expanded_configs[0].args = [expanded_configs[0].args]
-            expanded_configs[0].args.append(env.final_response) # for remote responses
-        return [HudStyleConfig(function=function, args=expanded_configs)]
+                expanded_configs[0].args = [expanded_configs[0].args]
+            expanded_configs[0].args.append(env.final_response)  # for remote responses
+        return [FunctionConfig(function=function, args=expanded_configs)]
     # Otherwise, use the environment's task
     task = env.task if env else None
     # Must have a task for the remaining cases
     if task is None:
         raise ValueError("Either task or config must be provided")
     # Case 2: Task has the specified function attribute
     task_config = getattr(task, function, None)
     if task_config:
         expanded_configs = expand_config(task_config)
         if task.id:
-            expanded_configs[0].id = task.id # for remote IDs
-        elif env and env.final_response:
+            expanded_configs[0].id = task.id  # for remote IDs
+        if env and env.final_response and expanded_configs[0].function in LOCAL_EVALUATORS:
             # Ensure args is a list before appending
             if not isinstance(expanded_configs[0].args, list):
-                 expanded_configs[0].args = [expanded_configs[0].args]
-            expanded_configs[0].args.append(env.final_response) # for remote responses
-        return [HudStyleConfig(function=function, args=expanded_configs)]
+                expanded_configs[0].args = [expanded_configs[0].args]
+            expanded_configs[0].args.append(env.final_response)  # for remote responses
+        return [FunctionConfig(function=function, args=expanded_configs)]
     # Case 3: Check for task.config
     if hasattr(task, "config") and task.config:
         # Ensure task.config is a dictionary before adding id
         final_args = task.config.copy() if isinstance(task.config, dict) else {}
         if task.id:
-            final_args["id"] = task.id # for remote IDs
+            final_args["id"] = task.id  # for remote IDs
         if env and env.final_response:
             # Append response, ensuring args exists and is a list
             if "args" not in final_args:
@@ -337,18 +363,17 @@ def create_remote_config(
             if not isinstance(final_args["args"], list):
                 final_args["args"] = [final_args["args"]]
             final_args["args"].append(env.final_response)
-        return [HudStyleConfig(function=function, args=[final_args])]
+        return [FunctionConfig(function=function, args=[final_args])]
     # Case 4: Use task.id
     if task.id:
         args_list = [task.id]
         if env and env.final_response:
-             args_list.append(env.final_response) # Append final response
-        return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
+            args_list.append(env.final_response)  # Append final response
+        return [FunctionConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
     # Case 5: No valid configuration found
     args_list = []
     if env and env.final_response:
         args_list.append(env.final_response)
-    return [HudStyleConfig(function=function, args=args_list)]
+    return [FunctionConfig(function=function, args=args_list)]

hud/env/local_docker_client.py CHANGED Viewed

@@ -19,15 +19,16 @@ if TYPE_CHECKING:
 logger = logging.getLogger("hud.env.docker_env_client")
 class LocalDockerClient(DockerClient):
     """
     Docker-based environment client implementation.
     """
     @classmethod
-    async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
-            LocalDockerClient, dict[str, Any]
-        ]:
+    async def create(
+        cls, dockerfile: str, ports: list[int] | None = None
+    ) -> tuple[LocalDockerClient, dict[str, Any]]:
         """
         Creates a Docker environment client from a dockerfile.
@@ -86,9 +87,7 @@ class LocalDockerClient(DockerClient):
             "HostConfig": {
                 "PublishAllPorts": True,
             },
-            "ExposedPorts": {
-                f"{port}/tcp": {} for port in ports
-            },
+            "ExposedPorts": {f"{port}/tcp": {} for port in ports},
         }
         container = await docker_client.containers.create(config=container_config)
@@ -198,7 +197,6 @@ class LocalDockerClient(DockerClient):
             exit_code=0,
         )
     async def get_archive(self, path: str) -> bytes:
         """
         Get an archive of a path from the container.

hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl