PyPI - hud-python - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (59) hide show

hud/__init__.py +5 -3
hud/adapters/__init__.py +2 -1
hud/adapters/claude/adapter.py +13 -17
hud/adapters/common/adapter.py +3 -3
hud/adapters/common/tests/__init__.py +0 -0
hud/adapters/common/tests/test_adapter.py +277 -0
hud/adapters/common/types.py +3 -6
hud/adapters/operator/adapter.py +22 -29
hud/agent/__init__.py +9 -1
hud/agent/base.py +28 -28
hud/agent/claude.py +69 -60
hud/agent/langchain.py +204 -0
hud/agent/operator.py +75 -67
hud/env/__init__.py +5 -5
hud/env/client.py +2 -2
hud/env/docker_client.py +37 -39
hud/env/environment.py +91 -66
hud/env/local_docker_client.py +5 -7
hud/env/remote_client.py +40 -29
hud/env/remote_docker_client.py +13 -3
hud/evaluators/__init__.py +2 -3
hud/evaluators/base.py +4 -3
hud/evaluators/inspect.py +3 -8
hud/evaluators/judge.py +34 -58
hud/evaluators/match.py +42 -49
hud/evaluators/remote.py +13 -26
hud/evaluators/tests/__init__.py +0 -0
hud/evaluators/tests/test_inspect.py +12 -0
hud/evaluators/tests/test_judge.py +231 -0
hud/evaluators/tests/test_match.py +115 -0
hud/evaluators/tests/test_remote.py +98 -0
hud/exceptions.py +167 -0
hud/gym.py +12 -10
hud/job.py +525 -47
hud/server/__init__.py +2 -2
hud/server/requests.py +148 -186
hud/server/tests/__init__.py +0 -0
hud/server/tests/test_requests.py +275 -0
hud/settings.py +3 -2
hud/task.py +12 -22
hud/taskset.py +44 -11
hud/trajectory.py +6 -9
hud/types.py +14 -9
hud/utils/__init__.py +2 -2
hud/utils/common.py +37 -13
hud/utils/config.py +44 -29
hud/utils/progress.py +149 -0
hud/utils/telemetry.py +10 -11
hud/utils/tests/__init__.py +0 -0
hud/utils/tests/test_common.py +52 -0
hud/utils/tests/test_config.py +129 -0
hud/utils/tests/test_progress.py +225 -0
hud/utils/tests/test_telemetry.py +37 -0
hud/utils/tests/test_version.py +8 -0
{hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
hud_python-0.2.3.dist-info/RECORD +62 -0
hud_python-0.2.1.dist-info/RECORD +0 -44
{hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
{hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0

hud/settings.py CHANGED Viewed

@@ -25,19 +25,20 @@ class Settings(BaseSettings):
         description="API key for authentication with the HUD API",
         validation_alias="HUD_API_KEY",
     )
     anthropic_api_key: str | None = Field(
         default=None,
         description="API key for Anthropic models",
         validation_alias="ANTHROPIC_API_KEY",
     )
     openai_api_key: str | None = Field(
         default=None,
         description="API key for OpenAI models",
         validation_alias="OPENAI_API_KEY",
     )
 # Create a singleton instance
 settings = Settings()

hud/task.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any
 from pydantic import BaseModel
 from hud.types import CustomGym, Gym
-from hud.utils.common import HudStyleConfig, HudStyleConfigs
+from hud.utils.common import FunctionConfig, FunctionConfigs
 if TYPE_CHECKING:
     from inspect_ai.dataset import Sample
@@ -17,12 +17,12 @@ if TYPE_CHECKING:
 UBUNTU_DOCKERFILE = "ubuntu:latest"
-def convert_inspect_setup(setup: str) -> list[HudStyleConfig]:
+def convert_inspect_setup(setup: str) -> list[FunctionConfig]:
     """
     Inspect setup is a single bash string to run in the environment.
-    We convert this into a single HudStyleConfig using the exec command
+    We convert this into a single FunctionConfig using the exec command
     """
-    return [HudStyleConfig(function="bash", args=[setup])]
+    return [FunctionConfig(function="bash", args=[setup])]
 class Task(BaseModel):
@@ -52,16 +52,9 @@ class Task(BaseModel):
     id: str | None = None
     prompt: str
-    setup: HudStyleConfigs | None = None
-    evaluate: HudStyleConfigs | None = None
+    setup: FunctionConfigs | None = None
+    evaluate: FunctionConfigs | None = None
     gym: Gym | None = None
-    target: str | list[str] | None = None
-    choices: list[str] | None = None
-    files: dict[str, str] | None = None
-    metadata: dict[str, Any] | None = None
     config: dict[str, Any] | None = None
     @classmethod
@@ -75,7 +68,7 @@ class Task(BaseModel):
         Returns:
             Task instance
         The Inspect Sample has these fields:
         - input (str | list[ChatMessage]): The input to be submitted to the model
         - choices (list[str] | None): Optional multiple choice answer list
@@ -103,8 +96,8 @@ class Task(BaseModel):
                 evaluate_config = ("match_all", sample.target)
         task_gym: Gym | None = None
-        task_setup: HudStyleConfigs | None = None
+        task_setup: FunctionConfigs | None = None
         sandbox = sample.sandbox
         dockerfile = None
         use_qa_gym = True
@@ -112,7 +105,7 @@ class Task(BaseModel):
         if sandbox:
             if isinstance(sandbox, str):
                 if sandbox == "docker":
-                    dockerfile = UBUNTU_DOCKERFILE
+                    dockerfile = UBUNTU_DOCKERFILE
                     use_qa_gym = False
             elif isinstance(sandbox, tuple) and len(sandbox) == 2:
                 sandbox_type, sandbox_config = sandbox
@@ -122,7 +115,7 @@ class Task(BaseModel):
         if use_qa_gym:
             task_gym = "qa"
-            task_setup = None
+            task_setup = None
         else:
             task_gym = CustomGym(
                 dockerfile=dockerfile or UBUNTU_DOCKERFILE,
@@ -131,14 +124,11 @@ class Task(BaseModel):
             task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
             # TODO: Handle sample.files for CustomGym case if needed
         return cls(
             id=None,
             prompt=prompt,
             setup=task_setup,
-            metadata=sample.metadata,
-            choices=sample.choices,
-            evaluate=evaluate_config,
+            evaluate=evaluate_config,
             gym=task_gym,
             # files=sample.files, # TODO: Decide how/if to handle files
         )

hud/taskset.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
+from venv import logger
 from pydantic import BaseModel
@@ -23,10 +24,11 @@ class TaskSet(BaseModel):
         description: Description of the taskset
         tasks: List of Task objects in the taskset
     """
     id: str | None = None
     description: str | None = None
     tasks: list[Task] = []
     def __getitem__(self, index: int) -> Task:
         """
         Allows accessing tasks by index using square bracket notation.
@@ -41,7 +43,7 @@ class TaskSet(BaseModel):
             IndexError: If the index is out of range
         """
         return self.tasks[index]
     def __len__(self) -> int:
         """
         Returns the number of tasks in the taskset.
@@ -50,14 +52,40 @@ class TaskSet(BaseModel):
             int: The number of tasks in the taskset
         """
         return len(self.tasks)
     def __iter__(self) -> Iterator[Task]:
         """
         Returns an iterator over the tasks in the taskset.
         """
         return iter(self.tasks)
+    async def upload(
+        self,
+        name: str,
+        description: str | None = None,
+        api_key: str | None = None,
+    ) -> None:
+        """
+        Uploads the taskset to the server.
+        """
+        if api_key is None:
+            api_key = settings.api_key
+        await make_request(
+            method="POST",
+            url=f"{settings.base_url}/v2/tasksets",
+            api_key=api_key,
+            json={
+                "name": name,
+                "description": description,
+                "tasks": [task.model_dump() for task in self.tasks],
+            },
+        )
+        logger.info(
+            "[HUD] Taskset %s uploaded successfully, see it on app.hud.so/tasksets/%s", name, name
+        )
 async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     """
     Loads a TaskSet by its ID.
@@ -69,20 +97,25 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     Returns:
         TaskSet: The loaded taskset
     """
     if api_key is None:
         api_key = settings.api_key
     data = await make_request(
         method="GET",
         url=f"{settings.base_url}/v2/tasksets/{taskset_id}/tasks",
         api_key=api_key,
     )
-    return TaskSet.model_validate({
-        "id": taskset_id,
-        "tasks": data["evalset"],
-    })
+    logger.info(f"[HUD] Taskset {taskset_id} loaded successfully")
+    return TaskSet.model_validate(
+        {
+            "id": taskset_id,
+            "tasks": data["evalset"],
+        }
+    )
 def load_from_inspect(dataset: Dataset) -> TaskSet:
     """

hud/trajectory.py CHANGED Viewed

@@ -29,9 +29,7 @@ class Trajectory(BaseModel):
     def display(self) -> None:
         trajectory_start_timestamp_str = self.trajectory[0].start_timestamp
         t_start_dt = (
-            datetime.datetime.fromisoformat(
-                trajectory_start_timestamp_str.replace("Z", "+00:00")
-            )
+            datetime.datetime.fromisoformat(trajectory_start_timestamp_str.replace("Z", "+00:00"))
             if trajectory_start_timestamp_str
             else None
         )
@@ -48,16 +46,15 @@ class Trajectory(BaseModel):
                     display(Markdown(f"[Image Link]({step.observation_url})"))
                 except Exception as e:
                     print(f"    [Error processing image: {e}]")
-            elif not step.observation_text: # Only print if no image AND no text
-                 print("    No visual or text observation provided.")
+            elif not step.observation_text:  # Only print if no image AND no text
+                print("    No visual or text observation provided.")
             # Observation Text
             if step.observation_text:
                 print(f"    Observation Text: {step.observation_text}")
             # Actions
-            print(f"\n    Actions: {step.actions}") # Added newline for spacing
+            print(f"\n    Actions: {step.actions}")  # Added newline for spacing
             # Duration
             duration_str = "N/A"
@@ -84,7 +81,7 @@ class Trajectory(BaseModel):
                     total_seconds = total_duration.total_seconds() % 60
                     total_duration_str = f"{total_minutes}m {total_seconds:.2f}s"
                 except ValueError:
-                    duration_str = "Error parsing timestamps" # Handle potential format issues
+                    duration_str = "Error parsing timestamps"  # Handle potential format issues
             print(f"    Step Duration: {duration_str}")
             print(f"    Total Duration: {total_duration_str}")
-            display(Markdown("---")) # Use Markdown horizontal rule
+            display(Markdown("---"))  # Use Markdown horizontal rule

hud/types.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import enum
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, TypeAlias
 from pydantic import BaseModel
@@ -10,13 +10,14 @@ from pydantic import BaseModel
 class CustomGym(BaseModel):
     """
     Public environment specification with a dockerfile and controller.
     If the location is remote, the env will be created on the server.
     If the location is dev, the env will be created locally via docker.
     The dockerfile can be specified directly or automatically found in the controller_source_dir.
     If neither is provided, an error will be raised during validation.
     """
     type: Literal["public"] = "public"
     dockerfile: str | None = None
     location: Literal["local", "remote"]
@@ -25,27 +26,25 @@ class CustomGym(BaseModel):
     # If none, then the controller must be installed in the environment through the dockerfile
     # Can be provided as a string or Path object
     controller_source_dir: str | Path | None = None
     def model_post_init(self, __context: Any, /) -> None:
         """Validate and set up dockerfile if not explicitly provided."""
         # Convert string path to Path object if needed
         if isinstance(self.controller_source_dir, str):
             self.controller_source_dir = Path(self.controller_source_dir)
         if self.dockerfile is None:
             if self.controller_source_dir is None:
                 raise ValueError("Either dockerfile or controller_source_dir must be provided")
             # Look for Dockerfile in the controller_source_dir
             dockerfile_path = self.controller_source_dir / "Dockerfile"
             if not dockerfile_path.exists():
                 raise ValueError(f"Dockerfile not found in {self.controller_source_dir}")
             # Read the Dockerfile content
             self.dockerfile = dockerfile_path.read_text()
-# Strings are identifiers for gyms on the HUD server
-Gym = CustomGym | str
 class EnvironmentStatus(str, enum.Enum):
     """
@@ -63,3 +62,9 @@ class EnvironmentStatus(str, enum.Enum):
     COMPLETED = "completed"
     ERROR = "error"
+# Available HUD gyms
+ServerGym: TypeAlias = Literal["qa", "hud-browser", "hud-ubuntu", "OSWorld-Ubuntu"]
+# Gyms can be either custom or server-side
+Gym: TypeAlias = CustomGym | ServerGym

hud/utils/__init__.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from .common import ExecuteResult
-from .config import HudStyleConfig, HudStyleConfigs, expand_config
+from .config import FunctionConfig, FunctionConfigs, expand_config
 from .telemetry import stream
-__all__ = ["ExecuteResult", "HudStyleConfig", "HudStyleConfigs", "expand_config", "stream"]
+__all__ = ["ExecuteResult", "FunctionConfig", "FunctionConfigs", "expand_config", "stream"]

hud/utils/common.py CHANGED Viewed

@@ -16,29 +16,52 @@ if TYPE_CHECKING:
 logger = logging.getLogger("hud.utils.common")
-class HudStyleConfig(BaseModel):
+class FunctionConfig(BaseModel):
     function: str  # Format: "x.y.z"
-    args: list[Any] # Must be json serializable
+    args: list[Any]  # Must be json serializable
-    id: str | None = None # Optional id for remote execution
+    id: str | None = None  # Optional id for remote execution
     def __len__(self) -> int:
         return len(self.args)
     def __getitem__(self, index: int) -> Any:
         return self.args[index]
     def __iter__(self) -> Iterator[Any]:
         return iter(self.args)
     def __str__(self) -> str:
         return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
 # Type alias for the shorthand config, which just converts to function name and args
 ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
 # Type alias for multiple config formats
-HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
+FunctionConfigs = (
+    ShorthandConfig
+    | FunctionConfig
+    | list[FunctionConfig]
+    | list[ShorthandConfig]
+    | dict[str, Any]
+    | str
+)
+class Observation(BaseModel):
+    """
+    Observation from the environment.
+    Attributes:
+        screenshot: Base64 encoded PNG string of the screen
+        text: Text observation, if available
+    """
+    screenshot: str | None = None  # base64 string png
+    text: str | None = None
 class ExecuteResult(TypedDict):
     """
@@ -49,26 +72,27 @@ class ExecuteResult(TypedDict):
         stderr: Standard error from the command
         exit_code: Exit code of the command
     """
     stdout: bytes
     stderr: bytes
     exit_code: int
 def directory_to_tar_bytes(directory_path: Path) -> bytes:
     """
     Converts a directory to a tar archive and returns it as bytes.
     This function creates a tar archive of the specified directory in memory,
     without writing to a temporary file on disk.
     Args:
         path: Path to the directory to convert
     Returns:
         Bytes of the tar archive
     """
     output = io.BytesIO()
     with tarfile.open(fileobj=output, mode="w") as tar:
         # Walk through the directory
         for file_path in directory_path.rglob("*"):
@@ -77,7 +101,7 @@ def directory_to_tar_bytes(directory_path: Path) -> bytes:
                 rel_path = file_path.relative_to(directory_path)
                 logger.debug("Adding %s to tar archive", rel_path)
                 tar.add(file_path, arcname=str(rel_path))
     # Get the bytes from the BytesIO object
     output.seek(0)
     return output.getvalue()

hud/utils/config.py CHANGED Viewed

@@ -2,8 +2,12 @@ from __future__ import annotations
 import logging
 import re
+from typing import TYPE_CHECKING
-from hud.utils.common import HudStyleConfig, HudStyleConfigs
+from hud.utils.common import FunctionConfig, FunctionConfigs
+if TYPE_CHECKING:
+    from typing import TypeGuard
 logger = logging.getLogger("hud.utils.config")
@@ -11,22 +15,27 @@ REMOTE_FUNCTION_PREFIX = "private_"
 REMOTE_SETUP = "setup"
 REMOTE_EVALUATE = "evaluate"
+LOCAL_EVALUATORS = ["response_is", "response_includes", "response_match"]
 def _is_valid_python_name(name: str) -> bool:
     """Check if a string is a valid Python identifier."""
     return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
-def _validate_hud_config(config: dict) -> HudStyleConfig:
-    """Validate and convert a dictionary to an HudStyleConfig."""
+def _validate_hud_config(config: dict) -> FunctionConfig:
+    """Validate and convert a dictionary to an FunctionConfig."""
     if not isinstance(config.get("function"), str):
         raise ValueError("function must be a string")
     # Validate function path components
     _split_and_validate_path(config["function"])
     args = config["args"] if isinstance(config.get("args"), list) else [config["args"]]
-    # Create a proper HudStyleConfig object instead of using cast
-    return HudStyleConfig(function=config["function"], args=args, id=config.get("id"))
+    # Create a proper FunctionConfig object instead of using cast
+    return FunctionConfig(function=config["function"], args=args, id=config.get("id"))
 def _split_and_validate_path(path: str) -> None:
     """Split a function path into components, validating each part."""
@@ -34,46 +43,52 @@ def _split_and_validate_path(path: str) -> None:
     if not parts:
         raise ValueError("Empty function path")
     # Validate each part
     for part in parts:
         if not _is_valid_python_name(part):
             raise ValueError(f"Invalid Python identifier in path: {part}")
-def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
+def _is_list_of_configs(config: FunctionConfigs) -> TypeGuard[list[FunctionConfig]]:
+    """Check if a config is a list of FunctionConfig objects."""
+    return isinstance(config, list) and all(isinstance(item, FunctionConfig) for item in config)
+def expand_config(config: FunctionConfigs) -> list[FunctionConfig]:
     """
-    Process a config into a standardized list of HudStyleConfig objects.
+    Process a config into a standardized list of FunctionConfig objects.
     Args:
         config: Can be:
             - A tuple where first element is function name and rest are args
-            - A HudStyleConfig object
+            - A FunctionConfig object
             - A dictionary with "function" and "args" keys
-            - A list of HudStyleConfig objects
+            - A list of FunctionConfig objects
     Returns:
-        list[HudStyleConfig]: List of standardized configurations
+        list[FunctionConfig]: List of standardized configurations
     Raises:
         ValueError: If the configuration format is invalid
     """
     logger.debug("Processing config: %s", config)
-    # If it's already a HudStyleConfig, just wrap it in a list
-    if isinstance(config, HudStyleConfig):
+    # If it's already a FunctionConfig, just wrap it in a list
+    if isinstance(config, FunctionConfig):
         return [config]
-    # If it's a list of HudStyleConfigs, return as is
-    if isinstance(config, list) and all(isinstance(item, HudStyleConfig) for item in config):
+    # If it's a list of FunctionConfigs, return as is
+    if _is_list_of_configs(config):
         return config
     # Handle dictionary configuration
     if isinstance(config, dict):
         return [_validate_hud_config(config)]
     if isinstance(config, str):
-        return [HudStyleConfig(function=config, args=[])]
+        return [FunctionConfig(function=config, args=[])]
     # Handle tuple format
     if isinstance(config, tuple):
         if len(config) < 1 or not isinstance(config[0], str):
@@ -81,13 +96,13 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
             "Expected tuple[str, ...], got: {type(config)}"
             logger.error(error_msg)
             raise ValueError(error_msg)
         # First element is the function name, rest are args
         function_name = config[0]
         args = list(config[1:]) if len(config) > 1 else []
-        return [HudStyleConfig(function=function_name, args=args)]
+        return [FunctionConfig(function=function_name, args=args)]
     # Unknown configuration type
     error_msg = f"Unknown configuration type: {type(config)}"
     logger.error(error_msg)

hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl