PyPI - hud-python - Versions diffs - 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (46) hide show

hud/__init__.py +16 -12
hud/adapters/__init__.py +4 -2
hud/adapters/claude/adapter.py +9 -2
hud/adapters/common/adapter.py +11 -10
hud/adapters/common/types.py +34 -13
hud/adapters/operator/__init__.py +5 -0
hud/adapters/operator/adapter.py +97 -0
hud/agent/__init__.py +7 -0
hud/agent/base.py +109 -0
hud/agent/claude.py +207 -0
hud/agent/operator.py +208 -0
hud/env/__init__.py +11 -0
hud/env/client.py +35 -0
hud/env/docker_client.py +306 -0
hud/env/environment.py +354 -0
hud/env/local_docker_client.py +251 -0
hud/env/remote_client.py +185 -0
hud/env/remote_docker_client.py +221 -0
hud/evaluators/__init__.py +10 -0
hud/evaluators/base.py +31 -0
hud/evaluators/inspect.py +29 -0
hud/evaluators/judge.py +213 -0
hud/evaluators/match.py +163 -0
hud/evaluators/remote.py +78 -0
hud/gym.py +101 -15
hud/job.py +185 -0
hud/server/__init__.py +2 -2
hud/server/requests.py +87 -0
hud/settings.py +13 -2
hud/task.py +144 -0
hud/taskset.py +103 -0
hud/trajectory.py +90 -0
hud/types.py +65 -0
hud/utils/__init__.py +4 -2
hud/utils/common.py +96 -0
hud/utils/config.py +91 -4
hud/utils/telemetry.py +67 -0
hud_python-0.2.1.dist-info/METADATA +181 -0
hud_python-0.2.1.dist-info/RECORD +44 -0
{hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +1 -1
hud/client.py +0 -200
hud/environment.py +0 -318
hud/run.py +0 -208
hud_python-0.1.5.dist-info/METADATA +0 -125
hud_python-0.1.5.dist-info/RECORD +0 -21
{hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0

hud/task.py ADDED Viewed

@@ -0,0 +1,144 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
+from pydantic import BaseModel
+from hud.types import CustomGym, Gym
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
+if TYPE_CHECKING:
+    from inspect_ai.dataset import Sample
+# Environment specifications:
+# These represent the environment as a whole, including both the controller
+# and the environment type (eg, what os, which services are running)
+UBUNTU_DOCKERFILE = "ubuntu:latest"
+def convert_inspect_setup(setup: str) -> list[HudStyleConfig]:
+    """
+    Wrap an Inspect setup script in a one-element HudStyleConfig list.
+    Inspect expresses setup as a single bash string; it is passed through
+    unchanged as the only argument of the "bash" function.
+    Args:
+        setup: The bash script text taken from the Inspect sample
+    Returns:
+        list[HudStyleConfig]: A list holding exactly one "bash" config
+    """
+    bash_step = HudStyleConfig(function="bash", args=[setup])
+    return [bash_step]
+class Task(BaseModel):
+    """A unit of work that can be run in an environment and then scored.
+    A Task pairs a prompt with the configuration used to prepare the
+    environment (setup) and to judge the outcome (evaluate).
+    The setup and evaluate configurations can be in several formats:
+    - String (function name): "chrome.maximize"
+    - Tuple (function with args): ("chrome.activate_tab", 5)
+    - Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
+    - List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
+    Attributes:
+        id: The remote task ID (optional if local-only)
+        prompt: The task prompt or instruction
+        setup: Environment setup configuration (optional)
+        evaluate: Configuration for evaluating responses
+        gym: Environment specification
+        target: Ideal target output (for Inspect compatibility)
+        choices: Multiple choice answer list (for Inspect compatibility)
+        files: Files that go along with the task (for Inspect compatibility)
+        metadata: Additional task metadata
+        config: Optional free-form dictionary
+    """
+    id: str | None = None
+    prompt: str
+    setup: HudStyleConfigs | None = None
+    evaluate: HudStyleConfigs | None = None
+    gym: Gym | None = None
+    target: str | list[str] | None = None
+    choices: list[str] | None = None
+    files: dict[str, str] | None = None
+    metadata: dict[str, Any] | None = None
+    config: dict[str, Any] | None = None
+    @classmethod
+    def from_inspect_sample(cls, sample: Sample) -> Task:
+        """Build a Task from an Inspect dataset sample.
+        The gym is chosen from sample.sandbox: a "docker" sandbox yields a local
+        CustomGym, anything else (including no sandbox) yields the "qa" gym.
+        Evaluation is 'response_includes' for a string target and 'match_all'
+        for a list target; an empty or missing target leaves evaluate unset.
+        Args:
+            sample: An Inspect dataset Sample object
+        Returns:
+            Task instance
+        Fields read from the Inspect Sample:
+        - input (str | list[ChatMessage]): The input to be submitted to the model
+        - choices (list[str] | None): Optional multiple choice answer list
+        - target (str | list[str] | None): Optional ideal target output
+        - metadata (dict[str, Any] | None): Optional arbitrary metadata
+        - sandbox (str | tuple[str, str]): Optional sandbox environment type
+        - setup (str | None): Optional setup script to run for sample
+        The sample's id and files are currently not carried over.
+        """
+        # Chat-style input is flattened into "Role: content" paragraphs.
+        prompt = sample.input
+        if isinstance(prompt, list):
+            prompt = "\n\n".join(
+                f"{message.role.capitalize()}: {message.content}" for message in prompt
+            )
+        # Pick the evaluator from the shape of the (non-empty) target.
+        target = sample.target
+        evaluate_config = None
+        if target and isinstance(target, str):
+            evaluate_config = ("response_includes", [target])
+        elif target and isinstance(target, list):
+            evaluate_config = ("match_all", target)
+        # Only a "docker" sandbox switches away from the default QA gym.
+        sandbox = sample.sandbox
+        dockerfile = None
+        wants_docker = False
+        if sandbox and isinstance(sandbox, str):
+            if sandbox == "docker":
+                dockerfile = UBUNTU_DOCKERFILE
+                wants_docker = True
+        elif sandbox and isinstance(sandbox, tuple) and len(sandbox) == 2:
+            sandbox_type, sandbox_config = sandbox
+            if sandbox_type == "docker":
+                dockerfile = sandbox_config
+                wants_docker = True
+        task_gym: Gym | None = None
+        task_setup: HudStyleConfigs | None = None
+        if wants_docker:
+            task_gym = CustomGym(
+                dockerfile=dockerfile or UBUNTU_DOCKERFILE,
+                location="local",
+            )
+            if sample.setup:
+                task_setup = list(convert_inspect_setup(sample.setup))
+            # TODO: Handle sample.files for CustomGym case if needed
+        else:
+            task_gym = "qa"
+        return cls(
+            id=None,
+            prompt=prompt,
+            setup=task_setup,
+            evaluate=evaluate_config,
+            gym=task_gym,
+            choices=sample.choices,
+            metadata=sample.metadata,
+            # files=sample.files, # TODO: Decide how/if to handle files
+        )

hud/taskset.py ADDED Viewed

@@ -0,0 +1,103 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from pydantic import BaseModel
+from hud.server import make_request
+from hud.settings import settings
+from hud.task import Task
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from inspect_ai.dataset import Dataset
+class TaskSet(BaseModel):
+    """
+    An ordered group of tasks that are benchmarked together.
+    Indexing, len() and iteration are all delegated to the tasks list.
+    Attributes:
+        id: Unique identifier for the taskset
+        description: Description of the taskset
+        tasks: List of Task objects in the taskset
+    """
+    id: str | None = None
+    description: str | None = None
+    tasks: list[Task] = []
+    def __getitem__(self, index: int) -> Task:
+        """
+        Look up a task by position, e.g. taskset[0].
+        Args:
+            index: The index of the task to retrieve
+        Returns:
+            Task: The task at the specified index
+        Raises:
+            IndexError: If the index is out of range
+        """
+        selected = self.tasks[index]
+        return selected
+    def __len__(self) -> int:
+        """
+        Count the tasks held by this taskset.
+        Returns:
+            int: The number of tasks in the taskset
+        """
+        task_count = len(self.tasks)
+        return task_count
+    def __iter__(self) -> Iterator[Task]:
+        """
+        Iterate over the tasks in list order.
+        """
+        task_iterator = iter(self.tasks)
+        return task_iterator
+async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
+    """
+    Fetch the tasks of a taskset from the server and build a TaskSet.
+    Args:
+        taskset_id: The ID of the taskset to load
+        api_key: Optional API key; settings.api_key is used when omitted
+    Returns:
+        TaskSet: The loaded taskset, with id set to taskset_id
+    """
+    effective_key = settings.api_key if api_key is None else api_key
+    tasks_url = f"{settings.base_url}/v2/tasksets/{taskset_id}/tasks"
+    response = await make_request(method="GET", url=tasks_url, api_key=effective_key)
+    # The task list is read from the "evalset" key of the response.
+    payload = {"id": taskset_id, "tasks": response["evalset"]}
+    return TaskSet.model_validate(payload)
+def load_from_inspect(dataset: Dataset) -> TaskSet:
+    """
+    Convert every sample of an inspect-ai dataset into a Task.
+    The resulting TaskSet has no id and takes its description from dataset.name.
+    Args:
+        dataset: An inspect-ai dataset
+    Returns:
+        TaskSet: A new TaskSet instance
+    """
+    converted: list[Task] = []
+    for sample in dataset:
+        converted.append(Task.from_inspect_sample(sample))
+    return TaskSet(id=None, tasks=converted, description=dataset.name)

hud/trajectory.py ADDED Viewed

@@ -0,0 +1,90 @@
+# ruff: noqa: T201
+from __future__ import annotations
+import datetime
+from IPython.display import HTML, Markdown, display
+from pydantic import BaseModel, Field
+class TrajectoryStep(BaseModel):
+    """Model representing one step within a task run's trajectory."""
+    # URL of the observation image; Trajectory.display embeds it in an <img> tag
+    observation_url: str | None = None
+    # Textual observation for the step, printed as-is by Trajectory.display
+    observation_text: str | None = None
+    # Actions recorded for the step (required; no default)
+    actions: list[dict]
+    # Step start/end times; Trajectory.display parses them as ISO strings
+    # after replacing a trailing "Z" with "+00:00"
+    start_timestamp: str | None = None
+    end_timestamp: str | None = None
+class Trajectory(BaseModel):
+    """Model representing a single task run's trajectory information."""
+    id: str
+    reward: float | None = None
+    logs: str | None = None
+    error: str | None = None
+    trajectory: list[TrajectoryStep] = Field(default_factory=list)
+    def display(self) -> None:
+        """Render the trajectory step by step in a Jupyter/IPython session.
+        For each step this shows the observation image and/or text, the
+        actions, the step duration, and the time elapsed since the first
+        step started. Durations are reported as "N/A" when the timestamps
+        needed to compute them are missing, and the step duration becomes
+        "Error parsing timestamps" when a step timestamp is not valid ISO format.
+        """
+        if not self.trajectory:
+            # Nothing to render; indexing the first step below would raise IndexError.
+            print("    No trajectory steps to display.")
+            return
+        trajectory_start_timestamp_str = self.trajectory[0].start_timestamp
+        t_start_dt = None
+        if trajectory_start_timestamp_str:
+            try:
+                t_start_dt = datetime.datetime.fromisoformat(
+                    trajectory_start_timestamp_str.replace("Z", "+00:00")
+                )
+            except ValueError:
+                # A malformed first timestamp only disables duration output.
+                t_start_dt = None
+        for i, step in enumerate(self.trajectory):
+            # Use Markdown for better step separation in Jupyter
+            display(Markdown(f"### Step {i + 1}"))
+            # Observation Image
+            if step.observation_url:
+                try:
+                    # Display in Jupyter/IPython environment using HTML
+                    display(Markdown("**Observation Image:**"))
+                    display(HTML(f'<img src="{step.observation_url}" style="max-width:100%;"/>'))
+                    display(Markdown(f"[Image Link]({step.observation_url})"))
+                except Exception as e:
+                    print(f"    [Error processing image: {e}]")
+            elif not step.observation_text:  # Only print if no image AND no text
+                print("    No visual or text observation provided.")
+            # Observation Text
+            if step.observation_text:
+                print(f"    Observation Text: {step.observation_text}")
+            # Actions
+            print(f"\n    Actions: {step.actions}")  # Added newline for spacing
+            # Duration: both strings are reset every step so a step without
+            # usable timestamps never reports a stale or undefined value.
+            duration_str = "N/A"
+            total_duration_str = "N/A"
+            if step.start_timestamp and step.end_timestamp and t_start_dt:
+                try:
+                    # Attempt to parse timestamps (assuming ISO format)
+                    start_dt = datetime.datetime.fromisoformat(
+                        step.start_timestamp.replace("Z", "+00:00")
+                    )
+                    end_dt = datetime.datetime.fromisoformat(
+                        step.end_timestamp.replace("Z", "+00:00")
+                    )
+                    step_seconds = (end_dt - start_dt).total_seconds()
+                    duration_str = f"{int(step_seconds // 60)}m {step_seconds % 60:.2f}s"
+                    # Time elapsed from the start of the first step to the end of this one
+                    elapsed_seconds = (end_dt - t_start_dt).total_seconds()
+                    total_duration_str = (
+                        f"{int(elapsed_seconds // 60)}m {elapsed_seconds % 60:.2f}s"
+                    )
+                except ValueError:
+                    duration_str = "Error parsing timestamps"  # Handle potential format issues
+            print(f"    Step Duration: {duration_str}")
+            print(f"    Total Duration: {total_duration_str}")
+            display(Markdown("---"))  # Use Markdown horizontal rule

hud/types.py ADDED Viewed

@@ -0,0 +1,65 @@
+from __future__ import annotations
+import enum
+from pathlib import Path
+from typing import Any, Literal
+from pydantic import BaseModel
+class CustomGym(BaseModel):
+    """
+    Public environment specification with a dockerfile and controller.
+    The location field is either "local" or "remote".
+    The dockerfile can be given directly; otherwise it is read from a file
+    named "Dockerfile" inside controller_source_dir.
+    If neither is available, a ValueError is raised during model_post_init.
+    """
+    type: Literal["public"] = "public"
+    dockerfile: str | None = None
+    location: Literal["local", "remote"]
+    ports: list[int] | None = None
+    # If path, then it is a development environment on the local computer
+    # If none, then the controller must be installed in the environment through the dockerfile
+    # Can be provided as a string or Path object
+    controller_source_dir: str | Path | None = None
+    def model_post_init(self, __context: Any, /) -> None:
+        """Normalize controller_source_dir and fill in dockerfile when it is missing.
+        Raises:
+            ValueError: If dockerfile is None and controller_source_dir is None,
+                or if controller_source_dir contains no "Dockerfile"
+        """
+        # A string directory is always stored back as a Path.
+        source_dir = self.controller_source_dir
+        if isinstance(source_dir, str):
+            source_dir = Path(source_dir)
+            self.controller_source_dir = source_dir
+        # An explicit dockerfile needs no further work.
+        if self.dockerfile is not None:
+            return
+        if source_dir is None:
+            raise ValueError("Either dockerfile or controller_source_dir must be provided")
+        candidate = source_dir / "Dockerfile"
+        if not candidate.exists():
+            raise ValueError(f"Dockerfile not found in {self.controller_source_dir}")
+        # The dockerfile field holds the file's text, not its path.
+        self.dockerfile = candidate.read_text()
+# Strings are identifiers for gyms on the HUD server
+Gym = CustomGym | str
+class EnvironmentStatus(str, enum.Enum):
+    """
+    Status of the environment.
+    Members subclass str, so each one compares equal to its lowercase string value.
+    Attributes:
+        INITIALIZING: The environment is initializing
+        RUNNING: The environment is running
+        COMPLETED: The environment is completed
+        ERROR: The environment is in an error state
+    """
+    INITIALIZING = "initializing"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"

hud/utils/__init__.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from __future__ import annotations
-from .config import configuration
+from .common import ExecuteResult
+from .config import HudStyleConfig, HudStyleConfigs, expand_config
+from .telemetry import stream
-__all__ = ["configuration"]
+__all__ = ["ExecuteResult", "HudStyleConfig", "HudStyleConfigs", "expand_config", "stream"]

hud/utils/common.py ADDED Viewed

@@ -0,0 +1,96 @@
+from __future__ import annotations
+import io
+import logging
+import tarfile
+from typing import TYPE_CHECKING, Any, TypedDict
+from pydantic import BaseModel
+from hud.server.requests import make_request
+from hud.settings import settings
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from pathlib import Path
+logger = logging.getLogger("hud.utils.common")
+class HudStyleConfig(BaseModel):
+    """A function name plus its arguments.
+    len(), indexing and iteration all operate on the args list.
+    """
+    function: str  # Format: "x.y.z"
+    args: list[Any]  # Must be json serializable
+    id: str | None = None  # Optional id for remote execution
+    def __len__(self) -> int:
+        """Return the number of arguments."""
+        arg_count = len(self.args)
+        return arg_count
+    def __getitem__(self, index: int) -> Any:
+        """Return the argument at the given position."""
+        chosen = self.args[index]
+        return chosen
+    def __iter__(self) -> Iterator[Any]:
+        """Iterate over the arguments in order."""
+        arg_iterator = iter(self.args)
+        return arg_iterator
+    def __str__(self) -> str:
+        """Format as "<function>: <arg>, <arg>, ..."."""
+        rendered_args = ", ".join(map(str, self.args))
+        return f"{self.function}: {rendered_args}"
+# Type alias for the shorthand config, which just converts to function name and args
+ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
+# Type alias for multiple config formats
+HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
+class ExecuteResult(TypedDict):
+    """
+    Result of an execute command.
+    Both output streams are typed as raw bytes rather than str.
+    Attributes:
+        stdout: Standard output from the command
+        stderr: Standard error from the command
+        exit_code: Exit code of the command
+    """
+    stdout: bytes
+    stderr: bytes
+    exit_code: int
+def directory_to_tar_bytes(directory_path: Path) -> bytes:
+    """
+    Pack the files under a directory into an uncompressed tar archive in memory.
+    The directory is walked recursively and only regular files are added,
+    each under its path relative to directory_path. Nothing is written to disk.
+    Args:
+        directory_path: Path to the directory to convert
+    Returns:
+        Bytes of the tar archive
+    """
+    buffer = io.BytesIO()
+    with tarfile.open(fileobj=buffer, mode="w") as archive:
+        for entry in directory_path.rglob("*"):
+            # Directories are skipped, so empty directories are not archived.
+            if not entry.is_file():
+                continue
+            rel_path = entry.relative_to(directory_path)
+            logger.debug("Adding %s to tar archive", rel_path)
+            archive.add(entry, arcname=str(rel_path))
+    # getvalue() returns the whole buffer regardless of the stream position.
+    return buffer.getvalue()
+async def get_gym_id(gym_name_or_id: str) -> str:
+    """
+    Resolve a gym name or ID to the gym ID reported by the server.
+    Args:
+        gym_name_or_id: The gym name or ID to look up
+    Returns:
+        The "id" field of the server response
+    """
+    gym_url = f"{settings.base_url}/v1/gyms/{gym_name_or_id}"
+    response = await make_request(method="GET", url=gym_url, api_key=settings.api_key)
+    return response["id"]

hud/utils/config.py CHANGED Viewed

@@ -1,7 +1,94 @@
 from __future__ import annotations
-from hud.settings import settings
+import logging
+import re
-# For backwards compatibility, keep 'configuration'
-# but have it point to the settings instance
-configuration = settings
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
+logger = logging.getLogger("hud.utils.config")
+REMOTE_FUNCTION_PREFIX = "private_"
+REMOTE_SETUP = "setup"
+REMOTE_EVALUATE = "evaluate"
+def _is_valid_python_name(name: str) -> bool:
+    """Check if a string is a valid ASCII Python identifier.
+    Args:
+        name: The candidate identifier
+    Returns:
+        True if name starts with a letter or underscore and contains only
+        ASCII letters, digits and underscores; False otherwise
+    """
+    # fullmatch is used because "$" in re.match also matches just before a
+    # trailing newline, which would let "name\n" through.
+    return re.fullmatch(r"[a-zA-Z_][a-zA-Z0-9_]*", name) is not None
+def _validate_hud_config(config: dict) -> HudStyleConfig:
+    """Validate and convert a dictionary to an HudStyleConfig.
+    Args:
+        config: Dictionary with a "function" key and optional "args" and "id" keys
+    Returns:
+        HudStyleConfig: The validated configuration
+    Raises:
+        ValueError: If "function" is missing, is not a string, or is not a
+            dotted path of valid identifiers
+    """
+    function = config.get("function")
+    if not isinstance(function, str):
+        raise ValueError("function must be a string")
+    # Validate function path components
+    _split_and_validate_path(function)
+    # A missing "args" key means "no arguments" rather than a KeyError;
+    # a non-list value is wrapped so args is always a list.
+    raw_args = config.get("args", [])
+    args = raw_args if isinstance(raw_args, list) else [raw_args]
+    # Create a proper HudStyleConfig object instead of using cast
+    return HudStyleConfig(function=function, args=args, id=config.get("id"))
+def _split_and_validate_path(path: str) -> None:
+    """Validate every dot-separated component of a function path.
+    Args:
+        path: Dotted function path such as "x.y.z"
+    Raises:
+        ValueError: If the path is empty or any component is not a valid identifier
+    """
+    # str.split never returns an empty list, so emptiness is checked on the
+    # path itself.
+    if not path:
+        raise ValueError("Empty function path")
+    # Validate each part
+    for part in path.split("."):
+        if not _is_valid_python_name(part):
+            raise ValueError(f"Invalid Python identifier in path: {part}")
+def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
+    """
+    Process a config into a standardized list of HudStyleConfig objects.
+    Args:
+        config: Can be:
+            - A string holding a function name (no args)
+            - A tuple where first element is function name and rest are args
+            - A HudStyleConfig object
+            - A dictionary with "function" and "args" keys
+            - A list of HudStyleConfig objects
+    Returns:
+        list[HudStyleConfig]: List of standardized configurations
+    Raises:
+        ValueError: If the configuration format is invalid
+    """
+    logger.debug("Processing config: %s", config)
+    # If it's already a HudStyleConfig, just wrap it in a list
+    if isinstance(config, HudStyleConfig):
+        return [config]
+    # If it's a list of HudStyleConfigs, return as is
+    if isinstance(config, list) and all(isinstance(item, HudStyleConfig) for item in config):
+        return config
+    # Handle dictionary configuration
+    if isinstance(config, dict):
+        return [_validate_hud_config(config)]
+    # A bare string is a function name with no arguments
+    if isinstance(config, str):
+        return [HudStyleConfig(function=config, args=[])]
+    # Handle tuple format
+    if isinstance(config, tuple):
+        if len(config) < 1 or not isinstance(config[0], str):
+            # Report the element types; type(config) would only ever say "tuple".
+            element_types = ", ".join(type(item).__name__ for item in config)
+            error_msg = (
+                "Invalid tuple configuration. "
+                f"Expected tuple[str, ...], got: tuple[{element_types}]"
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+        # First element is the function name, rest are args
+        function_name = config[0]
+        args = list(config[1:])
+        return [HudStyleConfig(function=function_name, args=args)]
+    # Unknown configuration type
+    error_msg = f"Unknown configuration type: {type(config)}"
+    logger.error(error_msg)
+    raise ValueError(error_msg)

hud/utils/telemetry.py ADDED Viewed

@@ -0,0 +1,67 @@
+from __future__ import annotations
+import logging
+logger = logging.getLogger(__name__)
+def stream(live_url: str | None = None) -> str:
+    """
+    Display a stream in the HUD system.
+    The stream is embedded as a 1920x1080 iframe scaled to half size.
+    Args:
+        live_url: URL of the live stream to embed
+    Returns:
+        The HTML string used to display the stream
+    Raises:
+        ValueError: If live_url is None
+    """
+    if live_url is None:
+        raise ValueError("live_url cannot be None")
+    import html
+    from IPython.display import HTML, display
+    # Escape the URL so quotes or angle brackets in it cannot break out of
+    # the src attribute.
+    safe_url = html.escape(live_url, quote=True)
+    html_content = f"""
+    <div style="width: 960px; height: 540px; overflow: hidden;">
+        <div style="transform: scale(0.5); transform-origin: top left;">
+            <iframe src="{safe_url}" width="1920" height="1080" style="border: 1px solid #ddd;">
+            </iframe>
+        </div>
+    </div>
+    """
+    # Displaying is best-effort: a failure is logged and the HTML is still returned.
+    try:
+        display(HTML(html_content))
+    except Exception as e:
+        logger.warning(e)
+    return html_content
+def display_screenshot(base64_image: str, width: int = 960, height: int = 540) -> str:
+    """
+    Display a base64-encoded screenshot image.
+    Args:
+        base64_image: Base64-encoded image string, with or without the
+            "data:image" URI prefix (a PNG prefix is added when it is missing)
+        width: Display width in pixels
+        height: Display height in pixels
+    Returns:
+        The HTML string used to display the image
+    Note:
+        This function will both display the image in IPython environments
+        and return the HTML string for other contexts.
+    """
+    import html
+    from IPython.display import HTML, display
+    # Ensure the base64 image doesn't already have the data URI prefix
+    if base64_image.startswith("data:image"):
+        img_src = base64_image
+    else:
+        img_src = f"data:image/png;base64,{base64_image}"
+    # Escape so stray quotes or angle brackets cannot break out of the src
+    # attribute; the base64 alphabet itself is left unchanged by escaping.
+    safe_src = html.escape(img_src, quote=True)
+    html_content = f"""
+    <div style="width: {width}px; height: {height}px; overflow: hidden; margin: 10px 0; border: 1px solid #ddd;">
+        <img src="{safe_src}" style="max-width: 100%; max-height: 100%;">
+    </div>
+    """  # noqa: E501
+    # Display in IPython environments; a failure is logged and the HTML is still returned
+    try:
+        display(HTML(html_content))
+    except Exception as e:
+        logger.warning(e)
+    return html_content

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

Potentially problematic release.

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl