hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +22 -2
- hud/adapters/claude/adapter.py +9 -2
- hud/adapters/claude/tests/__init__.py +1 -0
- hud/adapters/claude/tests/test_adapter.py +519 -0
- hud/adapters/common/types.py +5 -1
- hud/adapters/operator/adapter.py +4 -0
- hud/adapters/operator/tests/__init__.py +1 -0
- hud/adapters/operator/tests/test_adapter.py +370 -0
- hud/agent/__init__.py +4 -0
- hud/agent/base.py +18 -2
- hud/agent/claude.py +20 -17
- hud/agent/claude_plays_pokemon.py +282 -0
- hud/agent/langchain.py +12 -7
- hud/agent/misc/__init__.py +3 -0
- hud/agent/misc/response_agent.py +80 -0
- hud/agent/operator.py +27 -19
- hud/agent/tests/__init__.py +1 -0
- hud/agent/tests/test_base.py +202 -0
- hud/env/docker_client.py +28 -18
- hud/env/environment.py +33 -17
- hud/env/local_docker_client.py +83 -42
- hud/env/remote_client.py +1 -3
- hud/env/remote_docker_client.py +72 -15
- hud/exceptions.py +12 -0
- hud/gym.py +71 -53
- hud/job.py +52 -7
- hud/settings.py +6 -0
- hud/task.py +45 -33
- hud/taskset.py +44 -4
- hud/telemetry/__init__.py +21 -0
- hud/telemetry/_trace.py +173 -0
- hud/telemetry/context.py +193 -0
- hud/telemetry/exporter.py +417 -0
- hud/telemetry/instrumentation/__init__.py +3 -0
- hud/telemetry/instrumentation/mcp.py +498 -0
- hud/telemetry/instrumentation/registry.py +59 -0
- hud/telemetry/mcp_models.py +331 -0
- hud/telemetry/tests/__init__.py +1 -0
- hud/telemetry/tests/test_context.py +203 -0
- hud/telemetry/tests/test_trace.py +270 -0
- hud/types.py +10 -26
- hud/utils/common.py +22 -2
- hud/utils/misc.py +53 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +7 -0
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
- hud_python-0.2.5.dist-info/RECORD +84 -0
- hud_python-0.2.3.dist-info/RECORD +0 -62
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0
hud/env/remote_docker_client.py
CHANGED

@@ -2,7 +2,9 @@ from __future__ import annotations
 
 import logging
 from base64 import b64decode, b64encode
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+import httpx
 
 from hud.env.docker_client import DockerClient
 from hud.exceptions import HudResponseError
@@ -10,11 +12,27 @@ from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
 from hud.utils import ExecuteResult
-from hud.utils.common import get_gym_id
+from hud.utils.common import directory_to_zip_bytes, get_gym_id
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 logger = logging.getLogger("hud.env.remote_env_client")
 
 
+async def upload_bytes_to_presigned_url(presigned_url: str, data_bytes: bytes) -> None:
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.put(presigned_url, content=data_bytes)
+            response.raise_for_status()
+    except httpx.HTTPStatusError as e:
+        logger.exception("Failed to upload to presigned URL")
+        raise HudResponseError(message=f"Failed to upload to presigned URL: {e}") from e
+    except httpx.RequestError as e:
+        logger.exception("Network error uploading to presigned URL")
+        raise HudResponseError(message=f"Network error uploading to presigned URL: {e}") from e
+
+
 class RemoteDockerClient(DockerClient):
     """
     Remote environment client implementation.
@@ -22,21 +40,64 @@ class RemoteDockerClient(DockerClient):
     Uses the HUD API to manage a remote environment.
     """
 
+    @classmethod
+    async def build_image(cls, build_context: Path) -> tuple[str, dict[str, Any]]:
+        """
+        Build an image from a build context.
+        """
+        # create the presigned url by making a POST request to /v2/builds
+        logger.info("Creating build")
+        response = await make_request(
+            method="POST",
+            url=f"{settings.base_url}/v2/builds",
+            api_key=settings.api_key,
+        )
+        logger.info("Build created")
+        presigned_url = response["presigned_url"]
+
+        # List files in the build context
+        files = list(build_context.glob("**/*"))
+        logger.info("Found %d files in build context %s", len(files), build_context)
+
+        if len(files) == 0:
+            raise HudResponseError(message="Build context is empty")
+
+        # zip the build context
+        logger.info("Zipping build context")
+        zip_bytes = directory_to_zip_bytes(build_context)
+        logger.info("Created zip archive of size %d kb", len(zip_bytes) // 1024)
+        # upload the zip bytes to the presigned url
+        logger.info("Uploading build context")
+        await upload_bytes_to_presigned_url(presigned_url, zip_bytes)
+        logger.info("Build context uploaded")
+
+        # start the build and return uri and logs
+        logger.info("Starting build")
+        response = await make_request(
+            method="POST",
+            url=f"{settings.base_url}/v2/builds/{response['id']}/start",
+            api_key=settings.api_key,
+        )
+        logger.info("Build completed")
+
+        return response["uri"], {"logs": response["logs"]}
+
     @classmethod
     async def create(
         cls,
-
+        image_uri: str,
         *,
         job_id: str | None = None,
         task_id: str | None = None,
         metadata: dict[str, Any] | None = None,
-    ) ->
+    ) -> RemoteDockerClient:
         """
-        Creates a remote environment client from
+        Creates a remote environment client from an image.
 
         Args:
-
-
+            image_uri: The image uri to create the environment from
+            job_id: The job_id of the environment to create
+            task_id: The task_id of the environment to create
             metadata: Metadata to associate with the environment
 
         Returns:
@@ -52,13 +113,14 @@ class RemoteDockerClient(DockerClient):
 
         logger.info("Creating remote environment")
 
-        true_gym_id = await get_gym_id("docker")
+        true_gym_id = await get_gym_id("local-docker")
+        # true_gym_id = await get_gym_id("docker")
 
         # augment metadata with dockerfile
         if "environment_config" not in metadata:
            metadata["environment_config"] = {}
 
-        metadata["environment_config"]["
+        metadata["environment_config"]["image_uri"] = image_uri
 
         # Create a new environment via the HUD API
         response = await make_request(
@@ -85,12 +147,7 @@ class RemoteDockerClient(DockerClient):
             response_json=response,
         )
 
-
-        controller = cls(env_id)
-
-        build_metadata = response.get("metadata", {})
-
-        return controller, build_metadata
+        return cls(env_id)
 
     def __init__(self, env_id: str) -> None:
         """
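The new build_image classmethod creates a build via POST /v2/builds, zips the local build context, uploads the archive to the returned presigned URL, and then starts the server-side build, returning the image URI plus build logs; create now takes that image_uri directly and returns only the client. A minimal usage sketch, assuming a hypothetical ./my_env directory containing a Dockerfile and a configured HUD API key:

    import asyncio
    from pathlib import Path

    from hud.env.remote_docker_client import RemoteDockerClient


    async def main() -> None:
        # Build the image on HUD infrastructure from a local build context;
        # returns the image URI and a dict carrying the build logs.
        uri, build_data = await RemoteDockerClient.build_image(Path("./my_env"))
        print("built image:", uri)
        print("build logs:", build_data.get("logs"))

        # Start a remote environment from the freshly built image.
        client = await RemoteDockerClient.create(image_uri=uri, metadata={})


    asyncio.run(main())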
hud/exceptions.py
CHANGED

@@ -165,3 +165,15 @@ class HudNetworkError(HudException):
     This exception is raised when there are issues with the network
     connection, DNS resolution, or other network-related problems.
     """
+
+
+class GymMakeException(HudException):
+    """Raised when environment creation or setup fails, includes context data."""
+
+    def __init__(self, message: str, data: dict[str, Any]) -> None:
+        super().__init__(message)
+        self.data = data
+
+    def __str__(self) -> str:
+        base = super().__str__()
+        return f"{base} | Data: {self.data}"
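GymMakeException wraps any failure during environment creation and carries a data dict with whatever build context was collected before the error (the hud/gym.py changes below also add an "exception" entry to it). A small sketch of how a caller might surface that context; the env_src argument is illustrative:

    from typing import Any

    from hud import gym
    from hud.exceptions import GymMakeException


    async def launch(env_src: Any) -> Any:
        try:
            return await gym.make(env_src)
        except GymMakeException as exc:
            # exc.data holds the build/launch context gathered before the failure.
            print(f"environment creation failed: {exc}")
            print(f"debug data: {exc.data}")
            raise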
hud/gym.py
CHANGED

@@ -8,6 +8,8 @@ from hud.env.environment import Environment
 from hud.env.local_docker_client import LocalDockerClient
 from hud.env.remote_client import RemoteClient
 from hud.env.remote_docker_client import RemoteDockerClient
+from hud.exceptions import GymMakeException
+from hud.telemetry.context import get_current_task_run_id
 from hud.types import CustomGym, Gym
 from hud.utils.common import get_gym_id
 
@@ -34,17 +36,19 @@ async def make(
         job_id: ID of job to associate with this environment (deprecated, use job instead)
         metadata: Additional metadata for the environment
     """
-
-
+    task = None
+    if isinstance(env_src, str | CustomGym):
+        gym = env_src
+    else:
+        gym = env_src.gym
+        task = env_src
 
-    # Handle job parameter
     effective_job_id = None
     if job is not None:
         effective_job_id = job.id
     elif job_id is not None:
         effective_job_id = job_id
     else:
-        # Try to get an active job from the decorator context
         try:
             import hud.job
 
@@ -52,59 +56,73 @@ async def make(
             if active_job:
                 effective_job_id = active_job.id
         except ImportError:
-            pass
-
-
-
-
-
-
-
-
+            pass
+
+    build_data = {}
+    try:
+        metadata_copy = {} if metadata is None else metadata.copy()
+
+        current_task_run_id = get_current_task_run_id()
+        if current_task_run_id:
+            metadata_copy["task_run_id"] = current_task_run_id
+            logger.debug(
+                "Passing task_run_id %s from hud.telemetry context to environment metadata.",
+                current_task_run_id,
+            )
 
-
-
-
-
-
-
-
-
-
-
-
+        if isinstance(gym, CustomGym):
+            if isinstance(gym.image_or_build_context, str):
+                uri = gym.image_or_build_context
+            elif isinstance(gym.image_or_build_context, Path):
+                if gym.location == "local":
+                    uri, build_data = await LocalDockerClient.build_image(
+                        gym.image_or_build_context
+                    )
+                elif gym.location == "remote":
+                    uri, build_data = await RemoteDockerClient.build_image(
+                        gym.image_or_build_context
+                    )
+                else:
+                    raise ValueError(f"Invalid environment location: {gym.location}")
+            else:
+                raise ValueError(f"Invalid image or build context: {gym.image_or_build_context}")
+
+            if gym.location == "local":
+                logger.info("Creating local environment")
+                client = await LocalDockerClient.create(uri)
+            elif gym.location == "remote":
+                logger.info("Creating remote environment")
+                client = await RemoteDockerClient.create(
+                    image_uri=uri,
+                    job_id=effective_job_id,
+                    task_id=task.id if task else None,
+                    metadata=metadata_copy,
+                )
+            else:
+                raise ValueError(f"Invalid environment location: {gym.location}")
+
+            if isinstance(gym.image_or_build_context, Path):
+                logger.info("Setting source path %s", gym.image_or_build_context)
+                client.set_source_path(gym.image_or_build_context)
+        elif isinstance(gym, str):
+            logger.info("Creating private environment")
+            true_gym_id = await get_gym_id(gym)
+            client, build_data = await RemoteClient.create(
+                gym_id=true_gym_id,
                job_id=effective_job_id,
                task_id=task.id if task else None,
-            metadata=
+                metadata=metadata_copy,
            )
        else:
-        raise ValueError(f"Invalid
-
-    # Set up the environment with a source path
-    if gym.controller_source_dir:
-        logger.info("Setting source path")
-        client.set_source_path(Path(gym.controller_source_dir))
-    elif isinstance(gym, str):
-        logger.info("Creating private environment")
-        # Note: the gym_name_or_id is a unique identifier, but it is not a true
-        # gym_id for the purposes of building the environment
-        # we therefore fetch the gym_id from the HUD API here
-        true_gym_id = await get_gym_id(gym)
-
-        # Create the environment
-        client, build_data = await RemoteClient.create(
-            gym_id=true_gym_id,
-            job_id=effective_job_id,
-            task_id=task.id if task else None,
-            metadata=metadata,
-        )
-    else:
-        raise ValueError(f"Invalid gym source: {gym}")
+            raise ValueError(f"Invalid gym source: {gym}")
 
-
-
-
-    if task:
-        await environment._setup()
+        environment = Environment(
+            client=client, metadata=metadata_copy, task=task, build_data=build_data
+        )
 
-
+        if task:
+            await environment._setup()
+        return environment
+    except Exception as e:
+        build_data["exception"] = str(e)
+        raise GymMakeException("Failed to create environment", build_data) from e
hud/job.py
CHANGED

@@ -12,11 +12,13 @@ from typing import TYPE_CHECKING, Any, TypeVar, cast
 from pydantic import BaseModel, PrivateAttr, TypeAdapter
 
 import hud.server
-from hud import gym
+from hud import Response, gym
+from hud.agent import ResponseAgent
 from hud.settings import settings
 from hud.task import Task
 from hud.taskset import TaskSet
 from hud.trajectory import Trajectory
+from hud.utils.common import Observation
 from hud.utils.progress import StepProgressTracker
 
 if TYPE_CHECKING:
@@ -162,7 +164,7 @@ async def create_job(
     # If not, we might need to make a subsequent GET request
     job_data = data  # Adjust if the API response structure is different
 
-    logger.info("
+    logger.info("View job at https://app.hud.so/jobs/%s.", job_data["id"])
 
     return Job(
         id=job_data["id"],
@@ -259,6 +261,27 @@ def get_active_job() -> Job | None:
     return None
 
 
+async def _maybe_resample_action(
+    obs: Observation, action: Any, response_agent: ResponseAgent
+) -> tuple[Observation, bool]:
+    if isinstance(action, Response):
+        action = action.model_dump()
+    if isinstance(action, dict) and action.get("type") == "response":
+        response_text = action.get("text", "")
+        if response_agent and response_text:
+            try:
+                decision = await response_agent.determine_response(response_text)
+                if decision == "CONTINUE":
+                    logger.info("ResponseAgent indicated CONTINUE. Retrying...")
+                    obs = Observation(text="Please continue.")
+                    return obs, False
+                elif decision == "CONTINUE":
+                    logger.warning("Max continue retries reached. Stopping despite CONTINUE.")
+            except Exception as e:
+                logger.warning("Error using ResponseAgent: %s", e)
+    return obs, True
+
+
 async def _execute_task(
     agent_cls: type[Agent],
     adapter_cls: type[Adapter] | None,
@@ -270,6 +293,7 @@ async def _execute_task(
     max_steps_per_task: int,
     job: Job,
     tracker: StepProgressTracker | None = None,
+    auto_reply_question: bool = False,
     # Use semaphores instead of rate limiter
     env_creation_semaphore: asyncio.Semaphore | None = None,
     agent_predict_semaphore: asyncio.Semaphore | None = None,
@@ -283,10 +307,15 @@ async def _execute_task(
     status = "error"
     error_msg = "Initialization failed"
     try:
+        response_agent = ResponseAgent() if auto_reply_question else None
+
         adapter_instance = None
         if adapter_cls:
             adapter_instance = adapter_cls(**(adapter_kwargs or {}))
-        agent_instance = agent_cls(
+        agent_instance = agent_cls(
+            adapter=adapter_instance,
+            **(agent_kwargs or {}),
+        )
         if agent_instance is None:
             raise RuntimeError("Agent could not be instantiated")
 
@@ -303,6 +332,7 @@ async def _execute_task(
         obs, _ = obs_tuple
 
         step_error = None
+
         for step in range(max_steps_per_task):
             action, done = (None, False)
             try:
@@ -319,6 +349,11 @@ async def _execute_task(
             if action is None and not done:
                 done = True
 
+            if done and response_agent:
+                obs, finish = await _maybe_resample_action(obs, action[-1], response_agent)
+                if not finish:
+                    continue
+
             step_result = await env.step(action)
             if step_result is None:
                 terminated = True
@@ -347,7 +382,7 @@ async def _execute_task(
                     "timestamp": datetime.datetime.now().isoformat(),
                 }
             )
-
+            continue
         else:
             logger.warning("[Job: %s/%s, Task: %s] Max steps reached.", job.name, job.id, task_id)
 
@@ -361,6 +396,7 @@ async def _execute_task(
             evaluation_result = await env.evaluate()
             status = "completed"
             error_msg = None
+            # logger.info("Evaluation result: %s", evaluation_result)
         except Exception as eval_err:
             logger.exception(
                 "[Job: %s/%s, Task: %s] Evaluation Error: %s",
@@ -453,6 +489,7 @@ async def run_job(
     agent_cls: type[Agent],
     task_or_taskset: Task | TaskSet,
     job_name: str,
+    auto_reply_question: bool = False,
     adapter_cls: type[Adapter] | None = None,
     agent_kwargs: dict[str, Any] | None = None,
     adapter_kwargs: dict[str, Any] | None = None,
@@ -461,8 +498,8 @@ async def run_job(
     job_metadata: dict[str, Any] | None = None,
     show_progress: bool = True,
     # Concurrency control with semaphores
-    max_concurrent_env_creations: int | None = 30,  # Limits
-    max_concurrent_agent_predictions: int | None =
+    max_concurrent_env_creations: int | None = 30,  # Limits gym.make calls
+    max_concurrent_agent_predictions: int | None = None,  # No limit on LLM calls
     max_concurrent_tasks: int | None = 30,  # Limits overall task concurrency
 ) -> Job:
     """
@@ -495,12 +532,16 @@ async def run_job(
     Returns:
         The created Job object with errors stored in job.errors.
     """
+    hud_logger = logging.getLogger("hud")
+    hud_logger.setLevel(logging.CRITICAL)
+
     tasks_to_run: list[Task] = []
     created_job: Job | None = None
 
     evalset_id = None
     if isinstance(task_or_taskset, TaskSet):
         evalset_id = task_or_taskset.id
+        await task_or_taskset.fit(agent_cls)
 
     gym_id = None
     if isinstance(task_or_taskset, Task):
@@ -519,7 +560,7 @@ async def run_job(
             evalset_id=evalset_id,
             gym_id=gym_id,
         )
-        logger.info("Created job with ID: %s", created_job.id)
+        # logger.info("Created job with ID: %s", created_job.id)
     except Exception as e:
         logger.exception("Failed to create job '%s': %s", job_name, e)
         raise
@@ -555,6 +596,8 @@ async def run_job(
         logger.info(
             "Limiting concurrent agent predictions to %d.", max_concurrent_agent_predictions
         )
+    else:
+        logger.info("No limit on concurrent agent predictions.")
 
     task_execution_sema = None
     effective_concurrency = num_tasks  # Default to running all if parallel
@@ -606,6 +649,7 @@ async def run_job(
                 tracker=tracker,
                 env_creation_semaphore=env_creation_sema,
                 agent_predict_semaphore=agent_predict_sema,
+                auto_reply_question=auto_reply_question,
             )
             for task, task_id in zip(tasks_to_run, task_ids, strict=True)
         ]
@@ -641,6 +685,7 @@ async def run_job(
                 tracker=tracker,
                 env_creation_semaphore=env_creation_sema,
                 agent_predict_semaphore=agent_predict_sema,
+                auto_reply_question=auto_reply_question,
             )
 
     finally:
hud/settings.py
CHANGED

@@ -38,6 +38,12 @@ class Settings(BaseSettings):
         validation_alias="OPENAI_API_KEY",
     )
 
+    telemetry_enabled: bool = Field(
+        default=True,
+        description="Enable telemetry for the HUD SDK",
+        validation_alias="TELEMETRY_ENABLED",
+    )
+
 
 # Create a singleton instance
 settings = Settings()
hud/task.py
CHANGED

@@ -1,7 +1,10 @@
 from __future__ import annotations
 
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
+from inspect_ai.util._sandbox import SandboxEnvironmentSpec
 from pydantic import BaseModel
 
 from hud.types import CustomGym, Gym
@@ -10,11 +13,7 @@ from hud.utils.common import FunctionConfig, FunctionConfigs
 if TYPE_CHECKING:
     from inspect_ai.dataset import Sample
 
-
-# These represent the environment as a whole, including both the controller
-# and the environment type (eg, what os, which services are running)
-
-UBUNTU_DOCKERFILE = "ubuntu:latest"
+    from hud.agent import Agent
 
 
 def convert_inspect_setup(setup: str) -> list[FunctionConfig]:
@@ -57,6 +56,12 @@ class Task(BaseModel):
     gym: Gym | None = None
     config: dict[str, Any] | None = None
 
+    description: str | None = None
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> Task:
+        return cls(**data)
+
     @classmethod
     def from_inspect_sample(cls, sample: Sample) -> Task:
         """Create a Task from an Inspect dataset sample.
@@ -91,38 +96,37 @@ class Task(BaseModel):
         evaluate_config = None
         if sample.target:
             if isinstance(sample.target, str):
-                evaluate_config = ("response_includes", [sample.target])
+                evaluate_config = FunctionConfig(function="response_includes", args=[sample.target])
             elif isinstance(sample.target, list):
-                evaluate_config = ("match_all", sample.target)
+                evaluate_config = FunctionConfig(function="match_all", args=sample.target)
 
-
-
+        task_setup: FunctionConfigs | None = (
+            convert_inspect_setup(sample.setup) if sample.setup else None
+        )
 
         sandbox = sample.sandbox
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
-        # TODO: Handle sample.files for CustomGym case if needed
+
+        match sandbox:
+            case "docker":
+                task_gym = CustomGym(
+                    image_or_build_context="ubuntu:latest",
+                    location="local",
+                )
+            case SandboxEnvironmentSpec(type="docker", config=str()):
+                # create temp dir and put dockerfile there, then use that path
+                temp_dir = tempfile.mkdtemp()
+                temp_dir_path = Path(temp_dir)
+                dockerfile_path = temp_dir_path / "Dockerfile"
+                dockerfile_path.write_text(sandbox.config)
+                task_gym = CustomGym(
+                    image_or_build_context=temp_dir_path,
+                    location="local",
+                )
+            case None:
+                task_gym = "qa"
+                task_setup = None
+            case _:
+                raise ValueError(f"Unsupported sandbox type: {sandbox}")
 
         return cls(
             id=None,
@@ -132,3 +136,11 @@ class Task(BaseModel):
             gym=task_gym,
             # files=sample.files,  # TODO: Decide how/if to handle files
         )
+
+    async def fit(self, agent: Agent | type[Agent]) -> None:
+        if isinstance(agent, type):
+            agent = agent()
+
+        if self.gym is None:
+            return
+        self.gym = agent.transfer_gyms.get(self.gym, self.gym)