PyPI - hud-python - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl - Mend

hud-python 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (51) hide show

hud/__init__.py +22 -2
hud/adapters/claude/adapter.py +9 -2
hud/adapters/claude/tests/__init__.py +1 -0
hud/adapters/claude/tests/test_adapter.py +519 -0
hud/adapters/common/types.py +5 -1
hud/adapters/operator/adapter.py +4 -0
hud/adapters/operator/tests/__init__.py +1 -0
hud/adapters/operator/tests/test_adapter.py +370 -0
hud/agent/__init__.py +4 -0
hud/agent/base.py +18 -2
hud/agent/claude.py +20 -17
hud/agent/claude_plays_pokemon.py +283 -0
hud/agent/langchain.py +12 -7
hud/agent/misc/__init__.py +3 -0
hud/agent/misc/response_agent.py +80 -0
hud/agent/operator.py +27 -19
hud/agent/tests/__init__.py +1 -0
hud/agent/tests/test_base.py +202 -0
hud/env/docker_client.py +28 -18
hud/env/environment.py +32 -16
hud/env/local_docker_client.py +83 -42
hud/env/remote_client.py +1 -3
hud/env/remote_docker_client.py +71 -14
hud/exceptions.py +12 -0
hud/gym.py +71 -53
hud/job.py +59 -14
hud/server/requests.py +26 -4
hud/settings.py +7 -1
hud/task.py +45 -33
hud/taskset.py +56 -4
hud/telemetry/__init__.py +21 -0
hud/telemetry/_trace.py +173 -0
hud/telemetry/context.py +169 -0
hud/telemetry/exporter.py +417 -0
hud/telemetry/instrumentation/__init__.py +3 -0
hud/telemetry/instrumentation/mcp.py +495 -0
hud/telemetry/instrumentation/registry.py +59 -0
hud/telemetry/mcp_models.py +331 -0
hud/telemetry/tests/__init__.py +1 -0
hud/telemetry/tests/test_context.py +207 -0
hud/telemetry/tests/test_trace.py +270 -0
hud/types.py +11 -27
hud/utils/common.py +22 -2
hud/utils/misc.py +53 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +7 -0
{hud_python-0.2.4.dist-info → hud_python-0.2.6.dist-info}/METADATA +98 -30
hud_python-0.2.6.dist-info/RECORD +84 -0
hud_python-0.2.4.dist-info/RECORD +0 -62
{hud_python-0.2.4.dist-info → hud_python-0.2.6.dist-info}/WHEEL +0 -0
{hud_python-0.2.4.dist-info → hud_python-0.2.6.dist-info}/licenses/LICENSE +0 -0

hud/task.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from __future__ import annotations
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
+from inspect_ai.util._sandbox import SandboxEnvironmentSpec
 from pydantic import BaseModel
 from hud.types import CustomGym, Gym
@@ -10,11 +13,7 @@ from hud.utils.common import FunctionConfig, FunctionConfigs
 if TYPE_CHECKING:
     from inspect_ai.dataset import Sample
-# Environment specifications:
-# These represent the environment as a whole, including both the controller
-# and the environment type (eg, what os, which services are running)
-UBUNTU_DOCKERFILE = "ubuntu:latest"
+    from hud.agent import Agent
 def convert_inspect_setup(setup: str) -> list[FunctionConfig]:
@@ -57,6 +56,12 @@ class Task(BaseModel):
     gym: Gym | None = None
     config: dict[str, Any] | None = None
+    description: str | None = None
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> Task:
+        return cls(**data)
     @classmethod
     def from_inspect_sample(cls, sample: Sample) -> Task:
         """Create a Task from an Inspect dataset sample.
@@ -91,38 +96,37 @@ class Task(BaseModel):
         evaluate_config = None
         if sample.target:
             if isinstance(sample.target, str):
-                evaluate_config = ("response_includes", [sample.target])
+                evaluate_config = FunctionConfig(function="response_includes", args=[sample.target])
             elif isinstance(sample.target, list):
-                evaluate_config = ("match_all", sample.target)
+                evaluate_config = FunctionConfig(function="match_all", args=sample.target)
-        task_gym: Gym | None = None
-        task_setup: FunctionConfigs | None = None
+        task_setup: FunctionConfigs | None = (
+            convert_inspect_setup(sample.setup) if sample.setup else None
+        )
         sandbox = sample.sandbox
-        dockerfile = None
-        use_qa_gym = True
-        if sandbox:
-            if isinstance(sandbox, str):
-                if sandbox == "docker":
-                    dockerfile = UBUNTU_DOCKERFILE
-                    use_qa_gym = False
-            elif isinstance(sandbox, tuple) and len(sandbox) == 2:
-                sandbox_type, sandbox_config = sandbox
-                if sandbox_type == "docker":
-                    dockerfile = sandbox_config
-                    use_qa_gym = False
-        if use_qa_gym:
-            task_gym = "qa"
-            task_setup = None
-        else:
-            task_gym = CustomGym(
-                dockerfile=dockerfile or UBUNTU_DOCKERFILE,
-                location="local",
-            )
-            task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
-            # TODO: Handle sample.files for CustomGym case if needed
+        match sandbox:
+            case "docker":
+                task_gym = CustomGym(
+                    image_or_build_context="ubuntu:latest",
+                    location="local",
+                )
+            case SandboxEnvironmentSpec(type="docker", config=str()):
+                # create temp dir and put dockerfile there, then use that path
+                temp_dir = tempfile.mkdtemp()
+                temp_dir_path = Path(temp_dir)
+                dockerfile_path = temp_dir_path / "Dockerfile"
+                dockerfile_path.write_text(sandbox.config)
+                task_gym = CustomGym(
+                    image_or_build_context=temp_dir_path,
+                    location="local",
+                )
+            case None:
+                task_gym = "qa"
+                task_setup = None
+            case _:
+                raise ValueError(f"Unsupported sandbox type: {sandbox}")
         return cls(
             id=None,
@@ -132,3 +136,11 @@ class Task(BaseModel):
             gym=task_gym,
             # files=sample.files, # TODO: Decide how/if to handle files
         )
+    async def fit(self, agent: Agent | type[Agent]) -> None:
+        if isinstance(agent, type):
+            agent = agent()
+        if self.gym is None:
+            return
+        self.gym = agent.transfer_gyms.get(self.gym, self.gym)

hud/taskset.py CHANGED Viewed

@@ -5,15 +5,19 @@ from venv import logger
 from pydantic import BaseModel
+from hud.env.environment import create_remote_config
 from hud.server import make_request
 from hud.settings import settings
 from hud.task import Task
+from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
 if TYPE_CHECKING:
     from collections.abc import Iterator
     from inspect_ai.dataset import Dataset
+    from hud.agent import Agent
 class TaskSet(BaseModel):
     """
@@ -21,11 +25,13 @@ class TaskSet(BaseModel):
     Attributes:
         id: Unique identifier for the taskset
+        name: Name of the taskset
         description: Description of the taskset
         tasks: List of Task objects in the taskset
     """
     id: str | None = None
+    name: str | None = None
     description: str | None = None
     tasks: list[Task] = []
@@ -61,16 +67,50 @@ class TaskSet(BaseModel):
     async def upload(
         self,
-        name: str,
+        name: str | None = None,
         description: str | None = None,
         api_key: str | None = None,
     ) -> None:
         """
         Uploads the taskset to the server.
         """
+        if name is None:
+            name = self.name
+        if name is None:
+            raise ValueError("Taskset name is required")
         if api_key is None:
             api_key = settings.api_key
+        # Convert all tasks to expanded configs
+        processed_tasks = []
+        for task in self.tasks:
+            if task.setup is not None:
+                setup_config = (
+                    create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0].model_dump()
+                )
+            else:
+                setup_config = None
+            if task.evaluate is not None:
+                evaluate_config = (
+                    create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0]
+                    .args[0]
+                    .model_dump()
+                )
+            else:
+                evaluate_config = None
+            processed_tasks.append(
+                {
+                    "prompt": task.prompt,
+                    "gym": task.gym,
+                    "setup": setup_config,
+                    "evaluate": evaluate_config,
+                    "config": task.config,
+                }
+            )
         await make_request(
             method="POST",
             url=f"{settings.base_url}/v2/tasksets",
@@ -78,13 +118,25 @@ class TaskSet(BaseModel):
             json={
                 "name": name,
                 "description": description,
-                "tasks": [task.model_dump() for task in self.tasks],
+                "tasks": processed_tasks,
             },
         )
         logger.info(
-            "[HUD] Taskset %s uploaded successfully, see it on app.hud.so/tasksets/%s", name, name
+            "Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
         )
+    async def fit(self, agent: Agent | type[Agent]) -> None:
+        """
+        Automatically adapts the taskset to the agent's transfer_gyms.
+        """
+        if isinstance(agent, type):
+            agent = agent()
+        for task in self.tasks:
+            if task.gym is None:
+                continue
+            task.gym = agent.transfer_gyms.get(task.gym, task.gym)
 async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     """
@@ -107,7 +159,7 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
         api_key=api_key,
     )
-    logger.info(f"[HUD] Taskset {taskset_id} loaded successfully")
+    logger.info(f"Taskset {taskset_id} loaded successfully")
     return TaskSet.model_validate(
         {

hud/telemetry/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""
+HUD telemetry module for capturing and reporting telemetry data from MCP calls.
+This module provides functionality to trace MCP calls and export telemetry data
+to the HUD platform for analysis.
+"""
+from __future__ import annotations
+from hud.telemetry._trace import init_telemetry, register_trace, trace
+from hud.telemetry.context import get_current_task_run_id, set_current_task_run_id
+from hud.telemetry.exporter import flush
+__all__ = [
+    "flush",
+    "get_current_task_run_id",
+    "init_telemetry",
+    "register_trace",
+    "set_current_task_run_id",
+    "trace",
+]

hud/telemetry/_trace.py ADDED Viewed

@@ -0,0 +1,173 @@
+from __future__ import annotations
+import asyncio
+import logging
+import time
+import uuid
+from contextlib import contextmanager
+from functools import wraps
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ParamSpec,
+    TypeVar,
+    overload,
+)
+from hud.telemetry import exporter
+from hud.telemetry.context import (
+    flush_buffer,
+    get_current_task_run_id,
+    is_root_trace,
+    set_current_task_run_id,
+)
+from hud.telemetry.exporter import submit_to_worker_loop
+from hud.telemetry.instrumentation.registry import registry
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Coroutine,
+        Generator,
+    )
+    from hud.telemetry.mcp_models import BaseMCPCall
+logger = logging.getLogger("hud.telemetry")
+T = TypeVar("T")
+def init_telemetry() -> None:
+    """Initialize telemetry instrumentors and ensure worker is started if telemetry is active."""
+    registry.install_all()
+    logger.info("Telemetry initialized.")
+@contextmanager
+def trace(
+    name: str | None = None,
+    attributes: dict[str, Any] | None = None,
+) -> Generator[str, None, None]:
+    """
+    Context manager for tracing a block of code.
+    The task_run_id is always generated internally as a UUID.
+    Telemetry export is handled by a background worker thread.
+    Args:
+        attributes: Optional dictionary of attributes to associate with this trace
+        name: Optional name for this trace, will be added to attributes.
+    Returns:
+        The generated task run ID (UUID string) used for this trace
+    """
+    task_run_id = str(uuid.uuid4())
+    local_attributes = attributes.copy() if attributes is not None else {}
+    if name is not None:
+        local_attributes["trace_name"] = name
+    start_time = time.time()
+    logger.debug("Starting trace %s (Name: %s)", task_run_id, name if name else "Unnamed")
+    previous_task_id = get_current_task_run_id()
+    was_root = is_root_trace.get()
+    set_current_task_run_id(task_run_id)
+    is_root = previous_task_id is None
+    is_root_trace.set(is_root)
+    try:
+        yield task_run_id
+    finally:
+        end_time = time.time()
+        duration = end_time - start_time
+        mcp_calls: list[BaseMCPCall] = flush_buffer()
+        trace_attributes_final = {
+            **local_attributes,
+            "start_time": start_time,
+            "end_time": end_time,
+            "duration": duration,
+            "is_root": is_root,
+        }
+        if is_root and mcp_calls:
+            try:
+                coro_to_submit = exporter.export_telemetry(
+                    task_run_id=task_run_id,
+                    trace_attributes=trace_attributes_final,
+                    mcp_calls=mcp_calls,
+                )
+                future = submit_to_worker_loop(coro_to_submit)
+                if future:
+                    logger.debug(
+                        "Telemetry for trace %s submitted to background worker.", task_run_id
+                    )
+                else:
+                    logger.warning(
+                        "Failed to submit telemetry for trace %s to"
+                        "background worker (loop not available).",
+                        task_run_id,
+                    )
+            except Exception as e:
+                logger.warning("Failed to submit telemetry for trace %s: %s", task_run_id, e)
+        set_current_task_run_id(previous_task_id)
+        is_root_trace.set(was_root)
+        logger.debug(
+            "Ended trace %s (Name: %s) with %d MCP call(s)",
+            task_run_id,
+            name if name else "Unnamed",
+            len(mcp_calls),
+        )
+        logger.info("View trace at https://app.hud.so/jobs/traces/%s", task_run_id)
+P = ParamSpec("P")
+R = TypeVar("R")
+def register_trace(
+    name: str | None = None, attributes: dict[str, Any] | None = None
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Decorator to wrap a synchronous or asynchronous function call
+    within a hud._telemetry.trace context.
+    Args:
+        name: Optional name for the trace.
+        attributes: Optional dictionary of attributes for the trace.
+    """
+    @overload
+    def decorator(
+        func: Callable[P, Coroutine[Any, Any, R]],
+    ) -> Callable[P, Coroutine[Any, Any, R]]: ...
+    @overload
+    def decorator(func: Callable[P, R]) -> Callable[P, R]: ...
+    def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
+        if asyncio.iscoroutinefunction(func):
+            @wraps(func)
+            async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+                effective_name = name if name else func.__name__
+                with trace(name=effective_name, attributes=attributes):
+                    return await func(*args, **kwargs)
+            return async_wrapper
+        else:
+            @wraps(func)
+            def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+                effective_name = name if name else func.__name__
+                with trace(name=effective_name, attributes=attributes):
+                    return func(*args, **kwargs)
+            return sync_wrapper
+    return decorator

hud/telemetry/context.py ADDED Viewed

@@ -0,0 +1,169 @@
+from __future__ import annotations
+import contextvars
+import logging
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, TypeVar
+from hud.telemetry.mcp_models import (
+    BaseMCPCall,
+    MCPManualTestCall,
+    MCPNotificationCall,
+    MCPRequestCall,
+    MCPResponseCall,
+    StatusType,
+)
+logger = logging.getLogger("hud.telemetry")
+# Context variables for tracing
+current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "current_task_run_id", default=None
+)
+# NEW: Global dictionary for buffering, keyed by task_run_id
+_GLOBAL_MCP_CALL_BUFFERS: defaultdict[str, list[BaseMCPCall]] = defaultdict(list)
+is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
+# Maximum buffer size before automatic flush
+MAX_BUFFER_SIZE = 100
+# Type variable for record factories
+T = TypeVar("T", bound=BaseMCPCall)
+def get_current_task_run_id() -> str | None:
+    """Get the task_run_id for the current trace context."""
+    return current_task_run_id.get()
+def set_current_task_run_id(task_run_id: str | None) -> None:
+    """Set the task_run_id for the current trace context."""
+    current_task_run_id.set(task_run_id)
+def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning(
+            "BUFFER_MCP_CALL: No task_run_id. Skipping buffer for %s", type(record).__name__
+        )
+        return
+    # Ensure 'record' is a Pydantic model instance from here
+    if isinstance(record, dict):
+        try:
+            record_model = BaseMCPCall.from_dict(record)
+            record = record_model
+        except Exception as e_conv:
+            logger.exception("BUFFER_MCP_CALL: Failed to convert dict to BaseMCPCall: %s", e_conv)
+            return
+    _GLOBAL_MCP_CALL_BUFFERS[task_run_id].append(record)
+    buffer_len = len(_GLOBAL_MCP_CALL_BUFFERS[task_run_id])
+    if buffer_len >= MAX_BUFFER_SIZE:
+        flush_buffer(export=True)
+def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
+    """
+    Clear the MCP calls buffer and return its contents.
+    Args:
+        export: Whether to trigger export of this buffer
+    Returns:
+        The list of buffered MCP calls
+    """
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning("FLUSH_BUFFER: No current task_run_id. Cannot flush.")
+        return []
+    buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(
+        task_run_id, []
+    )  # Get and remove the list for this task
+    return buffer_for_task  # Return the flushed items
+def create_request_record(
+    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
+) -> MCPRequestCall:
+    """Create and buffer a request record"""
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning("No active task_run_id, request record will not be created")
+        raise ValueError("No active task_run_id")
+    record = MCPRequestCall(
+        task_run_id=task_run_id,
+        method=method,
+        status=status,
+        start_time=kwargs.pop("start_time", None) or datetime.now().timestamp(),
+        **kwargs,
+    )
+    buffer_mcp_call(record)
+    return record
+def create_response_record(
+    method: str, related_request_id: str | int | None = None, is_error: bool = False, **kwargs: Any
+) -> MCPResponseCall:
+    """Create and buffer a response record"""
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning("No active task_run_id, response record will not be created")
+        raise ValueError("No active task_run_id")
+    record = MCPResponseCall(
+        task_run_id=task_run_id,
+        method=method,
+        status=StatusType.COMPLETED,
+        related_request_id=related_request_id,
+        is_error=is_error,
+        **kwargs,
+    )
+    buffer_mcp_call(record)
+    return record
+def create_notification_record(
+    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
+) -> MCPNotificationCall:
+    """Create and buffer a notification record"""
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning("No active task_run_id, notification record will not be created")
+        raise ValueError("No active task_run_id")
+    record = MCPNotificationCall(
+        task_run_id=task_run_id,
+        method=method,
+        status=status,
+        start_time=kwargs.pop("start_time", None) or datetime.now().timestamp(),
+        **kwargs,
+    )
+    buffer_mcp_call(record)
+    return record
+def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
+    """Create and buffer a manual test record"""
+    task_run_id = get_current_task_run_id()
+    if not task_run_id:
+        logger.warning("No active task_run_id, manual test record will not be created")
+        return None
+    record = MCPManualTestCall.create(task_run_id=task_run_id, **custom_data)
+    buffer_mcp_call(record)
+    return record
+def reset_context() -> None:
+    """Reset all telemetry context variables. Useful for test isolation."""
+    set_current_task_run_id(None)
+    is_root_trace.set(False)

hud-python 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl