PyPI - hud-python - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (50) hide show

hud/__init__.py +22 -2
hud/adapters/claude/adapter.py +9 -2
hud/adapters/claude/tests/__init__.py +1 -0
hud/adapters/claude/tests/test_adapter.py +519 -0
hud/adapters/common/types.py +5 -1
hud/adapters/operator/adapter.py +4 -0
hud/adapters/operator/tests/__init__.py +1 -0
hud/adapters/operator/tests/test_adapter.py +370 -0
hud/agent/__init__.py +4 -0
hud/agent/base.py +18 -2
hud/agent/claude.py +20 -17
hud/agent/claude_plays_pokemon.py +282 -0
hud/agent/langchain.py +12 -7
hud/agent/misc/__init__.py +3 -0
hud/agent/misc/response_agent.py +80 -0
hud/agent/operator.py +27 -19
hud/agent/tests/__init__.py +1 -0
hud/agent/tests/test_base.py +202 -0
hud/env/docker_client.py +28 -18
hud/env/environment.py +33 -17
hud/env/local_docker_client.py +83 -42
hud/env/remote_client.py +1 -3
hud/env/remote_docker_client.py +72 -15
hud/exceptions.py +12 -0
hud/gym.py +71 -53
hud/job.py +52 -7
hud/settings.py +6 -0
hud/task.py +45 -33
hud/taskset.py +44 -4
hud/telemetry/__init__.py +21 -0
hud/telemetry/_trace.py +173 -0
hud/telemetry/context.py +193 -0
hud/telemetry/exporter.py +417 -0
hud/telemetry/instrumentation/__init__.py +3 -0
hud/telemetry/instrumentation/mcp.py +498 -0
hud/telemetry/instrumentation/registry.py +59 -0
hud/telemetry/mcp_models.py +331 -0
hud/telemetry/tests/__init__.py +1 -0
hud/telemetry/tests/test_context.py +203 -0
hud/telemetry/tests/test_trace.py +270 -0
hud/types.py +10 -26
hud/utils/common.py +22 -2
hud/utils/misc.py +53 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +7 -0
{hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
hud_python-0.2.5.dist-info/RECORD +84 -0
hud_python-0.2.3.dist-info/RECORD +0 -62
{hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
{hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0

hud/taskset.py CHANGED Viewed

@@ -5,15 +5,19 @@ from venv import logger
 from pydantic import BaseModel
+from hud.env.environment import create_remote_config
 from hud.server import make_request
 from hud.settings import settings
 from hud.task import Task
+from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
 if TYPE_CHECKING:
     from collections.abc import Iterator
     from inspect_ai.dataset import Dataset
+    from hud.agent import Agent
 class TaskSet(BaseModel):
     """
@@ -21,11 +25,13 @@ class TaskSet(BaseModel):
     Attributes:
         id: Unique identifier for the taskset
+        name: Name of the taskset
         description: Description of the taskset
         tasks: List of Task objects in the taskset
     """
     id: str | None = None
+    name: str | None = None
     description: str | None = None
     tasks: list[Task] = []
@@ -61,16 +67,38 @@ class TaskSet(BaseModel):
     async def upload(
         self,
-        name: str,
+        name: str | None = None,
         description: str | None = None,
         api_key: str | None = None,
     ) -> None:
         """
         Uploads the taskset to the server.
         """
+        if name is None:
+            name = self.name
+        if name is None:
+            raise ValueError("Taskset name is required")
         if api_key is None:
             api_key = settings.api_key
+        # Convert all tasks to expanded configs
+        processed_tasks = []
+        for task in self.tasks:
+            setup_config = create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0]
+            evaluate_config = create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0].args[0]
+            processed_tasks.append(
+                {
+                    "prompt": task.prompt,
+                    "gym": task.gym,
+                    "setup": setup_config.model_dump(),
+                    "evaluate": evaluate_config.model_dump(),
+                    "config": task.config,
+                }
+            )
         await make_request(
             method="POST",
             url=f"{settings.base_url}/v2/tasksets",
@@ -78,13 +106,25 @@ class TaskSet(BaseModel):
             json={
                 "name": name,
                 "description": description,
-                "tasks": [task.model_dump() for task in self.tasks],
+                "tasks": processed_tasks,
             },
         )
         logger.info(
-            "[HUD] Taskset %s uploaded successfully, see it on app.hud.so/tasksets/%s", name, name
+            "Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
         )
+    async def fit(self, agent: Agent | type[Agent]) -> None:
+        """
+        Remap each task's gym through the agent's ``transfer_gyms`` mapping.
+        Args:
+            agent: An agent instance, or an agent class, which is instantiated
+                with no arguments before use.
+        Tasks whose gym is None are left untouched; a gym without an entry in
+        ``transfer_gyms`` keeps its current value.
+        """
+        resolved_agent = agent() if isinstance(agent, type) else agent
+        for task in self.tasks:
+            current_gym = task.gym
+            if current_gym is not None:
+                # Fall back to the current gym when the agent defines no transfer for it
+                task.gym = resolved_agent.transfer_gyms.get(current_gym, current_gym)
 async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     """
@@ -107,7 +147,7 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
         api_key=api_key,
     )
-    logger.info(f"[HUD] Taskset {taskset_id} loaded successfully")
+    logger.info(f"Taskset {taskset_id} loaded successfully")
     return TaskSet.model_validate(
         {

hud/telemetry/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""
+HUD telemetry module for capturing and reporting telemetry data from MCP calls.
+This module provides functionality to trace MCP calls and export telemetry data
+to the HUD platform for analysis.
+"""
+from __future__ import annotations
+from hud.telemetry._trace import init_telemetry, register_trace, trace
+from hud.telemetry.context import get_current_task_run_id, set_current_task_run_id
+from hud.telemetry.exporter import flush
+# Public API of hud.telemetry: every name listed here is imported above from
+# the _trace, context or exporter submodule and re-exported by this package.
+__all__ = [
+    "flush",
+    "get_current_task_run_id",
+    "init_telemetry",
+    "register_trace",
+    "set_current_task_run_id",
+    "trace",
+]

hud/telemetry/_trace.py ADDED Viewed

@@ -0,0 +1,173 @@
+from __future__ import annotations
+import asyncio
+import logging
+import time
+import uuid
+from contextlib import contextmanager
+from functools import wraps
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ParamSpec,
+    TypeVar,
+    overload,
+)
+from hud.telemetry import exporter
+from hud.telemetry.context import (
+    flush_buffer,
+    get_current_task_run_id,
+    is_root_trace,
+    set_current_task_run_id,
+)
+from hud.telemetry.exporter import submit_to_worker_loop
+from hud.telemetry.instrumentation.registry import registry
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Coroutine,
+        Generator,
+    )
+    from hud.telemetry.mcp_models import BaseMCPCall
+# Logger named "hud.telemetry"; used for all messages emitted by this module
+logger = logging.getLogger("hud.telemetry")
+T = TypeVar("T")
+def init_telemetry() -> None:
+    """
+    Install every instrumentor known to the telemetry registry.
+    Calls ``registry.install_all()`` and then logs a confirmation message.
+    NOTE(review): this docstring used to state that the export worker is
+    started here as well; nothing in this function does that. Presumably the
+    exporter starts its worker on first submission - TODO confirm.
+    """
+    registry.install_all()
+    logger.info("Telemetry initialized.")
+@contextmanager
+def trace(
+    name: str | None = None,
+    attributes: dict[str, Any] | None = None,
+) -> Generator[str, None, None]:
+    """
+    Context manager for tracing a block of code.
+    A fresh UUID is generated as the task_run_id of every trace and installed as
+    the current task run ID while the block runs. On exit the buffered MCP calls
+    are flushed; when this is a root trace (no trace was active on entry) and at
+    least one call was buffered, they are handed to ``submit_to_worker_loop`` for
+    export. The previous task run ID and root flag are restored afterwards.
+    Submission failures are logged as warnings and never raised to the caller.
+    Args:
+        name: Optional name for this trace; stored in the attributes as "trace_name".
+        attributes: Optional dictionary of attributes to associate with this
+            trace. The dictionary is copied, so the caller's object is not mutated.
+    Yields:
+        The generated task run ID (UUID string) used for this trace.
+    """
+    task_run_id = str(uuid.uuid4())
+    # Work on a copy so "trace_name" is never written into the caller's dictionary
+    local_attributes = attributes.copy() if attributes is not None else {}
+    if name is not None:
+        local_attributes["trace_name"] = name
+    start_time = time.time()
+    logger.debug("Starting trace %s (Name: %s)", task_run_id, name if name else "Unnamed")
+    # Remember the enclosing trace's state so it can be restored on exit
+    previous_task_id = get_current_task_run_id()
+    was_root = is_root_trace.get()
+    set_current_task_run_id(task_run_id)
+    # A trace is a root trace when no other trace was active on entry
+    is_root = previous_task_id is None
+    is_root_trace.set(is_root)
+    try:
+        yield task_run_id
+    finally:
+        end_time = time.time()
+        duration = end_time - start_time
+        mcp_calls: list[BaseMCPCall] = flush_buffer()
+        trace_attributes_final = {
+            **local_attributes,
+            "start_time": start_time,
+            "end_time": end_time,
+            "duration": duration,
+            "is_root": is_root,
+        }
+        # Only root traces with at least one recorded call are exported
+        if is_root and mcp_calls:
+            try:
+                coro_to_submit = exporter.export_telemetry(
+                    task_run_id=task_run_id,
+                    trace_attributes=trace_attributes_final,
+                    mcp_calls=mcp_calls,
+                )
+                future = submit_to_worker_loop(coro_to_submit)
+                if future:
+                    logger.debug(
+                        "Telemetry for trace %s submitted to background worker.", task_run_id
+                    )
+                else:
+                    # Trailing space matters: the two literals are concatenated
+                    # (it used to render as "tobackground worker")
+                    logger.warning(
+                        "Failed to submit telemetry for trace %s to "
+                        "background worker (loop not available).",
+                        task_run_id,
+                    )
+            except Exception as e:
+                # Best effort: telemetry problems must not break the traced code
+                logger.warning("Failed to submit telemetry for trace %s: %s", task_run_id, e)
+        # Restore the enclosing trace's context
+        set_current_task_run_id(previous_task_id)
+        is_root_trace.set(was_root)
+        logger.debug(
+            "Ended trace %s (Name: %s) with %d MCP call(s)",
+            task_run_id,
+            name if name else "Unnamed",
+            len(mcp_calls),
+        )
+        logger.info("View trace at https://app.hud.so/jobs/traces/%s", task_run_id)
+P = ParamSpec("P")
+R = TypeVar("R")
+def register_trace(
+    name: str | None = None, attributes: dict[str, Any] | None = None
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Build a decorator that runs the wrapped callable inside a ``trace`` context.
+    Both plain functions and coroutine functions are supported; which wrapper
+    is used is decided once, when the decorator is applied.
+    Args:
+        name: Optional name for the trace; when empty or None the wrapped
+            function's ``__name__`` is used.
+        attributes: Optional dictionary of attributes for the trace.
+    Returns:
+        A decorator producing a wrapper of the same kind (sync or async) as the
+        function it receives.
+    """
+    @overload
+    def decorator(
+        func: Callable[P, Coroutine[Any, Any, R]],
+    ) -> Callable[P, Coroutine[Any, Any, R]]: ...
+    @overload
+    def decorator(func: Callable[P, R]) -> Callable[P, R]: ...
+    def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
+        if not asyncio.iscoroutinefunction(func):
+            @wraps(func)
+            def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+                with trace(name=name or func.__name__, attributes=attributes):
+                    return func(*args, **kwargs)
+            return sync_wrapper
+        @wraps(func)
+        async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+            # The trace stays open until the awaited coroutine has finished
+            with trace(name=name or func.__name__, attributes=attributes):
+                return await func(*args, **kwargs)
+        return async_wrapper
+    return decorator

hud/telemetry/context.py ADDED Viewed

@@ -0,0 +1,193 @@
+from __future__ import annotations
+import contextvars
+import logging
+from datetime import datetime
+from typing import Any, TypeVar
+from hud.telemetry.mcp_models import (
+    BaseMCPCall,
+    MCPManualTestCall,
+    MCPNotificationCall,
+    MCPRequestCall,
+    MCPResponseCall,
+    MCPTelemetryRecord,
+    StatusType,
+)
+# Logger named "hud.telemetry"; used for all messages emitted by this module
+logger = logging.getLogger("hud.telemetry")
+# Context variables for tracing (each holds a value local to the current execution context)
+# ID of the active trace. None, or the "" sentinel written by
+# set_current_task_run_id, means that no trace is active.
+current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "current_task_run_id", default=None
+)
+# MCP call records collected for the active trace; stays None until the first
+# record is buffered or the buffer is flushed.
+mcp_calls_buffer: contextvars.ContextVar[list[BaseMCPCall] | None] = contextvars.ContextVar(
+    "mcp_calls_buffer", default=None
+)
+# Root-trace flag; this module only resets it (see reset_context).
+# NOTE(review): presumably maintained by the trace context manager - confirm.
+is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
+# Maximum buffer size before automatic flush: buffer_mcp_call flushes as soon as
+# the buffer holds at least this many records
+MAX_BUFFER_SIZE = 100
+# Type variable for record factories
+T = TypeVar("T", bound=BaseMCPCall)
+def get_current_task_run_id() -> str | None:
+    """Return the task_run_id of the active trace, or None when no trace is active."""
+    stored = current_task_run_id.get()
+    # set_current_task_run_id stores "" in place of None; translate it back
+    if stored == "":
+        return None
+    return stored
+def set_current_task_run_id(task_run_id: str | None) -> None:
+    """Make task_run_id the active trace ID for the current context; None clears it."""
+    # None is stored as the empty-string sentinel that get_current_task_run_id
+    # converts back to None
+    current_task_run_id.set(task_run_id if task_run_id is not None else "")
+def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
+    """
+    Append an MCP call to the buffer of the active trace.
+    Nothing happens when no trace is active. Dictionaries are converted with
+    ``BaseMCPCall.from_dict``; a record carrying a different task_run_id is
+    rebuilt with the active one before it is stored.
+    Args:
+        record: Either a Pydantic model instance or dictionary with MCP call data
+    """
+    active_run_id = get_current_task_run_id()
+    if active_run_id is None or active_run_id == "":
+        # Only buffer while a trace is active
+        return
+    pending = mcp_calls_buffer.get()
+    if pending is None:
+        pending = []
+    call = BaseMCPCall.from_dict(record) if isinstance(record, dict) else record
+    if call.task_run_id != active_run_id:
+        # Rebuild the record so it is attributed to the active trace
+        payload = call.model_dump()
+        payload["task_run_id"] = active_run_id
+        call = BaseMCPCall.from_dict(payload)
+    pending.append(call)
+    mcp_calls_buffer.set(pending)
+    if len(pending) >= MAX_BUFFER_SIZE:
+        logger.debug("MCP calls buffer reached size %d, auto-flushing", len(pending))
+        # NOTE(review): the flushed list is discarded here and
+        # flush_buffer(export=True) only logs, so these records look like they
+        # never reach the trace's final export - confirm this is intended.
+        flush_buffer(export=True)
+def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
+    """
+    Empty the current context's MCP calls buffer and hand back what it held.
+    Args:
+        export: When True and the buffer is non-empty, an MCPTelemetryRecord is
+            built for the flushed calls and the fact is logged. Nothing is sent
+            anywhere from this function.
+    Returns:
+        The list of buffered MCP calls (empty when nothing was buffered)
+    """
+    flushed = mcp_calls_buffer.get()
+    if flushed is None:
+        flushed = []
+    # Install a fresh list so later appends do not touch the list being returned
+    mcp_calls_buffer.set([])
+    if not (export and flushed):
+        return flushed
+    # The first record decides which task run the batch is attributed to
+    run_id = flushed[0].task_run_id
+    if not run_id:
+        logger.warning("No task_run_id found in buffer, skipping export")
+        return flushed
+    logger.debug("Exporting %d MCP calls for task run %s", len(flushed), run_id)
+    # Placeholder for a future export hook: the record is built but only logged
+    _telemetry_record = MCPTelemetryRecord(task_run_id=run_id, records=flushed)
+    logger.debug("MCP telemetry record created with %d calls", len(flushed))
+    return flushed
+def create_request_record(
+    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
+) -> MCPRequestCall:
+    """
+    Build an MCPRequestCall for the active trace and add it to the buffer.
+    Args:
+        method: Value stored in the record's ``method`` field.
+        status: Status stored on the record (StatusType.STARTED by default).
+        **kwargs: Extra fields forwarded to MCPRequestCall. A truthy
+            "start_time" is used as given; otherwise the current timestamp is used.
+    Returns:
+        The record that was created and buffered.
+    Raises:
+        ValueError: If no trace is active in the current context.
+    """
+    active_run_id = get_current_task_run_id()
+    if not active_run_id:
+        logger.warning("No active task_run_id, request record will not be created")
+        raise ValueError("No active task_run_id")
+    # Pull start_time out of kwargs first so it is not passed twice below
+    started_at = kwargs.pop("start_time", None) or datetime.now().timestamp()
+    request_call = MCPRequestCall(
+        task_run_id=active_run_id,
+        method=method,
+        status=status,
+        start_time=started_at,
+        **kwargs,
+    )
+    buffer_mcp_call(request_call)
+    return request_call
+def create_response_record(
+    method: str, related_request_id: str | int | None = None, is_error: bool = False, **kwargs: Any
+) -> MCPResponseCall:
+    """
+    Build an MCPResponseCall for the active trace and add it to the buffer.
+    The record's status is always StatusType.COMPLETED.
+    Args:
+        method: Value stored in the record's ``method`` field.
+        related_request_id: Stored on the record as ``related_request_id``.
+        is_error: Stored on the record as ``is_error``.
+        **kwargs: Extra fields forwarded to MCPResponseCall.
+    Returns:
+        The record that was created and buffered.
+    Raises:
+        ValueError: If no trace is active in the current context.
+    """
+    active_run_id = get_current_task_run_id()
+    if not active_run_id:
+        logger.warning("No active task_run_id, response record will not be created")
+        raise ValueError("No active task_run_id")
+    response_call = MCPResponseCall(
+        task_run_id=active_run_id,
+        method=method,
+        status=StatusType.COMPLETED,
+        related_request_id=related_request_id,
+        is_error=is_error,
+        **kwargs,
+    )
+    buffer_mcp_call(response_call)
+    return response_call
+def create_notification_record(
+    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
+) -> MCPNotificationCall:
+    """
+    Build an MCPNotificationCall for the active trace and add it to the buffer.
+    Args:
+        method: Value stored in the record's ``method`` field.
+        status: Status stored on the record (StatusType.STARTED by default).
+        **kwargs: Extra fields forwarded to MCPNotificationCall. A truthy
+            "start_time" is used as given; otherwise the current timestamp is used.
+    Returns:
+        The record that was created and buffered.
+    Raises:
+        ValueError: If no trace is active in the current context.
+    """
+    active_run_id = get_current_task_run_id()
+    if not active_run_id:
+        logger.warning("No active task_run_id, notification record will not be created")
+        raise ValueError("No active task_run_id")
+    # Pull start_time out of kwargs first so it is not passed twice below
+    started_at = kwargs.pop("start_time", None) or datetime.now().timestamp()
+    notification_call = MCPNotificationCall(
+        task_run_id=active_run_id,
+        method=method,
+        status=status,
+        start_time=started_at,
+        **kwargs,
+    )
+    buffer_mcp_call(notification_call)
+    return notification_call
+def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
+    """
+    Build an MCPManualTestCall for the active trace and add it to the buffer.
+    Args:
+        **custom_data: Keyword arguments forwarded to ``MCPManualTestCall.create``.
+    Returns:
+        The buffered record, or None (after logging a warning) when no trace is
+        active. Unlike the other record factories this one does not raise.
+    """
+    active_run_id = get_current_task_run_id()
+    if active_run_id:
+        manual_call = MCPManualTestCall.create(task_run_id=active_run_id, **custom_data)
+        buffer_mcp_call(manual_call)
+        return manual_call
+    logger.warning("No active task_run_id, manual test record will not be created")
+    return None
+def reset_context() -> None:
+    """
+    Reset all telemetry context variables. Useful for test isolation.
+    After this call, in the current context, no trace is active, the MCP calls
+    buffer is an empty list and the root-trace flag is False.
+    """
+    # Clears the active trace ID (stored internally as the "" sentinel)
+    set_current_task_run_id(None)
+    # Buffered records are discarded here, not exported
+    mcp_calls_buffer.set([])
+    is_root_trace.set(False)

hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl