PyPI - hud-python - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

hud-python 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (50) hide show

hud/__init__.py +22 -2
hud/adapters/claude/adapter.py +9 -2
hud/adapters/claude/tests/__init__.py +1 -0
hud/adapters/claude/tests/test_adapter.py +519 -0
hud/adapters/common/types.py +5 -1
hud/adapters/operator/adapter.py +4 -0
hud/adapters/operator/tests/__init__.py +1 -0
hud/adapters/operator/tests/test_adapter.py +370 -0
hud/agent/__init__.py +4 -0
hud/agent/base.py +18 -2
hud/agent/claude.py +20 -17
hud/agent/claude_plays_pokemon.py +282 -0
hud/agent/langchain.py +12 -7
hud/agent/misc/__init__.py +3 -0
hud/agent/misc/response_agent.py +80 -0
hud/agent/operator.py +27 -19
hud/agent/tests/__init__.py +1 -0
hud/agent/tests/test_base.py +202 -0
hud/env/docker_client.py +28 -18
hud/env/environment.py +32 -16
hud/env/local_docker_client.py +83 -42
hud/env/remote_client.py +1 -3
hud/env/remote_docker_client.py +72 -15
hud/exceptions.py +12 -0
hud/gym.py +71 -53
hud/job.py +52 -7
hud/settings.py +6 -0
hud/task.py +45 -33
hud/taskset.py +44 -4
hud/telemetry/__init__.py +21 -0
hud/telemetry/_trace.py +173 -0
hud/telemetry/context.py +193 -0
hud/telemetry/exporter.py +417 -0
hud/telemetry/instrumentation/__init__.py +3 -0
hud/telemetry/instrumentation/mcp.py +498 -0
hud/telemetry/instrumentation/registry.py +59 -0
hud/telemetry/mcp_models.py +331 -0
hud/telemetry/tests/__init__.py +1 -0
hud/telemetry/tests/test_context.py +203 -0
hud/telemetry/tests/test_trace.py +270 -0
hud/types.py +10 -26
hud/utils/common.py +22 -2
hud/utils/misc.py +53 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +7 -0
{hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
hud_python-0.2.5.dist-info/RECORD +84 -0
hud_python-0.2.4.dist-info/RECORD +0 -62
{hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
{hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0

hud/telemetry/mcp_models.py ADDED Viewed

@@ -0,0 +1,331 @@
+from __future__ import annotations
+from datetime import datetime
+from enum import Enum
+from typing import TYPE_CHECKING, Any, ClassVar
+# Import MCP types
+from mcp.types import JSONRPCError, JSONRPCNotification, JSONRPCRequest, JSONRPCResponse
+from pydantic import BaseModel, Field, field_validator
+if TYPE_CHECKING:
+    from mcp.shared.message import SessionMessage
+class DirectionType(str, Enum):
+    """Direction of an MCP message (sent or received)
+
+    Mixes in ``str``, so each member compares equal to its string value.
+    """
+    SENT = "sent"
+    RECEIVED = "received"
+class StatusType(str, Enum):
+    """Status of an MCP operation
+
+    Mixes in ``str``, so each member compares equal to its string value.
+    """
+    STARTED = "started"
+    COMPLETED = "completed"
+    ERROR = "error"
+class MCPCallType(str, Enum):
+    """Known MCP call types
+
+    Members are ``str`` instances, so they can be used wherever a plain
+    ``call_type`` string is expected.
+    """
+    # NOTE(review): the values look like dotted names of the instrumented
+    # functions/streams - confirm against the instrumentation module.
+    SEND_REQUEST = "mcp.shared.session.send_request"
+    SEND_NOTIFICATION = "mcp.shared.session.send_notification"
+    RECEIVE_RESPONSE = "mcp.shared.session.receive_response"
+    RECEIVE_REQUEST = "mcp.shared.session.receive_request"
+    STREAM_READ = "mcp.stream.read"
+    STREAM_WRITE = "mcp.stream.write"
+    HANDLE_INCOMING = "mcp.handle_incoming"
+    MANUAL_TEST = "manual.test"
+class BaseMCPCall(BaseModel):
+    """Base model for all MCP telemetry records"""
+    task_run_id: str
+    call_type: str
+    timestamp: float = Field(default_factory=lambda: datetime.now().timestamp())
+    method: str = "unknown_method"
+    status: StatusType
+    direction: DirectionType | None = None
+    # Additional data that might be useful for any call
+    message_id: str | int | None = None
+    # Mapping of call types to model classes - to be populated by subclasses
+    _call_type_mapping: ClassVar[dict[str, type["BaseMCPCall"]]] = {}
+    @field_validator("call_type")
+    @classmethod
+    def validate_call_type(cls, v: str) -> str:
+        """Allow any string but preferably from MCPCallType"""
+        return v
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> BaseMCPCall:
+        """Create a record from a dictionary, using the appropriate subclass"""
+        call_type = data.get("call_type", "")
+        record_cls = cls._call_type_mapping.get(call_type, BaseMCPCall)
+        return record_cls.model_validate(data)
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        """Register subclasses in the mapping by their default call_type"""
+        super().__init_subclass__(**kwargs)
+        if hasattr(cls, "__annotations__") and "call_type" in cls.__annotations__:
+            default_call_type = getattr(cls, "call_type", None)
+            if isinstance(default_call_type, str):
+                BaseMCPCall._call_type_mapping[default_call_type] = cls
+class MCPRequestCall(BaseMCPCall):
+    """Record for an MCP request"""
+    direction: DirectionType = DirectionType.SENT
+    call_type: str = MCPCallType.SEND_REQUEST
+    start_time: float
+    end_time: float | None = None
+    duration: float | None = None
+    request_id: str | int | None = None
+    request_data: dict[str, Any] | None = None
+    @classmethod
+    def from_jsonrpc_request(
+        cls,
+        request: JSONRPCRequest,
+        task_run_id: str,
+        status: StatusType = StatusType.STARTED,
+        **kwargs: Any,
+    ) -> MCPRequestCall:
+        """Create telemetry record from a JSONRPCRequest"""
+        return cls(
+            task_run_id=task_run_id,
+            status=status,
+            request_id=request.id,
+            message_id=request.id,
+            method=request.method,
+            request_data=request.model_dump(exclude_none=True),
+            start_time=datetime.now().timestamp(),
+            **kwargs,
+        )
+    @classmethod
+    def from_session_message(
+        cls,
+        message: SessionMessage,
+        task_run_id: str,
+        status: StatusType = StatusType.STARTED,
+        **kwargs: Any,
+    ) -> MCPRequestCall | None:
+        """Create telemetry record from a SessionMessage containing a JSONRPCRequest"""
+        if (
+            hasattr(message, "message")
+            and hasattr(message.message, "root")
+            and isinstance(message.message.root, JSONRPCRequest)
+        ):
+            return cls.from_jsonrpc_request(
+                message.message.root, task_run_id=task_run_id, status=status, **kwargs
+            )
+        return None
+class MCPResponseCall(BaseMCPCall):
+    """Record for an MCP response"""
+    direction: DirectionType = DirectionType.RECEIVED
+    call_type: str = MCPCallType.RECEIVE_RESPONSE
+    is_response_or_error: bool = True
+    is_error: bool = False
+    response_id: str | int | None = None
+    related_request_id: str | int | None = None
+    response_data: dict[str, Any] | None = None
+    error: str | None = None
+    error_type: str | None = None
+    @classmethod
+    def from_jsonrpc_response(
+        cls, response: JSONRPCResponse | JSONRPCError, task_run_id: str, **kwargs: Any
+    ) -> MCPResponseCall:
+        """Create telemetry record from a JSONRPCResponse or JSONRPCError"""
+        is_error = isinstance(response, JSONRPCError)
+        result = cls(
+            task_run_id=task_run_id,
+            status=StatusType.COMPLETED,
+            response_id=response.id,
+            message_id=response.id,
+            related_request_id=response.id,  # In MCP, response ID matches request ID
+            is_error=is_error,
+            method=f"response_to_id_{response.id}",
+            response_data=response.model_dump(exclude_none=True),
+            **kwargs,
+        )
+        if is_error and hasattr(response, "error"):
+            result.error = response.error.message
+            result.error_type = str(response.error.code)
+        return result
+    @classmethod
+    def from_session_message(
+        cls, message: SessionMessage, task_run_id: str, **kwargs: Any
+    ) -> MCPResponseCall | None:
+        """Create telemetry record from a SessionMessage containing a response or error"""
+        if (
+            hasattr(message, "message")
+            and hasattr(message.message, "root")
+            and isinstance(message.message.root, JSONRPCResponse | JSONRPCError)
+        ):
+            return cls.from_jsonrpc_response(
+                message.message.root, task_run_id=task_run_id, **kwargs
+            )
+        return None
+class MCPNotificationCall(BaseMCPCall):
+    """Record for an MCP notification"""
+    direction: DirectionType = DirectionType.SENT
+    call_type: str = MCPCallType.SEND_NOTIFICATION
+    start_time: float
+    end_time: float | None = None
+    duration: float | None = None
+    notification_data: dict[str, Any] | None = None
+    @classmethod
+    def from_jsonrpc_notification(
+        cls,
+        notification: JSONRPCNotification,
+        task_run_id: str,
+        status: StatusType = StatusType.STARTED,
+        **kwargs: Any,
+    ) -> MCPNotificationCall:
+        """Create telemetry record from a JSONRPCNotification"""
+        return cls(
+            task_run_id=task_run_id,
+            status=status,
+            method=notification.method,
+            notification_data=notification.model_dump(exclude_none=True),
+            start_time=datetime.now().timestamp(),
+            **kwargs,
+        )
+    @classmethod
+    def from_session_message(
+        cls,
+        message: SessionMessage,
+        task_run_id: str,
+        status: StatusType = StatusType.STARTED,
+        **kwargs: Any,
+    ) -> MCPNotificationCall | None:
+        """Create telemetry record from a SessionMessage containing a JSONRPCNotification"""
+        if (
+            hasattr(message, "message")
+            and hasattr(message.message, "root")
+            and isinstance(message.message.root, JSONRPCNotification)
+        ):
+            return cls.from_jsonrpc_notification(
+                message.message.root, task_run_id=task_run_id, status=status, **kwargs
+            )
+        return None
+class MCPStreamEvent(BaseMCPCall):
+    """Record for an MCP stream event (read or write)"""
+    stream_event: bool = True
+    event_type: str = Field(..., description="Type of stream event: read or write")
+    item_type: str | None = None
+    is_response_or_error: bool = False
+    message_data: dict[str, Any] | None = None
+    @classmethod
+    def from_session_message(
+        cls, message: SessionMessage, task_run_id: str, event_type: str, **kwargs: Any
+    ) -> MCPStreamEvent:
+        """Create telemetry record for a stream event"""
+        method_name = "unknown_stream_operation"
+        is_response = False
+        item_type = "unknown"
+        message_data = None
+        if hasattr(message, "message") and hasattr(message.message, "root"):
+            msg_root = message.message.root
+            item_type = type(msg_root).__name__
+            message_data = msg_root.model_dump(exclude_none=True)
+            # Check type first before accessing attributes
+            if isinstance(msg_root, JSONRPCRequest | JSONRPCNotification) and hasattr(
+                msg_root, "method"
+            ):
+                method_name = msg_root.method
+            elif isinstance(msg_root, JSONRPCResponse | JSONRPCError) and hasattr(msg_root, "id"):
+                method_name = f"response_to_id_{msg_root.id}"
+                is_response = True
+        return cls(
+            task_run_id=task_run_id,
+            status=StatusType.COMPLETED,
+            method=method_name,
+            event_type=event_type,
+            item_type=item_type,
+            is_response_or_error=is_response,
+            message_data=message_data,
+            timestamp=datetime.now().timestamp(),
+            **kwargs,
+        )
+class MCPManualTestCall(BaseMCPCall):
+    """Record for a manual test record"""
+    call_type: str = MCPCallType.MANUAL_TEST
+    custom_data: dict[str, Any] = Field(default_factory=dict)
+    @classmethod
+    def create(cls, task_run_id: str, **custom_data: Any) -> MCPManualTestCall:
+        """Create a manual test record with custom data"""
+        return cls(
+            task_run_id=task_run_id,
+            status=StatusType.COMPLETED,
+            custom_data=custom_data,
+            timestamp=datetime.now().timestamp(),
+        )
+class MCPTelemetryRecord(BaseModel):
+    """Container for a set of related MCP telemetry records"""
+    task_run_id: str
+    records: list[BaseMCPCall]
+    timestamp: float = Field(default_factory=lambda: datetime.now().timestamp())
+    @property
+    def count_by_type(self) -> dict[str, int]:
+        """Count records by call_type"""
+        result: dict[str, int] = {}
+        for record in self.records:
+            result[record.call_type] = result.get(record.call_type, 0) + 1
+        return result
+    @property
+    def count_by_direction(self) -> dict[str, int]:
+        """Count records by direction"""
+        result: dict[str, int] = {}
+        for record in self.records:
+            if record.direction:
+                direction = record.direction.value
+                result[direction] = result.get(direction, 0) + 1
+        return result
+class TrajectoryStep(BaseModel):
+    """Model representing a single step in a trajectory, for export.
+
+    Every field has a default, so an instance can be created with no arguments.
+    """
+    type: str = Field(default="mcp-step")  # Default for MCP calls
+    observation_url: str | None = None
+    observation_text: str | None = None
+    # List and dict fields use default_factory so each instance gets its own container
+    actions: list[dict[str, Any]] = Field(default_factory=list)
+    start_timestamp: str | None = None  # ISO 8601 format
+    end_timestamp: str | None = None  # ISO 8601 format
+    metadata: dict[str, Any] = Field(default_factory=dict)

hud/telemetry/tests/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+# Tests for hud.telemetry module

hud/telemetry/tests/test_context.py ADDED Viewed

@@ -0,0 +1,203 @@
+from __future__ import annotations
+from unittest.mock import MagicMock
+from hud.telemetry.context import (
+    buffer_mcp_call,
+    flush_buffer,
+    get_current_task_run_id,
+    is_root_trace,
+    set_current_task_run_id,
+)
+from hud.telemetry.mcp_models import BaseMCPCall
+class TestTaskRunIdContext:
+    """Test task run ID context management."""
+    def test_get_current_task_run_id_initial(self):
+        """Test getting task run ID when none is set."""
+        # Reset context for clean test
+        set_current_task_run_id(None)
+        result = get_current_task_run_id()
+        assert result is None
+    def test_set_and_get_task_run_id(self):
+        """Test setting and getting task run ID."""
+        test_id = "test-task-run-id"
+        set_current_task_run_id(test_id)
+        result = get_current_task_run_id()
+        assert result == test_id
+    def test_task_run_id_isolation(self):
+        """Test that task run IDs are isolated per context."""
+        # This test simulates what would happen in different contexts
+        set_current_task_run_id("context-1")
+        assert get_current_task_run_id() == "context-1"
+        set_current_task_run_id("context-2")
+        assert get_current_task_run_id() == "context-2"
+        # Reset to None
+        set_current_task_run_id(None)
+        assert get_current_task_run_id() is None
+class TestRootTraceContext:
+    """Test root trace context management."""
+    def test_is_root_trace_initial(self):
+        """Test is_root_trace initial state."""
+        # The initial state may vary, so we just test that it returns a boolean
+        result = is_root_trace.get()
+        assert isinstance(result, bool)
+    def test_set_root_trace(self):
+        """Test setting root trace state."""
+        is_root_trace.set(True)
+        assert is_root_trace.get() is True
+        is_root_trace.set(False)
+        assert is_root_trace.get() is False
+class TestMCPCallBuffer:
+    """Test MCP call buffer management."""
+    def setUp(self):
+        """Clear buffer before each test."""
+        # Flush any existing calls and reset context
+        flush_buffer()
+        set_current_task_run_id(None)
+    def test_flush_buffer_empty(self):
+        """Test flushing empty buffer."""
+        self.setUp()
+        result = flush_buffer()
+        assert result == []
+    def test_add_and_flush_mcp_call(self):
+        """Test adding and flushing MCP calls."""
+        self.setUp()
+        # Set active task run ID
+        set_current_task_run_id("test-task")
+        # Create mock MCP call with required attributes
+        mock_call = MagicMock(spec=BaseMCPCall)
+        mock_call.model_dump.return_value = {"type": "test", "task_run_id": "test-task"}
+        mock_call.task_run_id = "test-task"
+        buffer_mcp_call(mock_call)
+        # Flush should return the call and clear buffer
+        result = flush_buffer()
+        assert len(result) == 1
+        assert result[0] == mock_call
+        # Buffer should be empty after flush
+        result2 = flush_buffer()
+        assert result2 == []
+    def test_add_multiple_mcp_calls(self):
+        """Test adding multiple MCP calls."""
+        self.setUp()
+        # Set active task run ID
+        set_current_task_run_id("test-task")
+        # Create multiple mock calls
+        mock_calls = []
+        for i in range(3):
+            mock_call = MagicMock(spec=BaseMCPCall)
+            mock_call.model_dump.return_value = {"type": f"test_{i}", "task_run_id": "test-task"}
+            mock_call.task_run_id = "test-task"
+            mock_calls.append(mock_call)
+            buffer_mcp_call(mock_call)
+        # Flush should return all calls
+        result = flush_buffer()
+        assert len(result) == 3
+        assert result == mock_calls
+    def test_buffer_isolation_per_task(self):
+        """Test that MCP call buffers contain all calls regardless of task ID."""
+        self.setUp()
+        # Set task run ID 1
+        set_current_task_run_id("task-1")
+        mock_call_1 = MagicMock(spec=BaseMCPCall)
+        mock_call_1.task_run_id = "task-1"
+        mock_call_1.model_dump.return_value = {"type": "test", "task_run_id": "task-1"}
+        buffer_mcp_call(mock_call_1)
+        # Set task run ID 2
+        set_current_task_run_id("task-2")
+        mock_call_2 = MagicMock(spec=BaseMCPCall)
+        mock_call_2.task_run_id = "task-2"
+        mock_call_2.model_dump.return_value = {"type": "test", "task_run_id": "task-2"}
+        buffer_mcp_call(mock_call_2)
+        # Flush should return all calls from both tasks
+        result = flush_buffer()
+        assert len(result) == 2
+        assert result[0] == mock_call_1
+        assert result[1] == mock_call_2
+    def test_buffer_mcp_call_without_task_id(self):
+        """Test adding MCP call when no task run ID is set."""
+        self.setUp()
+        set_current_task_run_id(None)
+        mock_call = MagicMock(spec=BaseMCPCall)
+        mock_call.task_run_id = None
+        buffer_mcp_call(mock_call)
+        # Should not buffer anything when no task ID is set
+        result = flush_buffer()
+        assert len(result) == 0
+class TestContextIntegration:
+    """Integration tests for context management."""
+    def test_context_lifecycle(self):
+        """Test complete context lifecycle."""
+        # Start with clean state
+        set_current_task_run_id(None)
+        flush_buffer()
+        is_root_trace.set(False)
+        # Set up trace context
+        task_id = "integration-test-task"
+        set_current_task_run_id(task_id)
+        is_root_trace.set(True)
+        # Add some MCP calls
+        mock_calls = []
+        for i in range(2):
+            mock_call = MagicMock(spec=BaseMCPCall)
+            mock_call.model_dump.return_value = {
+                "type": f"integration_test_{i}",
+                "task_run_id": task_id,
+            }
+            mock_call.task_run_id = task_id
+            mock_calls.append(mock_call)
+            buffer_mcp_call(mock_call)
+        # Verify context state
+        assert get_current_task_run_id() == task_id
+        assert is_root_trace.get() is True
+        # Flush and verify
+        result = flush_buffer()
+        assert len(result) == 2
+        assert result == mock_calls
+        # Clean up
+        set_current_task_run_id(None)
+        is_root_trace.set(False)
+        # Verify cleanup
+        assert get_current_task_run_id() is None
+        assert flush_buffer() == []

hud-python 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

Potentially problematic release.

hud-python 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl