hud-python 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (50)
  1. hud/__init__.py +22 -2
  2. hud/adapters/claude/adapter.py +9 -2
  3. hud/adapters/claude/tests/__init__.py +1 -0
  4. hud/adapters/claude/tests/test_adapter.py +519 -0
  5. hud/adapters/common/types.py +5 -1
  6. hud/adapters/operator/adapter.py +4 -0
  7. hud/adapters/operator/tests/__init__.py +1 -0
  8. hud/adapters/operator/tests/test_adapter.py +370 -0
  9. hud/agent/__init__.py +4 -0
  10. hud/agent/base.py +18 -2
  11. hud/agent/claude.py +20 -17
  12. hud/agent/claude_plays_pokemon.py +282 -0
  13. hud/agent/langchain.py +12 -7
  14. hud/agent/misc/__init__.py +3 -0
  15. hud/agent/misc/response_agent.py +80 -0
  16. hud/agent/operator.py +27 -19
  17. hud/agent/tests/__init__.py +1 -0
  18. hud/agent/tests/test_base.py +202 -0
  19. hud/env/docker_client.py +28 -18
  20. hud/env/environment.py +32 -16
  21. hud/env/local_docker_client.py +83 -42
  22. hud/env/remote_client.py +1 -3
  23. hud/env/remote_docker_client.py +72 -15
  24. hud/exceptions.py +12 -0
  25. hud/gym.py +71 -53
  26. hud/job.py +52 -7
  27. hud/settings.py +6 -0
  28. hud/task.py +45 -33
  29. hud/taskset.py +44 -4
  30. hud/telemetry/__init__.py +21 -0
  31. hud/telemetry/_trace.py +173 -0
  32. hud/telemetry/context.py +193 -0
  33. hud/telemetry/exporter.py +417 -0
  34. hud/telemetry/instrumentation/__init__.py +3 -0
  35. hud/telemetry/instrumentation/mcp.py +498 -0
  36. hud/telemetry/instrumentation/registry.py +59 -0
  37. hud/telemetry/mcp_models.py +331 -0
  38. hud/telemetry/tests/__init__.py +1 -0
  39. hud/telemetry/tests/test_context.py +203 -0
  40. hud/telemetry/tests/test_trace.py +270 -0
  41. hud/types.py +10 -26
  42. hud/utils/common.py +22 -2
  43. hud/utils/misc.py +53 -0
  44. hud/utils/tests/test_version.py +1 -1
  45. hud/version.py +7 -0
  46. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
  47. hud_python-0.2.5.dist-info/RECORD +84 -0
  48. hud_python-0.2.4.dist-info/RECORD +0 -62
  49. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
  50. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0
hud/agent/claude_plays_pokemon.py ADDED
@@ -0,0 +1,282 @@
+ from __future__ import annotations
+
+ import json
+ import logging
+ from typing import Any, cast
+
+ from anthropic import AsyncAnthropic
+ from anthropic.types.beta import (
+     BetaMessageParam,
+     BetaTextBlockParam,
+     BetaImageBlockParam,
+ )
+
+ from hud.agent import Agent
+ from hud.adapters import Adapter
+ from hud.settings import settings
+ from hud.env.environment import Observation
+
+ logger = logging.getLogger(__name__)
+
+ # Constants
+ DEFAULT_MODEL = "claude-3-7-sonnet-20250219"
+ DEFAULT_MAX_TOKENS = 4096
+ DEFAULT_MAX_ITERATIONS = 10
+ DEFAULT_TEMPERATURE = 0.7
+ DEFAULT_MAX_MESSAGE_MEMORY = 20
+
+
+ def generate_system_prompt(game_name: str) -> str:
+     """Generate the system prompt for the AI agent.
+
+     Args:
+         game_name: Name of the game being played
+
+     Returns:
+         str: The system prompt for the AI agent
+     """
+     return """You are a specialized AI assistant designed to play Pokémon games via screenshot analysis and text instructions. Your task is to understand the current game state from visual input, determine appropriate actions, and respond with structured outputs that control the game.
+
+ For each turn, you will receive:
+ 1. A screenshot of the current game state
+ 2. Contextual information about the game progress, recent events, and objectives
+
+ Based on this information, you must analyze the situation, determine the best course of action, and provide a structured JSON response.
+
+ ## Response Format
+ Your response MUST follow this exact JSON format with no additional markers, tags, or block delimiters:
+
+ {
+ "analysis": "Brief analysis of the current game situation, visible UI elements, and important context (1-3 sentences)",
+ "current_objective": "The immediate goal based on the game state (single sentence)",
+ "reasoning": "Step-by-step logic explaining your chosen action sequence (2-4 sentences)",
+ "progress_assessment": "Evaluation of whether previous action(s) achieved their intended goal and why/why not (1-2 sentences)",
+ "actions": [
+ {
+ "type": "press",
+ "keys": ["up"|"down"|"left"|"right"|"a"|"b"|"start"|"select"|"pause"]
+ },
+ {
+ "type": "wait",
+ "time": milliseconds_to_wait
+ }
+ ]
+ }
+
+ IMPORTANT: Do not include any conversation markers like <<ASSISTANT_CONVERSATION_START>> or <<ASSISTANT_CONVERSATION_END>> around your response. Provide only the clean JSON object.
+
+ ## Action Types
+ - Button presses: {"type": "press", "keys": ["button_name"]} - Valid buttons are: up, down, left, right, a, b, start, select, pause
+ - Wait for processing: {"type": "wait", "time": milliseconds}
+
+ ## Important Rules
+ 1. Never use "wait" commands while the game is paused. The game state will not change while paused, so waiting is ineffective.
+ 2. If you detect the game is paused, your next action should be to unpause by using {"type": "press", "keys": ["pause"]} before attempting other actions.
+ 3. Maintain awareness of whether the game is in a paused state based on visual cues in the screenshot.
+
+ ## Game Play Guidelines
+ 1. **Navigation**: Use directional buttons to move the character or navigate menus
+ 2. **Interaction**: Use 'a' to confirm selections and interact with objects/NPCs, 'b' to cancel or exit menus
+ 3. **Menu Access**: Use 'start' to access the game menu
+ 4. **Battle Strategy**: Analyze Pokémon types, moves, and stats to make optimal battle decisions
+ 5. **Progressive Play**: Work toward completing the current objective while being mindful of longer-term goals like leveling Pokémon, collecting badges, and advancing the story
+ 6. **Resource Management**: Monitor and manage HP, PP, items, and Pokéballs effectively
+ 7. **Memory**: Maintain awareness of the game history and your previous actions to avoid repetitive behaviors
+
+ Always provide thoughtful analysis and clear reasoning for your decisions. If you're uncertain about the best course of action, prioritize safe moves that gather more information.
+ """
+
+
+ def extract_action_from_response_block(block: dict[str, Any]) -> list[dict[str, Any]]:
+     """Extract actions from a response block.
+
+     Args:
+         block: The response block containing actions
+
+     Returns:
+         list[dict[str, Any]]: List of actions extracted from the block
+     """
+     if "actions" in block:
+         actions = block["actions"]
+         if isinstance(actions, list):
+             return actions
+     return []
+
+
+ def extract_json_from_response(response: str) -> str:
+     """Extract JSON from a response string.
+
+     Args:
+         response: The response string containing JSON
+
+     Returns:
+         str: The extracted JSON string
+     """
+     # Try to find JSON block with markdown code block markers
+     start = response.find("```json")
+     end = response.rfind("```")
+     if start != -1 and end != -1:
+         start += len("```json")
+         return response[start:end].strip()
+
+     # Try to find JSON object directly
+     start = response.find("{")
+     end = response.rfind("}")
+     if start != -1 and end != -1:
+         return response[start : end + 1].strip()
+
+     return response.strip()
+
+
+ class ClaudePlaysPokemon(Agent[AsyncAnthropic, None]):
+     """AI agent that plays Pokémon games using Claude."""
+
+     def __init__(
+         self,
+         client: AsyncAnthropic | None = None,
+         adapter: Adapter | None = None,
+         model: str = DEFAULT_MODEL,
+         max_tokens: int = DEFAULT_MAX_TOKENS,
+         max_iterations: int = DEFAULT_MAX_ITERATIONS,
+         temperature: float = DEFAULT_TEMPERATURE,
+         max_message_memory: int = DEFAULT_MAX_MESSAGE_MEMORY,
+     ) -> None:
+         """Initialize the Claude Plays Pokémon agent.
+
+         Args:
+             client: Anthropic API client
+             adapter: Game adapter
+             model: Claude model to use
+             max_tokens: Maximum tokens for response
+             max_iterations: Maximum number of iterations
+             temperature: Response temperature
+             max_message_memory: Maximum number of messages to remember
+
+         Raises:
+             ValueError: If API key is not provided
+         """
+         if client is None:
+             api_key = settings.anthropic_api_key
+             if not api_key:
+                 raise ValueError("Anthropic API key is required")
+             client = AsyncAnthropic(api_key=api_key)
+
+         if adapter is None:
+             adapter = Adapter()
+
+         super().__init__(
+             client=client,
+             adapter=adapter,
+         )
+
+         self.model = model
+         self.max_tokens = max_tokens
+         self.max_iterations = max_iterations
+         self.temperature = temperature
+         self.max_message_memory = max_message_memory
+
+         self.system_prompts: list[BetaMessageParam] = [
+             {
+                 "role": "assistant",
+                 "content": generate_system_prompt("Pokemon Red"),
+             }
+         ]
+
+         self.messages: list[BetaMessageParam] = []
+
+     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
+         """Fetch a response from Claude based on the current observation.
+
+         Args:
+             observation: The current game observation
+
+         Returns:
+             tuple[list[dict[str, Any]], bool]: List of actions and whether the game is done
+
+         Raises:
+             ValueError: If client is not initialized
+         """
+         if not self.client:
+             raise ValueError("Client is not initialized")
+
+         user_content: list[BetaTextBlockParam | BetaImageBlockParam] = []
+
+         if observation.text:
+             user_content.append(
+                 {
+                     "type": "text",
+                     "text": observation.text,
+                 }
+             )
+
+         if observation.screenshot:
+             logger.debug("Processing screenshot data")
+             user_content.append(
+                 {
+                     "type": "image",
+                     "source": {
+                         "type": "base64",
+                         "media_type": "image/png",
+                         "data": observation.screenshot,
+                     },
+                 }
+             )
+
+         self.messages.append(
+             {
+                 "role": "user",
+                 "content": user_content,
+             }
+         )
+
+         logger.debug(
+             "Sending messages to Claude", extra={"messages": self.system_prompts + self.messages}
+         )
+
+         response = await self.client.beta.messages.create(
+             model=self.model,
+             messages=self.system_prompts + self.messages,
+             temperature=self.temperature,
+             max_tokens=self.max_tokens,
+         )
+
+         response_content = response.content
+         self.messages.append(
+             cast(
+                 BetaMessageParam,
+                 {
+                     "role": "user",
+                     "content": response_content,
+                 },
+             )
+         )
+
+         # Maintain message memory limit
+         while len(self.messages) > self.max_message_memory:
+             self.messages.pop(0)
+
+         action_list: list[dict[str, Any]] = []
+
+         # Parse response content to extract actions
+         for block in response_content:
+             if block.type == "text":
+                 text_json = extract_json_from_response(block.text)
+                 try:
+                     text = json.loads(text_json)
+                     if not isinstance(text, dict):
+                         logger.error("Invalid response format", extra={"text": text})
+                         raise ValueError("Response is not a dictionary")
+
+                     action_list.extend(extract_action_from_response_block(text))
+
+                 except json.JSONDecodeError as e:
+                     logger.error(
+                         "Failed to parse response", extra={"error": str(e), "text": text_json}
+                     )
+
+
+             else:
+                 logger.error("Unexpected block type", extra={"type": type(block)})
+
+         logger.debug("Extracted actions", extra={"actions": action_list})
+
+         return action_list, False
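
For orientation, a minimal usage sketch of the new agent follows. It is not part of the release diff; it assumes Observation accepts text and screenshot keyword fields and that settings.anthropic_api_key is configured.

# Illustrative sketch only (not from the package).
import asyncio

from hud.agent.claude_plays_pokemon import ClaudePlaysPokemon
from hud.env.environment import Observation  # fields mirror those read in fetch_response


async def main() -> None:
    agent = ClaudePlaysPokemon(max_message_memory=10)  # client built from settings.anthropic_api_key
    obs = Observation(text="You are standing in Pallet Town.", screenshot=None)  # assumed constructor call
    actions, done = await agent.fetch_response(obs)
    print(actions, done)  # e.g. [{"type": "press", "keys": ["a"]}], False


asyncio.run(main())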
hud/agent/langchain.py CHANGED
@@ -10,6 +10,7 @@ from pydantic import Field, BaseModel
  # HUD imports
  from hud.adapters import Adapter
  from hud.agent.base import Agent
+ from hud.types import Gym
  from hud.utils.common import Observation
  from hud.adapters.common.types import (
      ClickAction,
@@ -66,6 +67,8 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
      Langchain's structured output capabilities to produce a single CLA action per step.
      """

+     transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
+
      def __init__(
          self,
          langchain_model: LangchainModelOrRunnable,
@@ -102,7 +105,9 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
              "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
          )

-     async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
+     async def fetch_response(
+         self, observation: Observation
+     ) -> tuple[list[dict | SingleCLAction], bool]:
          """
          Fetches a response from the configured Langchain model, expecting a single
          structured CLA action.
@@ -168,11 +173,11 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
              ai_message_content_for_history = actual_action.model_dump()
              if isinstance(actual_action, ResponseAction):
                  is_done = True
-                 logger.info(
-                     f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
-                 )
-             else:
-                 logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
+                 # logger.info(
+                 #     f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
+                 # )
+                 # else:
+                 #     logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")

          else:
              logger.warning(
@@ -198,7 +203,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode

          if actual_action:
              # Return the single action dictionary within a list
-             return [actual_action.model_dump()], is_done
+             return [actual_action], is_done
          else:
              # Should ideally not happen if structure validation worked, but as a fallback
              return [], is_done
hud/agent/misc/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .response_agent import ResponseAgent
+
+ __all__ = ["ResponseAgent"]
hud/agent/misc/response_agent.py ADDED
@@ -0,0 +1,80 @@
+ import json
+ import os
+ from typing import Literal, Optional
+
+ from openai import AsyncOpenAI
+
+ ResponseType = Literal["STOP", "CONTINUE"]
+
+
+ class ResponseAgent:
+     """
+     An assistant that helps determine whether an agent should stop or continue
+     based on the agent's final response message.
+     """
+
+     def __init__(self, api_key: Optional[str] = None):
+         self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError(
+                 "OpenAI API key must be provided or set as OPENAI_API_KEY environment variable"
+             )
+
+         self.client = AsyncOpenAI(api_key=self.api_key)
+
+         self.system_prompt = """
+ You are an assistant that helps determine the appropriate response to an agent's message.
+
+ You will receive messages from an agent that is performing tasks for a user.
+ Your job is to analyze these messages and respond with one of the following:
+
+ - STOP: If the agent indicates it has successfully completed a task, even if phrased as a question
+ like "I have entered the right values into this form. Would you like me to do anything else?"
+ or "Here is the website. Is there any other information you need?"
+
+ - CONTINUE: If the agent is asking for clarification before proceeding with a task
+ like "I'm about to clear cookies from this website. Would you like me to proceed?"
+ or "I've entered the right values into this form. Would you like me to continue with the rest of the task?"
+
+ Respond ONLY with one of these two options.
+ """
+
+     async def determine_response(self, agent_message: str) -> ResponseType:
+         """
+         Determine whether the agent should stop or continue based on its message.
+
+         Args:
+             agent_message: The message from the agent
+
+         Returns:
+             ResponseType: Either "STOP" or "CONTINUE"
+         """
+         try:
+             response = await self.client.chat.completions.create(
+                 model="gpt-4o",
+                 messages=[
+                     {"role": "system", "content": self.system_prompt},
+                     {
+                         "role": "user",
+                         "content": f"Agent message: {agent_message}\n\nWhat is the appropriate response?",
+                     },
+                 ],
+                 temperature=0.1,  # Low temperature for more deterministic responses
+                 max_tokens=5,  # We only need a short response
+             )
+
+             response_text = response.choices[0].message.content
+             if not response_text:
+                 return "CONTINUE"
+
+             response_text = response_text.strip().upper()
+
+             # Validate the response
+             if "STOP" in response_text:
+                 return "STOP"
+             else:
+                 return "CONTINUE"
+
+         except Exception as e:
+             print(f"Error determining response: {e}")
+             return "CONTINUE"  # Default to continue on error
hud/agent/operator.py CHANGED
@@ -3,7 +3,7 @@ import logging
  import os
  from typing import Any, Literal, cast

- from openai import OpenAI
+ from openai import AsyncOpenAI
  from openai.types.responses import (
      ToolParam,
      ResponseInputParam,
@@ -16,13 +16,14 @@ from openai.types.responses import (
  from hud.adapters import Adapter
  from hud.agent.base import Agent
  from hud.adapters.operator import OperatorAdapter
+ from hud.types import Gym
  from hud.utils.common import Observation
  from hud.settings import settings

  logger = logging.getLogger(__name__)


- class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
+ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
      """
      An agent implementation using OpenAI's Computer Use API.

@@ -30,11 +31,13 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
      through the OperatorAdapter which converts actions to the format expected by HUD.
      """

+     transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
+
      def __init__(
          self,
-         client: OpenAI | None = None,
+         client: AsyncOpenAI | None = None,
          model: str = "computer-use-preview",
-         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
+         environment: Literal["windows", "mac", "linux", "browser"] = "linux",
          adapter: Adapter | None = None,
          max_iterations: int = 8,
      ):
@@ -42,7 +45,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
          Initialize the OperatorAgent.

          Args:
-             client: The OpenAI client for API calls (optional, created automatically if not provided)
+             client: The AsyncOpenAI client for API calls (optional, created automatically if not provided)
              model: The model to use for computer use
              environment: The environment type (windows, mac, linux, browser)
              adapter: The adapter to use for preprocessing and postprocessing
@@ -57,8 +60,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
                      "OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY."
                  )

-             # Create synchronous client
-             client = OpenAI(api_key=api_key)
+             # Create asynchronous client
+             client = AsyncOpenAI(api_key=api_key)

          adapter = adapter or OperatorAdapter()

@@ -81,6 +84,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
          self.last_response_id = None
          self.pending_call_id = None
          self.initial_prompt = None
+         self.pending_safety_checks = []

      async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
          """
@@ -129,8 +133,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
              # Structure the input correctly for the API using cast
              input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])

-             # Call OpenAI API for the initial prompt (synchronous call)
-             response = self.client.responses.create(
+             # Call OpenAI API for the initial prompt (asynchronous call)
+             response = await self.client.responses.create(
                  model=self.model, tools=[computer_tool], input=input_param, truncation="auto"
              )

@@ -153,13 +157,15 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
                                  "type": "input_image",
                                  "image_url": f"data:image/png;base64,{observation.screenshot}",
                              },
+                             "acknowledged_safety_checks": self.pending_safety_checks,
                          },
                      )
                  ],
              )
+             self.pending_safety_checks = []

-             # Call OpenAI API for follow-up (synchronous call)
-             response = self.client.responses.create(
+             # Call OpenAI API for follow-up (asynchronous call)
+             response = await self.client.responses.create(
                  model=self.model,
                  previous_response_id=self.last_response_id,
                  tools=[computer_tool],
@@ -188,12 +194,13 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
              for computer_call in computer_calls:
                  self.pending_call_id = computer_call.call_id
                  action = computer_call.action
+                 self.pending_safety_checks = computer_call.pending_safety_checks
                  actions.append(action.model_dump())  # Convert Pydantic model to dict
-                 logger.info(f"Computer call action: {action}")
+                 # logger.info(f"Computer call action: {action}")
          else:
              # No computer calls, check for a final text message
-             logger.info("No computer call found. Checking for final message.")
-             logger.info(response.output)
+             # logger.info("No computer call found. Checking for final message.")
+             # logger.info(response.output)
              for item in response.output:
                  if isinstance(item, ResponseOutputMessage) and item.type == "message":
                      # Extract text from content blocks within the message
@@ -202,15 +209,16 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
                      )
                      if full_text:
                          final_text_response = full_text
-                         logger.info(f"Final text message: {final_text_response}")
+                         # logger.info(f"Final text message: {final_text_response}")
                          break  # Stop after finding the first text message

              # If we found final text, package it as a 'response' action
              if final_text_response:
+                 # No ResponseAgent logic here anymore - just return the response
                  actions = [{"type": "response", "text": final_text_response}]
-                 # Keep done = True
-             else:
-                 logger.info("No computer calls and no final text message found.")
-                 # Keep done = True, actions remains empty
+                 done = True
+             # else:
+             #     logger.info("No computer calls and no final text message found.")
+             #     # Keep done = True, actions remains empty

          return actions, done
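
Because OperatorAgent now wraps AsyncOpenAI and awaits responses.create, callers drive it from async code. A rough sketch follows; it is not part of the diff, and the Observation construction and direct fetch_response call are assumptions for illustration.

# Illustrative sketch only (not from the package).
import asyncio

from openai import AsyncOpenAI

from hud.agent.operator import OperatorAgent
from hud.utils.common import Observation  # assumed to accept text/screenshot keywords


async def main() -> None:
    agent = OperatorAgent(client=AsyncOpenAI(), environment="browser")
    obs = Observation(text="Open the settings page", screenshot=None)
    actions, done = await agent.fetch_response(obs)  # now a coroutine in 0.2.5
    print(actions, done)


asyncio.run(main())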
hud/agent/tests/__init__.py ADDED
@@ -0,0 +1 @@
+ # Tests for hud.agent module