hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic (further details are available on the package's registry page).
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
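One change that recurs across the agent diffs below is the import path for Observation, which in 0.2.4 comes from hud.utils.common. A minimal sketch of downstream code written against 0.2.4 follows; the keyword arguments are an assumption inferred from the attribute reads visible in the diffs (observation.text, observation.screenshot), not a documented constructor.

    # Hypothetical usage against hud-python 0.2.4; field names are assumed from the
    # attribute reads in the diffs below (observation.text, observation.screenshot).
    from hud.utils.common import Observation

    obs = Observation(text="Open the settings page", screenshot=None)
    print(obs.text, obs.screenshot is None)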
hud/agent/claude.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Any, cast
 
 from anthropic import AsyncAnthropic
@@ -14,52 +13,48 @@ from anthropic.types.beta import (
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.claude import ClaudeAdapter
-from hud.
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 def base64_to_content_block(base64: str) -> BetaImageBlockParam:
     return {
         "type": "image",
-        "source": {
-            "type": "base64",
-            "media_type": "image/png",
-            "data": base64
-        }
+        "source": {"type": "base64", "media_type": "image/png", "data": base64},
     }
 
+
 def text_to_content_block(text: str) -> BetaTextBlockParam:
-    return {
-
-
-
+    return {"type": "text", "text": text}
+
+
+def tool_use_content_block(
+    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
+) -> BetaToolResultBlockParam:
+    return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
 
-def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
-    return {
-        "type": "tool_result",
-        "tool_use_id": tool_use_id,
-        "content": content
-    }
 
 # Claude's Computer Use Tool definition
 COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
-    "type": "computer_20250124",
-    "name": "computer",
-    "display_width_px": 1024,
-    "display_height_px": 768
+    "type": "computer_20250124",
+    "name": "computer",
+    "display_width_px": 1024,
+    "display_height_px": 768,
 }
 
+
 class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     """
     An agent implementation using Anthropic's Claude API with Computer Use.
-
+
     This agent interacts with HUD environments using Claude's Computer Use API
     through the ClaudeAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self,
+        self,
         client: AsyncAnthropic | None = None,
         adapter: Adapter | None = None,
         model: str = "claude-3-7-sonnet-20250219",
@@ -68,7 +63,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     ):
         """
         Initialize the ClaudeAgent.
-
+
         Args:
             client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
            adapter: The adapter to use for preprocessing and postprocessing
@@ -81,28 +76,30 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             # Get API key from settings
             api_key = settings.anthropic_api_key
             if not api_key:
-                raise ValueError(
-
+                raise ValueError(
+                    "Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
+                )
+
             # Create client
             client = AsyncAnthropic(api_key=api_key)
 
         adapter = adapter or ClaudeAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.max_tokens = max_tokens
         self.max_iterations = max_iterations
-
+
         # Default dimensions - will be updated if adapter is provided
         self.width_px = 1024
         self.height_px = 768
-
+
         # Update dimensions if adapter is provided
         if self.adapter:
             self.width_px = self.adapter.agent_width
             self.height_px = self.adapter.agent_height
-
+
         # Message history
         self.messages: list[BetaMessageParam] = []
         self.pending_computer_use_tool_id = None
@@ -110,17 +107,17 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
         """
         Fetch a response from Claude based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[Any], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Prepare the user content for Claude
         user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
 
@@ -128,7 +125,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
         if observation.text:
             logger.info("Adding text to user content: %s", observation.text)
             user_content.append(text_to_content_block(str(observation.text)))
-
+
         # Add screenshot if present
         if observation.screenshot:
             logger.info("Adding screenshot to user content")
@@ -136,20 +133,28 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
                 logger.info("Adding screenshot to user content, no tool id")
                 user_content.append(base64_to_content_block(observation.screenshot))
             else:
-                logger.info(
+                logger.info(
+                    "Adding screenshot to user content, tool id: %s",
+                    self.pending_computer_use_tool_id,
+                )
                 user_content.append(
                     tool_use_content_block(
-                        self.pending_computer_use_tool_id,
-                        [base64_to_content_block(observation.screenshot)]
+                        self.pending_computer_use_tool_id,
+                        [base64_to_content_block(observation.screenshot)],
                     )
                 )
                 self.pending_computer_use_tool_id = None
 
         # Add the user content to the messages
-        self.messages.append(
-
-
-
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "user",
+                    "content": user_content,
+                },
+            )
+        )
 
         # Call Claude API using async client
         response = await self.client.beta.messages.create(
@@ -158,30 +163,35 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             messages=self.messages,
             tools=[COMPUTER_TOOL],
             betas=["computer-use-2025-01-24"],
-            tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True},
         )
 
         # Add Claude's response to the conversation history
         response_content = response.content
-        self.messages.append(
-
-
-
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "assistant",
+                    "content": response_content,
+                },
+            )
+        )
 
         # Process tool use
         actions: list[Any] = []
         done = True  # Assume we're done unless we find a tool use
-
+
         for block in response_content:
             logger.info("Processing block: %s", block)
             if block.type == "tool_use":
                 logger.info("Processing tool use: %s", block)
                 assert block.name == "computer"
-
+
                 # Store the raw action
                 actions.append(block.input)
                 self.pending_computer_use_tool_id = block.id
-
+
                 # If we found a tool use, we're not done
                 done = False
                 break
@@ -192,16 +202,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             for block in response_content:
                 if block.type == "text":
                     final_text_response += block.text
-
+
             if final_text_response.strip():
-                logger.info(
-
-
-
-                }]
+                logger.info(
+                    f"No tool use found. Using final text as response: {final_text_response}"
+                )
+                actions = [{"action": "response", "text": final_text_response.strip()}]
                # Keep done = True
             else:
-
-
-
+                logger.info("No tool use and no final text block found.")
+                # Keep done = True, actions remains empty
+
         return actions, done
hud/agent/langchain.py
CHANGED
@@ -10,9 +10,8 @@ from pydantic import Field, BaseModel
 # HUD imports
 from hud.adapters import Adapter
 from hud.agent.base import Agent
-from hud.
+from hud.utils.common import Observation
 from hud.adapters.common.types import (
-    CLA,
     ClickAction,
     TypeAction,
     ScrollAction,
@@ -44,17 +43,23 @@ SingleCLAction = Union[
     ResponseAction,
 ]
 
+
 # Define a Pydantic model to wrap the single action, potentially making it
 # easier for the LLM to consistently output the desired structure.
 class StepAction(BaseModel):
     """Wrapper model requesting a single concrete CLA action from the Langchain model."""
-
+
+    action: SingleCLAction = Field(
+        ..., description="The single CLA action to perform for this step."
+    )
+
 
 # Generic Type for the Langchain Model/Runnable
 # Allows flexibility in what the user provides (model, chain, etc.)
 # Bound to BaseLanguageModel as .with_structured_output is expected
 LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
 
+
 class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
     """
     An agent that uses an arbitrary Langchain model or runnable, leveraging
@@ -80,8 +85,8 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             system_prompt: An optional system prompt to guide the Langchain model.
                 If None, a default prompt encouraging single CLA output is used.
         """
-        super().__init__(client=langchain_model, adapter=adapter)
-        self.langchain_model = langchain_model
+        super().__init__(client=langchain_model, adapter=adapter)  # Store model as 'client'
+        self.langchain_model = langchain_model  # Also store with specific name
 
         self.system_prompt_str = system_prompt or self._get_default_system_prompt()
         self.history: List[BaseMessage] = []
@@ -97,7 +102,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
         )
 
-    async def fetch_response(self, observation: Observation) -> tuple[
+    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
         """
         Fetches a response from the configured Langchain model, expecting a single
         structured CLA action.
@@ -117,17 +122,17 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
         if observation.screenshot:
             # Assuming the Langchain model/chain can handle base64 images
             # This might need adjustment based on the specific model used.
-            human_content.append(
-
-
-                    "url": f"data:image/png;base64,{observation.screenshot}"
+            human_content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
                 }
-
-
+            )
+
         if not human_content:
-
-
-
+            logger.warning("LangchainAgent received an observation with no text or screenshot.")
+            # Decide how to handle empty observation - perhaps return no action?
+            return [], False  # Or raise an error?
 
         current_human_message = HumanMessage(content=human_content)
 
@@ -142,8 +147,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
         # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
         # Explicitly use method="function_calling" to handle schemas with default values
         structured_llm = self.langchain_model.with_structured_output(
-            schema=StepAction,
-            method="function_calling"
+            schema=StepAction, method="function_calling"
         )
 
         # 4. Invoke Langchain model asynchronously
@@ -156,7 +160,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
 
         # 5. Process the structured response
         is_done = False
-        ai_message_content_for_history = ""
+        ai_message_content_for_history = ""  # For storing in history
 
         if isinstance(ai_response_structured, StepAction):
             # Successfully got the wrapper, extract the actual action
@@ -164,22 +168,24 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             ai_message_content_for_history = actual_action.model_dump()
             if isinstance(actual_action, ResponseAction):
                 is_done = True
-                logger.info(
+                logger.info(
+                    f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
+                )
             else:
-
+                logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
 
         else:
             logger.warning(
                 f"Langchain model did not return the expected StepAction structure. "
                 f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
             )
-
+            # Attempt to add raw response to history for debugging
             if isinstance(ai_response_structured, BaseMessage):
-
+                ai_message_content_for_history = ai_response_structured.content
             elif isinstance(ai_response_structured, str):
-
+                ai_message_content_for_history = ai_response_structured
             else:
-
+                ai_message_content_for_history = repr(ai_response_structured)
             # Return no action as we didn't get the expected structure
             return [], False
 
@@ -192,7 +198,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
 
         if actual_action:
             # Return the single action dictionary within a list
-            return [actual_action], is_done
+            return [actual_action.model_dump()], is_done
         else:
             # Should ideally not happen if structure validation worked, but as a fallback
-            return [], is_done
+            return [], is_done
hud/agent/operator.py
CHANGED
@@ -10,36 +10,37 @@ from openai.types.responses import (
     ResponseInputItemParam,
     ResponseOutputMessage,
     ResponseComputerToolCall,
-    ResponseOutputText
+    ResponseOutputText,
 )
 
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.operator import OperatorAdapter
-from hud.
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     """
     An agent implementation using OpenAI's Computer Use API.
-
+
     This agent interacts with HUD environments using OpenAI's Computer Use API
     through the OperatorAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self,
+        self,
         client: OpenAI | None = None,
         model: str = "computer-use-preview",
         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
         adapter: Adapter | None = None,
-        max_iterations: int = 8
+        max_iterations: int = 8,
     ):
         """
         Initialize the OperatorAgent.
-
+
         Args:
             client: The OpenAI client for API calls (optional, created automatically if not provided)
             model: The model to use for computer use
@@ -52,28 +53,30 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             # Get API key from settings
             api_key = settings.openai_api_key
             if not api_key:
-                raise ValueError(
-
+                raise ValueError(
+                    "OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY."
+                )
+
             # Create synchronous client
             client = OpenAI(api_key=api_key)
 
         adapter = adapter or OperatorAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.environment = environment
         self.max_iterations = max_iterations
-
+
         # Default dimensions
         self.width = 1024
         self.height = 768
-
+
         # Update dimensions if adapter is provided
         if self.adapter:
             self.width = self.adapter.agent_width
             self.height = self.adapter.agent_height
-
+
         # Message history and state tracking
         self.last_response_id = None
         self.pending_call_id = None
@@ -82,86 +85,91 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from the model based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Define the computer use tool with correct type using cast
-        computer_tool = cast(
-
-
-
-
-
-
+        computer_tool = cast(
+            ToolParam,
+            {
+                "type": "computer_use_preview",
+                "display_width": self.width,
+                "display_height": self.height,
+                "environment": self.environment,
+            },
+        )
+
         # Process the observation based on whether it's the first one or a response to an action
         if self.pending_call_id is None and self.last_response_id is None:
             # This is the first observation, store and send the prompt
             self.initial_prompt = observation.text
-
+
             # Create the initial request following the required structure
             input_content: list[dict[str, Any]] = [
                 {"type": "input_text", "text": observation.text or ""}
             ]
-
+
             # Add screenshot if present
             if observation.screenshot:
-                input_content.append(
-
-
-
-
+                input_content.append(
+                    {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{observation.screenshot}",
+                    }
+                )
+
             # Structure the input correctly for the API using cast
-            input_param = cast(ResponseInputParam, [{
-
-                "content": input_content
-            }])
-
+            input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])
+
             # Call OpenAI API for the initial prompt (synchronous call)
             response = self.client.responses.create(
-                model=self.model,
-                tools=[computer_tool],
-                input=input_param,
-                truncation="auto"
+                model=self.model, tools=[computer_tool], input=input_param, truncation="auto"
             )
-
+
         else:
             # This is a response to a previous action
             if not observation.screenshot:
                 logger.warning("No screenshot provided for response to action")
                 return [], True
-
+
             # Create a response to the previous action with the new screenshot
-            input_param_followup = cast(
-
-
-
-
-
-
-
-
-
-
+            input_param_followup = cast(
+                ResponseInputParam,
+                [
+                    cast(
+                        ResponseInputItemParam,
+                        {
+                            "call_id": self.pending_call_id,
+                            "type": "computer_call_output",
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{observation.screenshot}",
+                            },
+                        },
+                    )
+                ],
+            )
+
             # Call OpenAI API for follow-up (synchronous call)
             response = self.client.responses.create(
                 model=self.model,
                 previous_response_id=self.last_response_id,
                 tools=[computer_tool],
                 input=input_param_followup,
-                truncation="auto"
+                truncation="auto",
             )
-
+
         # Store the response ID for the next call
         self.last_response_id = response.id
-
+
         # Process the response to extract actions or final text
         actions = []
         done = True  # Assume done unless a computer call is found
@@ -169,17 +177,18 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
 
         # Check for computer calls first
         computer_calls = [
-            item
+            item
+            for item in response.output
             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
         ]
-
+
         if computer_calls:
             # If computer calls exist, process them and set done=False
             done = False
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 action = computer_call.action
-                actions.append(action.model_dump())
+                actions.append(action.model_dump())  # Convert Pydantic model to dict
                 logger.info(f"Computer call action: {action}")
         else:
             # No computer calls, check for a final text message
@@ -188,21 +197,20 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             for item in response.output:
                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
                     # Extract text from content blocks within the message
-                    full_text = "".join(
+                    full_text = "".join(
+                        [c.text for c in item.content if isinstance(c, ResponseOutputText)]
+                    )
                     if full_text:
                         final_text_response = full_text
                         logger.info(f"Final text message: {final_text_response}")
-                        break
-
+                        break  # Stop after finding the first text message
+
             # If we found final text, package it as a 'response' action
             if final_text_response:
-                actions = [{
-                    "type": "response",
-                    "text": final_text_response
-                }]
+                actions = [{"type": "response", "text": final_text_response}]
                 # Keep done = True
             else:
                 logger.info("No computer calls and no final text message found.")
                 # Keep done = True, actions remains empty
 
-        return actions, done
+        return actions, done