hud-python 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/__init__.py CHANGED
@@ -5,10 +5,10 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
5
5
  from __future__ import annotations
6
6
 
7
7
  from . import agent, env, gym, settings, task, taskset, types, utils
8
- from .job import create_job, job, load_job
8
+ from .job import create_job, job, load_job, run_job
9
9
  from .taskset import load_taskset
10
10
 
11
- __version__ = "0.2.0"
11
+ __version__ = "0.2.2"
12
12
 
13
13
  __all__ = [
14
14
  "agent",
@@ -18,6 +18,7 @@ __all__ = [
18
18
  "job",
19
19
  "load_job",
20
20
  "load_taskset",
21
+ "run_job",
21
22
  "settings",
22
23
  "task",
23
24
  "taskset",
hud/adapters/__init__.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from .claude import ClaudeAdapter
4
4
  from .common import CLA, Adapter
5
+ from .common.types import ResponseAction
5
6
  from .operator import OperatorAdapter
6
7
 
7
- __all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter"]
8
+ __all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter", "ResponseAction"]
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
13
13
  Point,
14
14
  PositionFetch,
15
15
  PressAction,
16
+ ResponseAction,
16
17
  ScreenshotFetch,
17
18
  ScrollAction,
18
19
  TypeAction,
@@ -21,7 +22,14 @@ from hud.adapters.common.types import (
21
22
 
22
23
 
23
24
  class ClaudeAdapter(Adapter):
24
- KEY_MAP: ClassVar[dict[str, CLAKey]] = {"Return": "enter"}
25
+ KEY_MAP: ClassVar[dict[str, CLAKey]] = {
26
+ "return": "enter",
27
+ "super": "win",
28
+ "super_l": "win",
29
+ "super_r": "win",
30
+ "right shift": "shift",
31
+ "left shift": "shift",
32
+ }
25
33
 
26
34
  def __init__(self) -> None:
27
35
  super().__init__()
@@ -30,7 +38,8 @@ class ClaudeAdapter(Adapter):
30
38
 
31
39
  def _map_key(self, key: str) -> CLAKey:
32
40
  """Map a key to its standardized form."""
33
- return self.KEY_MAP.get(key, key.lower()) # type: ignore
41
+ return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
42
+
34
43
  def convert(self, data: Any) -> CLA:
35
44
  try:
36
45
  action_type = data.get("action")
@@ -151,6 +160,10 @@ class ClaudeAdapter(Adapter):
151
160
  elif action_type == "wait":
152
161
  assert "duration" in data
153
162
  return WaitAction(time=data["duration"])
163
+
164
+ elif action_type == "response":
165
+ return ResponseAction(text=data.get("text", ""))
166
+
154
167
  else:
155
168
  raise ValueError(f"Unsupported action type: {action_type}")
156
169
  except AssertionError:
@@ -20,7 +20,6 @@ class Point(BaseModel):
20
20
  class ClickAction(CLAAction):
21
21
  type: Literal["click"] = "click"
22
22
  point: Point | None = None
23
- selector: str | None = None
24
23
  button: Literal["left", "right", "wheel", "back", "forward"] = "left"
25
24
  pattern: list[int] | None = None # [delay_1, delay_2, ...]
26
25
  hold_keys: list[CLAKey] | None = None
@@ -48,7 +47,6 @@ class KeyUpAction(CLAAction):
48
47
  class TypeAction(CLAAction):
49
48
  type: Literal["type"] = "type"
50
49
  text: str
51
- selector: str | None = None
52
50
  enter_after: bool | None = False
53
51
 
54
52
 
@@ -64,7 +62,6 @@ class ScrollAction(CLAAction):
64
62
  class MoveAction(CLAAction):
65
63
  type: Literal["move"] = "move"
66
64
  point: Point | None = None
67
- selector: str | None = None
68
65
  offset: Point | None = None
69
66
 
70
67
 
@@ -82,6 +79,12 @@ class DragAction(CLAAction):
82
79
  hold_keys: list[CLAKey] | None = None
83
80
 
84
81
 
82
+ # RESPONSE ACTION from agent
83
+ class ResponseAction(CLAAction):
84
+ type: Literal["response"] = "response"
85
+ text: str # The final textual response from the agent
86
+
87
+
85
88
  # SCREENSHOT ACTION
86
89
  class ScreenshotFetch(CLAAction):
87
90
  type: Literal["screenshot"] = "screenshot"
@@ -103,6 +106,7 @@ CLA = Annotated[
103
106
  | KeyDownAction
104
107
  | KeyUpAction
105
108
  | TypeAction
109
+ | ResponseAction
106
110
  | ScrollAction
107
111
  | MoveAction
108
112
  | WaitAction
@@ -10,6 +10,7 @@ from hud.adapters.common.types import (
10
10
  MoveAction,
11
11
  Point,
12
12
  PressAction,
13
+ ResponseAction,
13
14
  ScreenshotFetch,
14
15
  ScrollAction,
15
16
  TypeAction,
@@ -19,11 +20,11 @@ from hud.adapters.common.types import (
19
20
 
20
21
  class OperatorAdapter(Adapter):
21
22
  KEY_MAP: ClassVar[dict[str, CLAKey]] = {
22
- "Return": "enter",
23
- "ArrowUp": "up",
24
- "ArrowDown": "down",
25
- "ArrowLeft": "left",
26
- "ArrowRight": "right",
23
+ "return": "enter",
24
+ "arrowup": "up",
25
+ "arrowdown": "down",
26
+ "arrowleft": "left",
27
+ "arrowright": "right",
27
28
  }
28
29
 
29
30
  def __init__(self) -> None:
@@ -34,7 +35,7 @@ class OperatorAdapter(Adapter):
34
35
 
35
36
  def _map_key(self, key: str) -> CLAKey:
36
37
  """Map a key to its standardized form."""
37
- return self.KEY_MAP.get(key, key.lower()) # type: ignore
38
+ return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
38
39
 
39
40
  def convert(self, data: Any) -> CLA:
40
41
  """Convert a Computer Use action to a HUD action"""
@@ -86,6 +87,9 @@ class OperatorAdapter(Adapter):
86
87
 
87
88
  elif action_type == "screenshot":
88
89
  return ScreenshotFetch()
90
+
91
+ elif action_type == "response":
92
+ return ResponseAction(text=data.get("text", ""))
89
93
  else:
90
94
  raise ValueError(f"Unsupported action type: {action_type}")
91
95
 
hud/agent/__init__.py CHANGED
@@ -1,7 +1,8 @@
1
1
  from .base import Agent
2
2
  from .claude import ClaudeAgent
3
3
  from .operator import OperatorAgent
4
+ from .langchain import LangchainAgent
4
5
 
5
6
  from hud.adapters import OperatorAdapter, ClaudeAdapter
6
7
 
7
- __all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter"]
8
+ __all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter", "LangchainAgent"]
hud/agent/claude.py CHANGED
@@ -11,7 +11,7 @@ from anthropic.types.beta import (
11
11
  BetaImageBlockParam,
12
12
  )
13
13
 
14
-
14
+ from hud.adapters import Adapter
15
15
  from hud.agent.base import Agent
16
16
  from hud.adapters.claude import ClaudeAdapter
17
17
  from hud.env.environment import Observation
@@ -61,7 +61,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
61
61
  def __init__(
62
62
  self,
63
63
  client: AsyncAnthropic | None = None,
64
- adapter: ClaudeAdapter | None = None,
64
+ adapter: Adapter | None = None,
65
65
  model: str = "claude-3-7-sonnet-20250219",
66
66
  max_tokens: int = 4096,
67
67
  max_iterations: int = 10,
@@ -85,6 +85,8 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
85
85
 
86
86
  # Create client
87
87
  client = AsyncAnthropic(api_key=api_key)
88
+
89
+ adapter = adapter or ClaudeAdapter()
88
90
 
89
91
  super().__init__(client=client, adapter=adapter)
90
92
 
@@ -184,4 +186,22 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
184
186
  done = False
185
187
  break
186
188
 
189
+ # If no tool use action was found, check for a final text response
190
+ if not actions and done:
191
+ final_text_response = ""
192
+ for block in response_content:
193
+ if block.type == "text":
194
+ final_text_response += block.text
195
+
196
+ if final_text_response.strip():
197
+ logger.info(f"No tool use found. Using final text as response: {final_text_response}")
198
+ actions = [{
199
+ "action": "response",
200
+ "text": final_text_response.strip()
201
+ }]
202
+ # Keep done = True
203
+ else:
204
+ logger.info("No tool use and no final text block found.")
205
+ # Keep done = True, actions remains empty
206
+
187
207
  return actions, done
hud/agent/langchain.py ADDED
@@ -0,0 +1,198 @@
1
+ import logging
2
+ from typing import Any, Generic, List, Optional, TypeVar, Union, cast
3
+
4
+ # Langchain imports
5
+ from langchain_core.language_models import BaseLanguageModel
6
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
7
+ from langchain_core.runnables import Runnable, RunnableSerializable
8
+ from pydantic import Field, BaseModel
9
+
10
+ # HUD imports
11
+ from hud.adapters import Adapter
12
+ from hud.agent.base import Agent
13
+ from hud.env.environment import Observation
14
+ from hud.adapters.common.types import (
15
+ CLA,
16
+ ClickAction,
17
+ TypeAction,
18
+ ScrollAction,
19
+ MoveAction,
20
+ DragAction,
21
+ PressAction,
22
+ KeyDownAction,
23
+ KeyUpAction,
24
+ WaitAction,
25
+ ResponseAction,
26
+ CustomAction,
27
+ # Exclude ScreenshotFetch, PositionFetch as they are internal
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Define a Pydantic Union type representing exactly ONE possible CLA action
33
+ # This is what we'll ask the Langchain model to output.
34
+ SingleCLAction = Union[
35
+ ClickAction,
36
+ TypeAction,
37
+ ScrollAction,
38
+ MoveAction,
39
+ DragAction,
40
+ PressAction,
41
+ KeyDownAction,
42
+ KeyUpAction,
43
+ WaitAction,
44
+ ResponseAction,
45
+ ]
46
+
47
+ # Define a Pydantic model to wrap the single action, potentially making it
48
+ # easier for the LLM to consistently output the desired structure.
49
+ class StepAction(BaseModel):
50
+ """Wrapper model requesting a single concrete CLA action from the Langchain model."""
51
+ action: SingleCLAction = Field(..., description="The single CLA action to perform for this step.")
52
+
53
+ # Generic Type for the Langchain Model/Runnable
54
+ # Allows flexibility in what the user provides (model, chain, etc.)
55
+ # Bound to BaseLanguageModel as .with_structured_output is expected
56
+ LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
57
+
58
+ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
59
+ """
60
+ An agent that uses an arbitrary Langchain model or runnable, leveraging
61
+ Langchain's structured output capabilities to produce a single CLA action per step.
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ langchain_model: LangchainModelOrRunnable,
67
+ adapter: Optional[Adapter] = None,
68
+ system_prompt: str | None = None,
69
+ ):
70
+ """
71
+ Initialize the LangchainAgent.
72
+
73
+ Args:
74
+ langchain_model: The Langchain language model or runnable chain to use.
75
+ Must support asynchronous invocation (`ainvoke`) and
76
+ `.with_structured_output()`.
77
+ adapter: An optional HUD adapter. If provided, it will be used for
78
+ preprocessing observations (rescaling) and postprocessing
79
+ the single CLA action (coordinate rescaling).
80
+ system_prompt: An optional system prompt to guide the Langchain model.
81
+ If None, a default prompt encouraging single CLA output is used.
82
+ """
83
+ super().__init__(client=langchain_model, adapter=adapter) # Store model as 'client'
84
+ self.langchain_model = langchain_model # Also store with specific name
85
+
86
+ self.system_prompt_str = system_prompt or self._get_default_system_prompt()
87
+ self.history: List[BaseMessage] = []
88
+
89
+ def _get_default_system_prompt(self) -> str:
90
+ # TODO: Refine this prompt based on testing.
91
+ # It needs to strongly encourage outputting *only* the StepAction structure.
92
+ return (
93
+ "You are an agent interacting with a computer environment (either a web browser or an OS desktop). "
94
+ "Your goal is to follow the user's instructions based on the provided text and screenshot observations."
95
+ "For each step, you must choose exactly ONE action to perform from the available CLA action types."
96
+ "Output your chosen action using the provided 'StepAction' tool/function."
97
+ "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
98
+ )
99
+
100
+ async def fetch_response(self, observation: Observation) -> tuple[CLA | None, bool]:
101
+ """
102
+ Fetches a response from the configured Langchain model, expecting a single
103
+ structured CLA action.
104
+
105
+ Args:
106
+ observation: The preprocessed observation (screenshot potentially rescaled by adapter).
107
+
108
+ Returns:
109
+ A tuple containing:
110
+ - A list with a single dictionary representing the raw CLA action (before adapter postprocessing).
111
+ - A boolean indicating if the agent chose ResponseAction (task completion).
112
+ """
113
+ # 1. Format observation into Langchain message(s)
114
+ human_content: List[Union[str, dict]] = []
115
+ if observation.text:
116
+ human_content.append(observation.text)
117
+ if observation.screenshot:
118
+ # Assuming the Langchain model/chain can handle base64 images
119
+ # This might need adjustment based on the specific model used.
120
+ human_content.append({
121
+ "type": "image_url",
122
+ "image_url": {
123
+ "url": f"data:image/png;base64,{observation.screenshot}"
124
+ }
125
+ })
126
+
127
+ if not human_content:
128
+ logger.warning("LangchainAgent received an observation with no text or screenshot.")
129
+ # Decide how to handle empty observation - perhaps return no action?
130
+ return [], False # Or raise an error?
131
+
132
+ current_human_message = HumanMessage(content=human_content)
133
+
134
+ # 2. Prepare message history for the model
135
+ messages_for_llm: List[BaseMessage] = [
136
+ SystemMessage(content=self.system_prompt_str),
137
+ *self.history,
138
+ current_human_message,
139
+ ]
140
+
141
+ # 3. Configure structured output
142
+ # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
143
+ # Explicitly use method="function_calling" to handle schemas with default values
144
+ structured_llm = self.langchain_model.with_structured_output(
145
+ schema=StepAction,
146
+ method="function_calling"
147
+ )
148
+
149
+ # 4. Invoke Langchain model asynchronously
150
+ try:
151
+ ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
152
+ except Exception as e:
153
+ logger.error(f"Langchain model invocation failed: {e}", exc_info=True)
154
+ # Decide how to handle LLM errors - maybe retry or return empty action?
155
+ return [], False
156
+
157
+ # 5. Process the structured response
158
+ is_done = False
159
+ ai_message_content_for_history = "" # For storing in history
160
+
161
+ if isinstance(ai_response_structured, StepAction):
162
+ # Successfully got the wrapper, extract the actual action
163
+ actual_action = ai_response_structured.action
164
+ ai_message_content_for_history = actual_action.model_dump()
165
+ if isinstance(actual_action, ResponseAction):
166
+ is_done = True
167
+ logger.info(f"LangchainAgent determined task is done with response: {actual_action.text[:100]}...")
168
+ else:
169
+ logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
170
+
171
+ else:
172
+ logger.warning(
173
+ f"Langchain model did not return the expected StepAction structure. "
174
+ f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
175
+ )
176
+ # Attempt to add raw response to history for debugging
177
+ if isinstance(ai_response_structured, BaseMessage):
178
+ ai_message_content_for_history = ai_response_structured.content
179
+ elif isinstance(ai_response_structured, str):
180
+ ai_message_content_for_history = ai_response_structured
181
+ else:
182
+ ai_message_content_for_history = repr(ai_response_structured)
183
+ # Return no action as we didn't get the expected structure
184
+ return [], False
185
+
186
+ # 6. Update history
187
+ self.history.append(current_human_message)
188
+ # Add the AI response (containing the structured action dict) to history
189
+ # Convert dict to string representation for AIMessage content
190
+ self.history.append(AIMessage(content=repr(ai_message_content_for_history)))
191
+ # TODO: Consider history truncation/summarization if it grows too long
192
+
193
+ if actual_action:
194
+ # Return the single action dictionary within a list
195
+ return [actual_action], is_done
196
+ else:
197
+ # Should ideally not happen if structure validation worked, but as a fallback
198
+ return [], is_done
hud/agent/operator.py CHANGED
@@ -9,9 +9,11 @@ from openai.types.responses import (
9
9
  ResponseInputParam,
10
10
  ResponseInputItemParam,
11
11
  ResponseOutputMessage,
12
- ResponseComputerToolCall
12
+ ResponseComputerToolCall,
13
+ ResponseOutputText
13
14
  )
14
15
 
16
+ from hud.adapters import Adapter
15
17
  from hud.agent.base import Agent
16
18
  from hud.adapters.operator import OperatorAdapter
17
19
  from hud.env.environment import Observation
@@ -32,7 +34,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
32
34
  client: OpenAI | None = None,
33
35
  model: str = "computer-use-preview",
34
36
  environment: Literal["windows", "mac", "linux", "browser"] = "windows",
35
- adapter: OperatorAdapter | None = None,
37
+ adapter: Adapter | None = None,
36
38
  max_iterations: int = 8
37
39
  ):
38
40
  """
@@ -54,6 +56,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
54
56
 
55
57
  # Create synchronous client
56
58
  client = OpenAI(api_key=api_key)
59
+
60
+ adapter = adapter or OperatorAdapter()
57
61
 
58
62
  super().__init__(client=client, adapter=adapter)
59
63
 
@@ -74,7 +78,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
74
78
  self.last_response_id = None
75
79
  self.pending_call_id = None
76
80
  self.initial_prompt = None
77
-
81
+
78
82
  async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
79
83
  """
80
84
  Fetch a response from the model based on the observation.
@@ -158,33 +162,47 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
158
162
  # Store the response ID for the next call
159
163
  self.last_response_id = response.id
160
164
 
161
- # Process the response to extract computer calls
165
+ # Process the response to extract actions or final text
162
166
  actions = []
163
- done = True # Assume we're done unless we find a computer call
164
-
165
- # Loop through all items in the output to find computer_call items
167
+ done = True # Assume done unless a computer call is found
168
+ final_text_response = ""
169
+
170
+ # Check for computer calls first
166
171
  computer_calls = [
167
172
  item for item in response.output
168
173
  if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
169
174
  ]
170
175
 
171
176
  if computer_calls:
172
- # Extract the computer calls and mark that we're not done
177
+ # If computer calls exist, process them and set done=False
173
178
  done = False
174
-
175
- # Process all computer calls
176
179
  for computer_call in computer_calls:
177
180
  self.pending_call_id = computer_call.call_id
178
181
  action = computer_call.action
179
- actions.append(action.model_dump())
180
-
181
- # Log the action
182
+ actions.append(action.model_dump()) # Convert Pydantic model to dict
182
183
  logger.info(f"Computer call action: {action}")
183
184
  else:
184
- # If there are no computer calls, print some debug info
185
- logger.info("No computer call found in the response. Either complete or error.")
185
+ # No computer calls, check for a final text message
186
+ logger.info("No computer call found. Checking for final message.")
187
+ logger.info(response.output)
186
188
  for item in response.output:
187
189
  if isinstance(item, ResponseOutputMessage) and item.type == "message":
188
- logger.info(f"Message: {item.content}")
189
-
190
+ # Extract text from content blocks within the message
191
+ full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
192
+ if full_text:
193
+ final_text_response = full_text
194
+ logger.info(f"Final text message: {final_text_response}")
195
+ break # Stop after finding the first text message
196
+
197
+ # If we found final text, package it as a 'response' action
198
+ if final_text_response:
199
+ actions = [{
200
+ "type": "response",
201
+ "text": final_text_response
202
+ }]
203
+ # Keep done = True
204
+ else:
205
+ logger.info("No computer calls and no final text message found.")
206
+ # Keep done = True, actions remains empty
207
+
190
208
  return actions, done
hud/env/docker_client.py CHANGED
@@ -215,7 +215,7 @@ class DockerClient(Client):
215
215
  raise ValueError("Could not find package name in pyproject.toml")
216
216
  logger.info("Installing %s in /root/controller", self._package_name)
217
217
  result = await self.execute(
218
- ["bash", "-c", "cd /root/controller && pip install -e ."],
218
+ ["bash", "-c", "cd /root/controller && pip install -e . --break-system-packages"],
219
219
  timeout=60,
220
220
  )
221
221
  if result["stdout"]: