hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic (further details are available on the package's registry page).
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
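One change that recurs across the agent diffs below is the import path for Observation, which in 0.2.4 comes from hud.utils.common. A minimal sketch of downstream code written against 0.2.4 follows; the keyword arguments are an assumption inferred from the attribute reads visible in the diffs (observation.text, observation.screenshot), not a documented constructor.

    # Hypothetical usage against hud-python 0.2.4; field names are assumed from the
    # attribute reads in the diffs below (observation.text, observation.screenshot).
    from hud.utils.common import Observation

    obs = Observation(text="Open the settings page", screenshot=None)
    print(obs.text, obs.screenshot is None)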
hud/agent/claude.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Any, cast
 
 from anthropic import AsyncAnthropic
@@ -14,52 +13,48 @@ from anthropic.types.beta import (
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.claude import ClaudeAdapter
-from hud.
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 def base64_to_content_block(base64: str) -> BetaImageBlockParam:
     return {
         "type": "image",
-        "source": {
-            "type": "base64",
-            "media_type": "image/png",
-            "data": base64
-        }
+        "source": {"type": "base64", "media_type": "image/png", "data": base64},
     }
 
+
 def text_to_content_block(text: str) -> BetaTextBlockParam:
-    return {
-
-
-
+    return {"type": "text", "text": text}
+
+
+def tool_use_content_block(
+    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
+) -> BetaToolResultBlockParam:
+    return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
 
-def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
-    return {
-        "type": "tool_result",
-        "tool_use_id": tool_use_id,
-        "content": content
-    }
 
 # Claude's Computer Use Tool definition
 COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
-    "type": "computer_20250124",
-    "name": "computer",
-    "display_width_px": 1024,
-    "display_height_px": 768
+    "type": "computer_20250124",
+    "name": "computer",
+    "display_width_px": 1024,
+    "display_height_px": 768,
 }
 
+
 class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     """
     An agent implementation using Anthropic's Claude API with Computer Use.
-
+
     This agent interacts with HUD environments using Claude's Computer Use API
     through the ClaudeAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self,
+        self,
         client: AsyncAnthropic | None = None,
         adapter: Adapter | None = None,
         model: str = "claude-3-7-sonnet-20250219",
@@ -68,7 +63,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     ):
         """
         Initialize the ClaudeAgent.
-
+
         Args:
             client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
            adapter: The adapter to use for preprocessing and postprocessing
@@ -81,28 +76,30 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             # Get API key from settings
             api_key = settings.anthropic_api_key
             if not api_key:
-                raise ValueError(
-
+                raise ValueError(
+                    "Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
+                )
+
             # Create client
             client = AsyncAnthropic(api_key=api_key)
 
         adapter = adapter or ClaudeAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.max_tokens = max_tokens
         self.max_iterations = max_iterations
-
+
         # Default dimensions - will be updated if adapter is provided
         self.width_px = 1024
         self.height_px = 768
-
+
         # Update dimensions if adapter is provided
         if self.adapter:
             self.width_px = self.adapter.agent_width
             self.height_px = self.adapter.agent_height
-
+
         # Message history
         self.messages: list[BetaMessageParam] = []
         self.pending_computer_use_tool_id = None
@@ -110,17 +107,17 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
         """
         Fetch a response from Claude based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[Any], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Prepare the user content for Claude
         user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
 
@@ -128,7 +125,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
         if observation.text:
             logger.info("Adding text to user content: %s", observation.text)
             user_content.append(text_to_content_block(str(observation.text)))
-
+
         # Add screenshot if present
         if observation.screenshot:
             logger.info("Adding screenshot to user content")
@@ -136,20 +133,28 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
                 logger.info("Adding screenshot to user content, no tool id")
                 user_content.append(base64_to_content_block(observation.screenshot))
             else:
-                logger.info(
+                logger.info(
+                    "Adding screenshot to user content, tool id: %s",
+                    self.pending_computer_use_tool_id,
+                )
                 user_content.append(
                     tool_use_content_block(
-                        self.pending_computer_use_tool_id,
-                        [base64_to_content_block(observation.screenshot)]
+                        self.pending_computer_use_tool_id,
+                        [base64_to_content_block(observation.screenshot)],
                     )
                 )
                 self.pending_computer_use_tool_id = None
 
         # Add the user content to the messages
-        self.messages.append(
-
-
-
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "user",
+                    "content": user_content,
+                },
+            )
+        )
 
         # Call Claude API using async client
         response = await self.client.beta.messages.create(
@@ -158,30 +163,35 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             messages=self.messages,
             tools=[COMPUTER_TOOL],
             betas=["computer-use-2025-01-24"],
-            tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True},
         )
 
         # Add Claude's response to the conversation history
         response_content = response.content
-        self.messages.append(
-
-
-
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "assistant",
+                    "content": response_content,
+                },
+            )
+        )
 
         # Process tool use
         actions: list[Any] = []
         done = True  # Assume we're done unless we find a tool use
-
+
         for block in response_content:
             logger.info("Processing block: %s", block)
             if block.type == "tool_use":
                 logger.info("Processing tool use: %s", block)
                 assert block.name == "computer"
-
+
                 # Store the raw action
                 actions.append(block.input)
                 self.pending_computer_use_tool_id = block.id
-
+
                 # If we found a tool use, we're not done
                 done = False
                 break
@@ -192,16 +202,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             for block in response_content:
                 if block.type == "text":
                     final_text_response += block.text
-
+
             if final_text_response.strip():
-                logger.info(
-
-
-
-                }]
+                logger.info(
+                    f"No tool use found. Using final text as response: {final_text_response}"
+                )
+                actions = [{"action": "response", "text": final_text_response.strip()}]
                # Keep done = True
             else:
-
-
-
+                logger.info("No tool use and no final text block found.")
+                # Keep done = True, actions remains empty
+
         return actions, done
hud/agent/langchain.py
CHANGED
@@ -10,9 +10,8 @@ from pydantic import Field, BaseModel
 # HUD imports
 from hud.adapters import Adapter
 from hud.agent.base import Agent
-from hud.
+from hud.utils.common import Observation
 from hud.adapters.common.types import (
-    CLA,
     ClickAction,
     TypeAction,
     ScrollAction,
@@ -44,17 +43,23 @@ SingleCLAction = Union[
     ResponseAction,
 ]
 
+
 # Define a Pydantic model to wrap the single action, potentially making it
 # easier for the LLM to consistently output the desired structure.
 class StepAction(BaseModel):
     """Wrapper model requesting a single concrete CLA action from the Langchain model."""
-
+
+    action: SingleCLAction = Field(
+        ..., description="The single CLA action to perform for this step."
+    )
+
 
 # Generic Type for the Langchain Model/Runnable
 # Allows flexibility in what the user provides (model, chain, etc.)
 # Bound to BaseLanguageModel as .with_structured_output is expected
 LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
 
+
 class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
     """
     An agent that uses an arbitrary Langchain model or runnable, leveraging
@@ -80,8 +85,8 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             system_prompt: An optional system prompt to guide the Langchain model.
                 If None, a default prompt encouraging single CLA output is used.
         """
-        super().__init__(client=langchain_model, adapter=adapter)
-        self.langchain_model = langchain_model
+        super().__init__(client=langchain_model, adapter=adapter)  # Store model as 'client'
+        self.langchain_model = langchain_model  # Also store with specific name
 
         self.system_prompt_str = system_prompt or self._get_default_system_prompt()
         self.history: List[BaseMessage] = []
@@ -97,7 +102,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
         )
 
-    async def fetch_response(self, observation: Observation) -> tuple[
+    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
         """
         Fetches a response from the configured Langchain model, expecting a single
         structured CLA action.
@@ -117,17 +122,17 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
         if observation.screenshot:
             # Assuming the Langchain model/chain can handle base64 images
             # This might need adjustment based on the specific model used.
-            human_content.append(
-
-
-                    "url": f"data:image/png;base64,{observation.screenshot}"
+            human_content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
                 }
-
-
+            )
+
         if not human_content:
-
-
-
+            logger.warning("LangchainAgent received an observation with no text or screenshot.")
+            # Decide how to handle empty observation - perhaps return no action?
+            return [], False  # Or raise an error?
 
         current_human_message = HumanMessage(content=human_content)
 
@@ -142,8 +147,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
         # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
         # Explicitly use method="function_calling" to handle schemas with default values
         structured_llm = self.langchain_model.with_structured_output(
-            schema=StepAction,
-            method="function_calling"
+            schema=StepAction, method="function_calling"
         )
 
         # 4. Invoke Langchain model asynchronously
@@ -156,7 +160,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
 
         # 5. Process the structured response
         is_done = False
-        ai_message_content_for_history = ""
+        ai_message_content_for_history = ""  # For storing in history
 
         if isinstance(ai_response_structured, StepAction):
             # Successfully got the wrapper, extract the actual action
@@ -164,22 +168,24 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
             ai_message_content_for_history = actual_action.model_dump()
             if isinstance(actual_action, ResponseAction):
                 is_done = True
-                logger.info(
+                logger.info(
+                    f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
+                )
             else:
-
+                logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
 
         else:
             logger.warning(
                 f"Langchain model did not return the expected StepAction structure. "
                 f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
             )
-
+            # Attempt to add raw response to history for debugging
             if isinstance(ai_response_structured, BaseMessage):
-
+                ai_message_content_for_history = ai_response_structured.content
             elif isinstance(ai_response_structured, str):
-
+                ai_message_content_for_history = ai_response_structured
             else:
-
+                ai_message_content_for_history = repr(ai_response_structured)
             # Return no action as we didn't get the expected structure
             return [], False
 
@@ -192,7 +198,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
 
         if actual_action:
             # Return the single action dictionary within a list
-            return [actual_action], is_done
+            return [actual_action.model_dump()], is_done
         else:
             # Should ideally not happen if structure validation worked, but as a fallback
-            return [], is_done
+            return [], is_done
hud/agent/operator.py
CHANGED
@@ -10,36 +10,37 @@ from openai.types.responses import (
     ResponseInputItemParam,
     ResponseOutputMessage,
     ResponseComputerToolCall,
-    ResponseOutputText
+    ResponseOutputText,
 )
 
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.operator import OperatorAdapter
-from hud.
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     """
     An agent implementation using OpenAI's Computer Use API.
-
+
     This agent interacts with HUD environments using OpenAI's Computer Use API
     through the OperatorAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self,
+        self,
         client: OpenAI | None = None,
         model: str = "computer-use-preview",
         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
         adapter: Adapter | None = None,
-        max_iterations: int = 8
+        max_iterations: int = 8,
     ):
         """
         Initialize the OperatorAgent.
-
+
         Args:
             client: The OpenAI client for API calls (optional, created automatically if not provided)
             model: The model to use for computer use
@@ -52,28 +53,30 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             # Get API key from settings
             api_key = settings.openai_api_key
             if not api_key:
-                raise ValueError(
-
+                raise ValueError(
+                    "OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY."
+                )
+
             # Create synchronous client
             client = OpenAI(api_key=api_key)
 
         adapter = adapter or OperatorAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.environment = environment
         self.max_iterations = max_iterations
-
+
         # Default dimensions
         self.width = 1024
         self.height = 768
-
+
         # Update dimensions if adapter is provided
         if self.adapter:
             self.width = self.adapter.agent_width
             self.height = self.adapter.agent_height
-
+
         # Message history and state tracking
         self.last_response_id = None
         self.pending_call_id = None
@@ -82,86 +85,91 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from the model based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Define the computer use tool with correct type using cast
-        computer_tool = cast(
-
-
-
-
-
-
+        computer_tool = cast(
+            ToolParam,
+            {
+                "type": "computer_use_preview",
+                "display_width": self.width,
+                "display_height": self.height,
+                "environment": self.environment,
+            },
+        )
+
         # Process the observation based on whether it's the first one or a response to an action
         if self.pending_call_id is None and self.last_response_id is None:
             # This is the first observation, store and send the prompt
             self.initial_prompt = observation.text
-
+
             # Create the initial request following the required structure
             input_content: list[dict[str, Any]] = [
                 {"type": "input_text", "text": observation.text or ""}
             ]
-
+
             # Add screenshot if present
             if observation.screenshot:
-                input_content.append(
-
-
-
-
+                input_content.append(
+                    {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{observation.screenshot}",
+                    }
+                )
+
             # Structure the input correctly for the API using cast
-            input_param = cast(ResponseInputParam, [{
-
-                "content": input_content
-            }])
-
+            input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])
+
             # Call OpenAI API for the initial prompt (synchronous call)
             response = self.client.responses.create(
-                model=self.model,
-                tools=[computer_tool],
-                input=input_param,
-                truncation="auto"
+                model=self.model, tools=[computer_tool], input=input_param, truncation="auto"
             )
-
+
         else:
             # This is a response to a previous action
             if not observation.screenshot:
                 logger.warning("No screenshot provided for response to action")
                 return [], True
-
+
             # Create a response to the previous action with the new screenshot
-            input_param_followup = cast(
-
-
-
-
-
-
-
-
-
-
+            input_param_followup = cast(
+                ResponseInputParam,
+                [
+                    cast(
+                        ResponseInputItemParam,
+                        {
+                            "call_id": self.pending_call_id,
+                            "type": "computer_call_output",
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{observation.screenshot}",
+                            },
+                        },
+                    )
+                ],
+            )
+
             # Call OpenAI API for follow-up (synchronous call)
             response = self.client.responses.create(
                 model=self.model,
                 previous_response_id=self.last_response_id,
                 tools=[computer_tool],
                 input=input_param_followup,
-                truncation="auto"
+                truncation="auto",
             )
-
+
         # Store the response ID for the next call
         self.last_response_id = response.id
-
+
         # Process the response to extract actions or final text
         actions = []
         done = True  # Assume done unless a computer call is found
@@ -169,17 +177,18 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
 
         # Check for computer calls first
         computer_calls = [
-            item
+            item
+            for item in response.output
             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
         ]
-
+
         if computer_calls:
             # If computer calls exist, process them and set done=False
             done = False
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 action = computer_call.action
-                actions.append(action.model_dump())
+                actions.append(action.model_dump())  # Convert Pydantic model to dict
                 logger.info(f"Computer call action: {action}")
         else:
             # No computer calls, check for a final text message
@@ -188,21 +197,20 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             for item in response.output:
                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
                     # Extract text from content blocks within the message
-                    full_text = "".join(
+                    full_text = "".join(
+                        [c.text for c in item.content if isinstance(c, ResponseOutputText)]
+                    )
                     if full_text:
                         final_text_response = full_text
                         logger.info(f"Final text message: {final_text_response}")
-                        break
-
+                        break  # Stop after finding the first text message
+
             # If we found final text, package it as a 'response' action
             if final_text_response:
-                actions = [{
-                    "type": "response",
-                    "text": final_text_response
-                }]
+                actions = [{"type": "response", "text": final_text_response}]
                 # Keep done = True
             else:
                 logger.info("No computer calls and no final text message found.")
                 # Keep done = True, actions remains empty
 
-        return actions, done
+        return actions, done