hud-python 0.4.21__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release. This version of hud-python might be problematic.
- hud/agents/base.py +2 -0
- hud/agents/claude.py +11 -6
- hud/agents/grounded_openai.py +280 -0
- hud/agents/tests/test_client.py +6 -1
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/eval.py +2 -2
- hud/cli/utils/interactive.py +1 -1
- hud/settings.py +6 -0
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +301 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/utils/agent_factories.py +86 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/METADATA +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/RECORD +27 -18
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
hud/agents/claude.py
CHANGED
@@ -364,16 +364,21 @@ class ClaudeAgent(MCPAgent):
         messages_cached = copy.deepcopy(messages)
 
         # Mark last user message with cache control
-        if
+        if (
+            messages_cached
+            and isinstance(messages_cached[-1], dict)
+            and messages_cached[-1].get("role") == "user"
+        ):
             last_content = messages_cached[-1]["content"]
             # Content is formatted to be list of ContentBlock in format_blocks and format_message
             if isinstance(last_content, list):
                 for block in last_content:
-                    # Only add cache control to block types that support it
-
-
-
-
+                    # Only add cache control to dict-like block types that support it
+                    if isinstance(block, dict):
+                        block_type = block.get("type")
+                        if block_type in ["text", "image", "tool_use", "tool_result"]:
+                            cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
+                            block["cache_control"] = cache_control  # type: ignore[reportGeneralTypeIssues]
 
         return messages_cached
 
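The rewritten block above only annotates dict-shaped content blocks, so an SDK object in the content list no longer breaks the caching pass. For orientation, a minimal sketch of the message shape this produces, assuming Anthropic-style prompt caching (illustrative payload, not taken from the package):

# Hypothetical last user message after the cache-control pass above.
# Anthropic prompt caching can reuse any prompt prefix that ends at a
# block carrying {"cache_control": {"type": "ephemeral"}}.
last_user_message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Click the submit button.",
            "cache_control": {"type": "ephemeral"},
        }
    ],
}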
hud/agents/grounded_openai.py
ADDED
@@ -0,0 +1,280 @@
+"""Grounded OpenAI agent that separates visual grounding from reasoning."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from hud import instrument
+from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+
+from .openai_chat_generic import GenericOpenAIChatAgent
+
+
+class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
+    """OpenAI agent that uses a separate grounding model for element detection.
+
+    This agent:
+    - Exposes only a synthetic "computer" tool to the planning model
+    - Intercepts tool calls to ground element descriptions to coordinates
+    - Converts grounded results to real computer tool calls
+    - Maintains screenshot state for grounding operations
+
+    The architecture separates concerns:
+    - Planning model (GPT-4o etc) focuses on high-level reasoning
+    - Grounding model (Qwen2-VL etc) handles visual element detection
+    """
+
+    def __init__(
+        self,
+        *,
+        grounder_config: GrounderConfig,
+        model_name: str = "gpt-4o-mini",
+        allowed_tools: list[str] | None = None,
+        append_setup_output: bool = False,
+        system_prompt: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the grounded OpenAI agent.
+
+        Args:
+            grounder_config: Configuration for the grounding model
+            openai_client: OpenAI client for the planning model
+            model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
+            real_computer_tool_name: Name of the actual computer tool to execute
+            **kwargs: Additional arguments passed to GenericOpenAIChatAgent
+        """
+        # Set defaults for grounded agent
+        if allowed_tools is None:
+            allowed_tools = ["computer"]
+
+        if system_prompt is None:
+            system_prompt = (
+                "You are a helpful AI assistant that can control the computer "
+                "through visual interaction.\n\n"
+                "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
+                "1. First, describe what you see on the screen\n"
+                "2. Explain what you plan to do and why\n"
+                "3. Then use the computer tool with natural language descriptions\n\n"
+                "For example:\n"
+                "- 'I can see a login form with username and password fields. "
+                "I need to click on the username field first.'\n"
+                "- 'There's a blue submit button at the bottom. "
+                "I'll click on it to submit the form.'\n"
+                "- 'I notice a red close button in the top right corner. "
+                "I'll click it to close this dialog.'\n\n"
+                "Use descriptive element descriptions like:\n"
+                "- Colors: 'red button', 'blue link', 'green checkmark'\n"
+                "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
+                "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
+                "- Element type: 'text field', 'dropdown menu', 'checkbox'"
+            )
+
+        super().__init__(
+            model_name=model_name,
+            allowed_tools=allowed_tools,
+            append_setup_output=append_setup_output,
+            system_prompt=system_prompt,
+            **kwargs,
+        )
+
+        self.grounder = Grounder(grounder_config)
+        self.grounded_tool = None
+
+    async def initialize(self, task: Any = None) -> None:
+        """Initialize the agent and create the grounded tool with mcp_client."""
+        # Call parent initialization first
+        await super().initialize(task)
+
+        if self.mcp_client is None:
+            raise ValueError("mcp_client must be initialized before creating grounded tool")
+        self.grounded_tool = GroundedComputerTool(
+            grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
+        )
+
+    def get_tool_schemas(self) -> list[Any]:
+        """Override to expose only the synthetic grounded tool.
+
+        The planning model only sees the synthetic "computer" tool,
+        which is provided by the grounded tool itself.
+
+        Returns:
+            List containing only the grounded computer tool schema
+        """
+        if self.grounded_tool is None:
+            return []
+        return [self.grounded_tool.get_openai_tool_schema()]
+
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
+    async def get_response(self, messages: Any) -> AgentResponse:
+        """Get response from the planning model and handle grounded tool calls.
+
+        This method:
+        1. Calls the planning model with the grounded tool schema
+        2. Executes any tool calls directly through the grounded tool
+        3. Returns the response
+
+        Args:
+            messages: Conversation messages
+
+        Returns:
+            AgentResponse with either content or tool calls for MCP execution
+        """
+        tool_schemas = self.get_tool_schemas()
+
+        # Take initial screenshot and add to messages if this is the first turn
+        has_image = any(
+            isinstance(m.get("content"), list)
+            and any(
+                block.get("type") == "image_url"
+                for block in m["content"]
+                if isinstance(block, dict)
+            )
+            for m in messages
+            if isinstance(m.get("content"), list)
+        )
+
+        if not has_image:
+            if self.mcp_client is None:
+                raise ValueError("mcp_client is not initialized")
+            screenshot_result = await self.mcp_client.call_tool(
+                MCPToolCall(name="computer", arguments={"action": "screenshot"})
+            )
+
+            for block in screenshot_result.content:
+                # Check for ImageContent type from MCP
+                if hasattr(block, "data") and hasattr(block, "mimeType"):
+                    mime_type = getattr(block, "mimeType", "image/png")
+                    data = getattr(block, "data", "")
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:{mime_type};base64,{data}"},
+                                }
+                            ],
+                        }
+                    )
+                    break
+
+        protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
+        extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
+
+        response = await self.oai.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            tools=tool_schemas,
+            parallel_tool_calls=False,
+            **extra,
+        )
+
+        choice = response.choices[0]
+        msg = choice.message
+
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+        if msg.content:
+            assistant_msg["content"] = msg.content
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+
+        messages.append(assistant_msg)
+
+        self.conversation_history = messages.copy()
+
+        if not msg.tool_calls:
+            return AgentResponse(
+                content=msg.content or "",
+                tool_calls=[],
+                done=choice.finish_reason in ("stop", "length"),
+                raw=response,
+            )
+
+        tc = msg.tool_calls[0]
+
+        if tc.function.name != "computer":
+            return AgentResponse(
+                content=f"Error: Model called unexpected tool '{tc.function.name}'",
+                tool_calls=[],
+                done=True,
+                raw=response,
+            )
+
+        # Parse the arguments
+        try:
+            args = json.loads(tc.function.arguments or "{}")
+        except json.JSONDecodeError:
+            return AgentResponse(
+                content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
+            )
+
+        tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
+
+        return AgentResponse(
+            content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
+        )
+
+    async def call_tools(
+        self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
+    ) -> list[MCPToolResult]:
+        """Override call_tools to intercept computer tool calls.
+
+        Execute them through grounded tool.
+        """
+        if tool_call is None:
+            return []
+
+        if isinstance(tool_call, MCPToolCall):
+            tool_call = [tool_call]
+
+        results: list[MCPToolResult] = []
+        for tc in tool_call:
+            if tc.name == "computer":
+                # Execute through grounded tool instead of MCP
+                try:
+                    # Extract latest screenshot from conversation history
+                    screenshot_b64 = None
+                    for m in reversed(self.conversation_history):
+                        if m.get("role") == "user" and isinstance(m.get("content"), list):
+                            for block in m["content"]:
+                                if (
+                                    isinstance(block, dict)
+                                    and block.get("type") == "image_url"
+                                    and isinstance(block.get("image_url"), dict)
+                                ):
+                                    url = block["image_url"].get("url", "")
+                                    if url.startswith("data:"):
+                                        screenshot_b64 = (
+                                            url.split(",", 1)[1] if "," in url else None
+                                        )
+                                    break
+                            if screenshot_b64:
+                                break
+
+                    # Pass screenshot to grounded tool
+                    args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
+                    if screenshot_b64:
+                        args_with_screenshot["screenshot_b64"] = screenshot_b64
+
+                    if self.grounded_tool is None:
+                        raise ValueError("Grounded tool is not initialized")
+                    content_blocks = await self.grounded_tool(**args_with_screenshot)
+                    results.append(MCPToolResult(content=content_blocks, isError=False))
+                except Exception as e:
+                    # Create error result
+                    from mcp.types import TextContent
+
+                    error_content = TextContent(text=str(e), type="text")
+                    results.append(MCPToolResult(content=[error_content], isError=True))
+            else:
+                # For non-computer tools, use parent implementation
+                parent_results = await super().call_tools(tc)
+                results.extend(parent_results)
+
+        return results
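For orientation, a hedged construction sketch for the new agent. The endpoint URL and model names are placeholders, and any constructor options beyond those visible in this diff are assumptions about what GenericOpenAIChatAgent accepts:

from hud.agents.grounded_openai import GroundedOpenAIChatAgent
from hud.tools.grounding import GrounderConfig

# Placeholder endpoint for a self-hosted grounding model such as Qwen2-VL.
cfg = GrounderConfig(api_base="http://localhost:8000/v1", model="qwen2-vl")

# The planning model only ever sees the synthetic "computer" tool;
# grounding of element descriptions happens inside call_tools().
agent = GroundedOpenAIChatAgent(grounder_config=cfg, model_name="gpt-4o-mini")

After `await agent.initialize(task)`, the GroundedComputerTool is created against the live mcp_client and `get_tool_schemas()` starts returning its schema.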
hud/agents/tests/test_client.py
CHANGED
@@ -200,7 +200,12 @@ class TestMCPClient:
         # Calling a non-existent tool should return an error result
         result = await client.call_tool(name="nonexistent", arguments={})
         assert result.isError is True
-
+        # Check that the error message is in the text content
+        text_content = ""
+        for content in result.content:
+            if isinstance(content, types.TextContent):
+                text_content += content.text
+        assert "Tool 'nonexistent' not found" in text_content
 
     @pytest.mark.asyncio
     async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
hud/agents/tests/test_grounded_openai_agent.py
ADDED
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+import mcp.types as types
+import pytest
+
+from hud.agents.grounded_openai import GroundedOpenAIChatAgent
+from hud.tools.grounding import GrounderConfig
+from hud.types import MCPToolCall, MCPToolResult
+
+
+class DummyOpenAI:
+    class chat:  # type: ignore[no-redef]
+        class completions:
+            @staticmethod
+            async def create(**kwargs: Any) -> Any:
+                # Return a minimal object mimicking OpenAI response
+                class Msg:
+                    def __init__(self) -> None:
+                        self.content = "Thinking..."
+                        self.tool_calls = [
+                            type(
+                                "ToolCall",
+                                (),
+                                {
+                                    "id": "call_1",
+                                    "function": type(
+                                        "Fn",
+                                        (),
+                                        {
+                                            "name": "computer",
+                                            "arguments": json.dumps(
+                                                {
+                                                    "action": "click",
+                                                    "element_description": "blue button",
+                                                }
+                                            ),
+                                        },
+                                    ),
+                                },
+                            )()
+                        ]
+
+                class Choice:
+                    def __init__(self) -> None:
+                        self.message = Msg()
+                        self.finish_reason = "tool_calls"
+
+                class Resp:
+                    def __init__(self) -> None:
+                        self.choices = [Choice()]
+
+                return Resp()
+
+
+class FakeMCPClient:
+    def __init__(self) -> None:
+        self.tools: list[types.Tool] = [
+            types.Tool(name="computer", description="", inputSchema={}),
+            types.Tool(name="setup", description="internal functions", inputSchema={}),
+        ]
+        self.called: list[MCPToolCall] = []
+
+    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
+        return None
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self.tools
+
+    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
+        self.called.append(tool_call)
+        return MCPToolResult(content=[types.TextContent(text="ok", type="text")], isError=False)
+
+    @property
+    def mcp_config(self) -> dict[str, dict[str, Any]]:
+        return {"local": {"command": "echo", "args": ["ok"]}}
+
+    async def shutdown(self) -> None:
+        return None
+
+    async def list_resources(self) -> list[types.Resource]:  # not used here
+        return []
+
+    async def read_resource(self, uri: str) -> types.ReadResourceResult | None:
+        return None
+
+
+class DummyGrounder:
+    async def predict_click(self, *, image_b64: str, instruction: str, max_retries: int = 3):
+        return (7, 9)
+
+
+class DummyGroundedTool:
+    def __init__(self) -> None:
+        self.last_args: dict[str, Any] | None = None
+
+    async def __call__(self, **kwargs: Any):
+        self.last_args = kwargs
+        return [types.TextContent(text="ok", type="text")]
+
+    def get_openai_tool_schema(self) -> dict:
+        return {
+            "type": "function",
+            "function": {"name": "computer", "parameters": {"type": "object"}},
+        }
+
+
+@pytest.mark.asyncio
+async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
+    # Agent with fake OpenAI client and fake MCP client
+    grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
+    agent = GroundedOpenAIChatAgent(
+        grounder_config=grounder_cfg,
+        openai_client=DummyOpenAI(),
+        model_name="gpt-4o-mini",
+        mcp_client=FakeMCPClient(),
+        initial_screenshot=False,
+    )
+
+    # Inject a dummy grounded tool to observe args without full initialization
+    dummy_tool = DummyGroundedTool()
+    agent.grounded_tool = dummy_tool  # type: ignore
+
+    # Seed conversation history with a user image
+    png_b64 = (
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
+        "J2n0mQAAAABJRU5ErkJggg=="
+    )
+    agent.conversation_history = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
+            ],
+        }
+    ]
+
+    # Build a tool call as GroundedOpenAIChatAgent.get_response would produce
+    tool_call = MCPToolCall(
+        name="computer", arguments={"action": "click", "element_description": "blue button"}
+    )
+
+    results = await agent.call_tools(tool_call)
+
+    # One result returned
+    assert len(results) == 1 and not results[0].isError
+
+    # Grounded tool received screenshot_b64 injected
+    assert dummy_tool.last_args is not None
+    assert dummy_tool.last_args["action"] == "click"
+    assert dummy_tool.last_args["element_description"] == "blue button"
+    assert "screenshot_b64" in dummy_tool.last_args
+    assert isinstance(dummy_tool.last_args["screenshot_b64"], str)
hud/cli/eval.py
CHANGED
@@ -87,7 +87,7 @@ async def run_single_task(
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python\
+            "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
         )
         raise typer.Exit(1) from e
 
@@ -111,7 +111,7 @@ async def run_single_task(
     except ImportError as e:
         design.error(
             "OpenAI agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python\
+            "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
        )
        raise typer.Exit(1) from e
 
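The replaced install hint appears to work around console markup: `design.error(...)` renders through Rich/Typer, where a literal `[agent]` is read as a markup tag rather than printed. A standalone sketch of the general problem using Rich's own `escape` helper (these names are Rich's API, not this package's):

from rich.console import Console
from rich.markup import escape

console = Console()
# With markup enabled, "[agent]" would be treated as a style tag, not literal text.
console.print(escape("pip install 'hud-python[agent]'"))  # prints the brackets as-is

The release instead substitutes the look-alike brackets U+27E6/U+27E7, which Rich does not parse as markup.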
hud/cli/utils/interactive.py
CHANGED
hud/settings.py
CHANGED
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
         validation_alias="OPENAI_API_KEY",
     )
 
+    openrouter_api_key: str | None = Field(
+        default=None,
+        description="API key for OpenRouter models",
+        validation_alias="OPENROUTER_API_KEY",
+    )
+
     wandb_api_key: str | None = Field(
         default=None,
         description="API key for Weights & Biases",
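The new field follows the surrounding pydantic-settings pattern, so the value is picked up from the OPENROUTER_API_KEY environment variable when the settings object is constructed. A minimal check, assuming `Settings()` can be instantiated directly and no other required fields intervene:

import os

os.environ["OPENROUTER_API_KEY"] = "sk-or-placeholder"  # hypothetical value

from hud.settings import Settings

# validation_alias="OPENROUTER_API_KEY" maps the env var onto the field.
settings = Settings()
assert settings.openrouter_api_key == "sk-or-placeholder"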
hud/tools/executors/tests/test_base_executor.py
CHANGED
@@ -361,5 +361,5 @@ class TestLazyImports:
         """Test lazy import with invalid attribute name."""
         import hud.tools.executors as executors_module
 
-        with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
+        with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
             _ = executors_module.InvalidExecutor
hud/tools/executors/xdo.py
CHANGED
@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
 
         screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
 
-        returncode, _,
+        returncode, _, _stderr = await run(screenshot_cmd)
 
         if returncode == 0 and screenshot_path.exists():
             try:
hud/tools/grounding/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""Grounding module for visual element detection and coordinate resolution."""
+
+from __future__ import annotations
+
+from .config import GrounderConfig
+from .grounded_tool import GroundedComputerTool
+from .grounder import Grounder
+
+__all__ = [
+    "GroundedComputerTool",
+    "Grounder",
+    "GrounderConfig",
+]
hud/tools/grounding/config.py
ADDED
@@ -0,0 +1,54 @@
+"""Configuration for grounding models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+SYSTEM_PROMPT = (
+    "You are a visual grounding model. Given an image and a description, "
+    "return ONLY the center pixel coordinates of the described element as a "
+    "single point in parentheses format: (x, y). Do not return bounding boxes "
+    "or multiple coordinates."
+)
+
+
+@dataclass
+class GrounderConfig:
+    """Configuration for grounding model clients.
+
+    Attributes:
+        api_base: Base URL for the grounding model API endpoint
+        model: Model identifier to use for grounding
+        api_key: API key for authentication (default: "EMPTY" for local models)
+        system_prompt: System prompt to guide the grounding model
+        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
+        parser_regex: Regular expression to parse coordinates from model output
+        resize: Image resizing configuration dictionary
+    """
+
+    api_base: str
+    model: str
+    api_key: str = "EMPTY"
+    system_prompt: str = SYSTEM_PROMPT
+    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
+    parser_regex: str = r"\((\d+),\s*(\d+)\)"
+    resize: dict[str, Any] = field(
+        default_factory=lambda: {
+            "enabled": True,
+            "min_pixels": 3136,
+            "max_pixels": 4096 * 2160,
+            "factor": 28,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
+            raise ValueError(f"Invalid output_format: {self.output_format}")
+
+        if not self.api_base:
+            raise ValueError("api_base is required")
+
+        if not self.model:
+            raise ValueError("model is required")