hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/agents/base.py +37 -37
- hud/agents/claude.py +11 -6
- hud/agents/grounded_openai.py +282 -0
- hud/agents/misc/response_agent.py +3 -2
- hud/agents/openai.py +2 -2
- hud/agents/openai_chat_generic.py +3 -1
- hud/agents/tests/test_client.py +6 -1
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/__init__.py +34 -24
- hud/cli/analyze.py +27 -26
- hud/cli/build.py +50 -46
- hud/cli/debug.py +7 -7
- hud/cli/dev.py +107 -99
- hud/cli/eval.py +33 -31
- hud/cli/hf.py +53 -53
- hud/cli/init.py +28 -28
- hud/cli/list_func.py +22 -22
- hud/cli/pull.py +36 -36
- hud/cli/push.py +76 -74
- hud/cli/remove.py +42 -40
- hud/cli/rl/__init__.py +2 -2
- hud/cli/rl/init.py +41 -41
- hud/cli/rl/pod.py +97 -91
- hud/cli/rl/ssh.py +42 -40
- hud/cli/rl/train.py +75 -73
- hud/cli/rl/utils.py +10 -10
- hud/cli/tests/test_analyze.py +1 -1
- hud/cli/tests/test_analyze_metadata.py +2 -2
- hud/cli/tests/test_pull.py +45 -45
- hud/cli/tests/test_push.py +31 -29
- hud/cli/tests/test_registry.py +15 -15
- hud/cli/utils/environment.py +11 -11
- hud/cli/utils/interactive.py +18 -18
- hud/cli/utils/logging.py +12 -12
- hud/cli/utils/metadata.py +12 -12
- hud/cli/utils/registry.py +5 -5
- hud/cli/utils/runner.py +23 -23
- hud/cli/utils/server.py +16 -16
- hud/settings.py +6 -0
- hud/shared/hints.py +7 -7
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +302 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/types.py +4 -4
- hud/utils/__init__.py +3 -3
- hud/utils/agent_factories.py +86 -0
- hud/utils/{design.py → hud_console.py} +39 -33
- hud/utils/pretty_errors.py +6 -6
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/METADATA +3 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/RECORD +63 -54
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/WHEEL +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/licenses/LICENSE +0 -0
hud/tools/grounding/grounder.py
ADDED

@@ -0,0 +1,302 @@
+"""OpenAI-based grounder for visual element detection."""
+
+from __future__ import annotations
+
+import base64
+import io
+import json
+import re
+
+from openai import AsyncOpenAI
+from opentelemetry import trace
+
+from hud import instrument
+from hud.tools.grounding.config import GrounderConfig  # noqa: TC001
+
+
+class Grounder:
+    """Grounder that uses AsyncOpenAI to call vLLM or other model endpoints for visual grounding.
+
+    This class handles:
+    - Image resizing based on configuration
+    - API calls to grounding models via AsyncOpenAI
+    - Coordinate parsing from model outputs
+    - Coordinate format conversion (pixels, normalized)
+    """
+
+    def __init__(self, config: GrounderConfig) -> None:
+        """Initialize the grounder with configuration.
+
+        Args:
+            config: GrounderConfig with API endpoint, model, and parsing settings
+        """
+        self.config = config
+        self.client = AsyncOpenAI(api_key=config.api_key, base_url=config.api_base)
+
+    def _resize_image(self, image_b64: str) -> tuple[str, tuple[int, int], tuple[int, int]]:
+        """Resize image according to configuration.
+
+        Args:
+            image_b64: Base64-encoded image string
+
+        Returns:
+            Tuple of (processed_base64, (original_width, original_height),
+            (processed_width, processed_height))
+        """
+        # Decode image
+        from PIL import Image
+
+        image_bytes = base64.b64decode(image_b64)
+        img = Image.open(io.BytesIO(image_bytes))
+        original_size = (img.width, img.height)
+
+        if not self.config.resize["enabled"]:
+            return image_b64, original_size, original_size
+
+        # Calculate total pixels
+        total_pixels = img.width * img.height
+        min_pixels = self.config.resize["min_pixels"]
+        max_pixels = self.config.resize["max_pixels"]
+        factor = self.config.resize["factor"]
+
+        # Determine if resizing is needed
+        if total_pixels < min_pixels or total_pixels > max_pixels:
+            # Calculate scaling factor
+            if total_pixels < min_pixels:
+                scale = (min_pixels / total_pixels) ** 0.5
+            else:
+                scale = (max_pixels / total_pixels) ** 0.5
+
+            # Round dimensions to nearest factor
+            new_width = int((img.width * scale) // factor) * factor
+            new_height = int((img.height * scale) // factor) * factor
+
+            # Ensure minimum dimensions
+            new_width = max(new_width, factor)
+            new_height = max(new_height, factor)
+
+            # Resize image
+            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+            # Convert back to base64
+            buffer = io.BytesIO()
+            img.save(buffer, format="PNG")
+            resized_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+            return resized_b64, original_size, (new_width, new_height)
+
+        return image_b64, original_size, original_size
+
+    def _parse_coordinates(self, response_text: str) -> tuple[float, float] | None:
+        """Parse coordinates from model response.
+
+        Handles multiple formats:
+        - (x, y) format from configured regex
+        - [x1, y1, x2, y2] bounding box format (returns center point)
+        - [x, y] point format
+
+        Args:
+            response_text: Text output from the grounding model
+
+        Returns:
+            Tuple of (x, y) coordinates or None if parsing fails
+        """
+        # First try the configured regex pattern
+        match = re.search(self.config.parser_regex, response_text)
+        if match:
+            try:
+                x = float(match.group(1))
+                y = float(match.group(2))
+                return (x, y)
+            except (ValueError, IndexError):
+                # If parsing fails, continue to fallback strategies
+                pass
+
+        # Try to parse as a list/array format [x1, y1, x2, y2] or [x, y]
+        # Also handles (x1, y1, x2, y2)
+        # Updated pattern to handle both integers and floats
+        list_pattern = (
+            r"[\[\(](\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?)"
+            r"(?:[,\s]+(\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?))?[\]\)]"
+        )
+        list_match = re.search(list_pattern, response_text)
+        if list_match:
+            x1 = float(list_match.group(1))
+            y1 = float(list_match.group(2))
+
+            # Check if it's a bounding box (4 values) or a point (2 values)
+            if list_match.group(3) and list_match.group(4):
+                # Bounding box format - return center point
+                x2 = float(list_match.group(3))
+                y2 = float(list_match.group(4))
+                center_x = (x1 + x2) / 2
+                center_y = (y1 + y2) / 2
+                return (center_x, center_y)
+            else:
+                # Point format
+                return (x1, y1)
+
+        return None
+
+    def _convert_coordinates(
+        self,
+        coords: tuple[float, float],
+        processed_size: tuple[int, int],
+        original_size: tuple[int, int],
+    ) -> tuple[int, int]:
+        """Convert coordinates based on output format configuration and scale to original size.
+
+        Args:
+            coords: Raw coordinates from model (can be float for normalized formats)
+            processed_size: Dimensions of the processed/resized image (width, height)
+            original_size: Original image dimensions (width, height)
+
+        Returns:
+            Converted coordinates in original image pixels
+        """
+        x, y = coords
+        proc_width, proc_height = processed_size
+        orig_width, orig_height = original_size
+
+        # First convert to pixels in the processed image space
+        if self.config.output_format == "pixels":
+            # Already in pixels of processed image
+            proc_x, proc_y = x, y
+        elif self.config.output_format == "norm_0_1":
+            # Convert from 0-1 normalized to pixels
+            proc_x = x * proc_width
+            proc_y = y * proc_height
+        elif self.config.output_format == "norm_0_999":
+            # Convert from 0-999 normalized to pixels
+            proc_x = x * proc_width / 999
+            proc_y = y * proc_height / 999
+        else:
+            proc_x, proc_y = x, y
+
+        # Scale from processed image coordinates to original image coordinates
+        scale_x = orig_width / proc_width
+        scale_y = orig_height / proc_height
+
+        final_x = int(proc_x * scale_x)
+        final_y = int(proc_y * scale_y)
+
+        return (final_x, final_y)
+
+    @instrument(
+        name="Grounding.predict_click",
+        span_type="agent",
+        record_args=True,
+        record_result=True,
+    )
+    async def predict_click(
+        self, *, image_b64: str, instruction: str, max_retries: int = 3
+    ) -> tuple[int, int] | None:
+        """Predict click coordinates for the given instruction on the image.
+
+        Args:
+            image_b64: Base64-encoded screenshot
+            instruction: Natural language description of the element to click
+            max_retries: Maximum number of retry attempts (default: 3)
+
+        Returns:
+            Tuple of (x, y) pixel coordinates or None if grounding fails
+        """
+
+        # Resize image once outside the retry loop
+        processed_image, original_size, processed_size = self._resize_image(image_b64)
+
+        # Build messages once
+        messages = []
+
+        # Add system prompt if configured
+        if self.config.system_prompt:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": (
+                        self.config.system_prompt
+                        + f" The image resolution is height {processed_size[1]} "
+                        + f"and width {processed_size[0]}."
+                    ),
+                }
+            )
+
+        # Add user message with image and instruction
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{processed_image}"},
+                    },
+                    {"type": "text", "text": instruction},
+                ],
+            }
+        )
+
+        # Retry loop
+        for attempt in range(max_retries):
+            try:
+                # Call the grounding model via AsyncOpenAI
+                response = await self.client.chat.completions.create(
+                    model=self.config.model,
+                    messages=messages,
+                    temperature=0.0,
+                    max_tokens=50,
+                )
+
+                # Extract response text
+                response_text = response.choices[0].message.content
+
+                # Manually record the raw response in the span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.raw_response", json.dumps(response.model_dump()))
+                    span.set_attribute("grounder.attempt", attempt + 1)
+
+                # Parse coordinates from response
+                if response_text is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                coords = self._parse_coordinates(response_text)
+                if coords is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                # Convert coordinates to original image pixels based on output format and scaling
+                pixel_coords = self._convert_coordinates(coords, processed_size, original_size)
+
+                # Validate coordinates are within image bounds
+                x, y = pixel_coords
+                if x < 0 or y < 0 or x >= original_size[0] or y >= original_size[1]:
+                    # Clamp to image bounds
+                    x = max(0, min(x, original_size[0] - 1))
+                    y = max(0, min(y, original_size[1] - 1))
+                    pixel_coords = (x, y)
+
+                # Record successful grounding in span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.success", True)
+                    span.set_attribute(
+                        "grounder.final_coords", f"{pixel_coords[0]},{pixel_coords[1]}"
+                    )
+                    span.set_attribute("grounder.total_attempts", attempt + 1)
+
+                return pixel_coords
+
+            except Exception:
+                if attempt < max_retries - 1:
+                    continue
+
+        # Record failure in span
+        span = trace.get_current_span()
+        if span and span.is_recording():
+            span.set_attribute("grounder.success", False)
+            span.set_attribute("grounder.total_attempts", max_retries)
+            span.set_attribute("grounder.failure_reason", "All attempts exhausted")
+
+        return None
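For orientation, a minimal usage sketch of the new class (not part of the diff): the GrounderConfig fields shown (api_base, api_key, model) are the ones referenced by grounder.py above; the assumption that hud.tools.grounding re-exports Grounder alongside GrounderConfig follows the package's __init__.py entry in the file list.

# Sketch only: GrounderConfig fields beyond api_base/api_key/model come from
# the unshown hud/tools/grounding/config.py and are left at their defaults.
import asyncio
import base64

from hud.tools.grounding import Grounder, GrounderConfig


async def main() -> None:
    config = GrounderConfig(
        api_base="https://openrouter.ai/api/v1",  # any OpenAI-compatible endpoint
        api_key="sk-...",
        model="qwen/qwen-2.5-vl-7b-instruct",
    )
    grounder = Grounder(config)

    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    # Returns (x, y) in ORIGINAL image pixels, or None once retries are exhausted.
    coords = await grounder.predict_click(
        image_b64=image_b64, instruction="the blue Submit button"
    )
    print(coords)


asyncio.run(main())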
hud/tools/grounding/tests/__init__.py
ADDED

@@ -0,0 +1 @@
+"""Tests for grounding tools."""
hud/tools/grounding/tests/test_grounded_tool.py
ADDED

@@ -0,0 +1,196 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import mcp.types as types
+import pytest
+
+from hud.tools.grounding.grounded_tool import GroundedComputerTool
+from hud.types import MCPToolCall, MCPToolResult
+
+
+@dataclass
+class FakeResult:
+    content: list[types.ContentBlock]
+    isError: bool = False
+    structuredContent: dict | None = None
+
+
+class FakeMCPClient:
+    """Fake MCP client that implements AgentMCPClient protocol."""
+
+    _initialized: bool
+
+    def __init__(self) -> None:
+        self.calls: list[tuple[str, dict[str, Any]]] = []
+        self._initialized = False
+
+    @property
+    def mcp_config(self) -> dict[str, dict[str, Any]]:
+        return {"test": {"command": "echo", "args": ["test"]}}
+
+    @property
+    def is_connected(self) -> bool:
+        return self._initialized
+
+    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
+        self._initialized = True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return [types.Tool(name="computer", description="Test tool", inputSchema={})]
+
+    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
+        self.calls.append((tool_call.name, tool_call.arguments or {}))
+        return MCPToolResult(content=[types.TextContent(text="ok", type="text")], isError=False)
+
+    async def shutdown(self) -> None:
+        self._initialized = False
+
+
+class FakeGrounder:
+    """Fake grounder that implements Grounder interface."""
+
+    def __init__(self, coords: tuple[int, int] | None = (10, 20)) -> None:
+        self.coords = coords
+        self.calls: list[tuple[str, str]] = []
+
+    async def predict_click(
+        self, *, image_b64: str, instruction: str, max_retries: int = 3
+    ) -> tuple[int, int] | None:
+        self.calls.append((image_b64[:10], instruction))
+        return self.coords
+
+
+def _png_b64() -> str:
+    # 1x1 transparent PNG base64 (valid minimal image)
+    return (
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
+        "J2n0mQAAAABJRU5ErkJggg=="
+    )
+
+
+@pytest.mark.asyncio
+async def test_click_action_grounds_and_calls_mcp() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder(coords=(123, 456))
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    blocks = await tool(
+        action="click",
+        element_description="red button",
+        screenshot_b64=_png_b64(),
+        button="left",
+    )
+
+    assert isinstance(blocks, list)
+    # Grounder called once
+    assert len(grounder.calls) == 1
+    # MCP called with resolved coordinates
+    assert client.calls == [("computer", {"action": "click", "x": 123, "y": 456, "button": "left"})]
+
+
+@pytest.mark.asyncio
+async def test_move_and_scroll_require_element_description_and_screenshot() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder(coords=(5, 6))
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    # Missing element_description
+    with pytest.raises(Exception) as ei:
+        await tool(action="move", screenshot_b64=_png_b64())
+    assert "element_description is required" in str(ei.value)
+
+    # Missing screenshot
+    with pytest.raises(Exception) as ei2:
+        await tool(action="scroll", element_description="list", scroll_y=100)
+    assert "No screenshot available" in str(ei2.value)
+
+
+@pytest.mark.asyncio
+async def test_drag_grounds_both_points_and_calls_mcp() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder(coords=(10, 20))
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    await tool(
+        action="drag",
+        start_element_description="source",
+        end_element_description="target",
+        screenshot_b64=_png_b64(),
+        button="left",
+    )
+
+    # Two grounding calls (start and end)
+    assert len(grounder.calls) == 2
+    # Drag path contains two points, same coords from fake grounder
+    name, args = client.calls[0]
+    assert name == "computer"
+    assert args["action"] == "drag"
+    assert args["button"] == "left"
+    assert args["path"] == [(10, 20), (10, 20)]
+
+
+@pytest.mark.asyncio
+async def test_drag_requires_both_descriptions_and_screenshot() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder()
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    with pytest.raises(Exception) as ei:
+        await tool(action="drag", start_element_description="a", screenshot_b64=_png_b64())
+    assert "start_element_description and end_element_description" in str(ei.value)
+
+    with pytest.raises(Exception) as ei2:
+        await tool(
+            action="drag",
+            start_element_description="a",
+            end_element_description="b",
+        )
+    assert "No screenshot available" in str(ei2.value)
+
+
+@pytest.mark.asyncio
+async def test_direct_actions_bypass_grounding_and_call_mcp() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder()
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    # Actions that bypass grounding
+    for action, extra in [
+        ("screenshot", {}),
+        ("type", {"text": "hello"}),
+        ("keypress", {"keys": ["ctrl", "a"]}),
+        ("wait", {}),
+        ("get_current_url", {}),
+        ("get_dimensions", {}),
+        ("get_environment", {}),
+    ]:
+        client.calls.clear()
+        _ = await tool(action=action, **extra)
+        assert client.calls and client.calls[0][0] == "computer"
+        assert client.calls[0][1]["action"] == action
+        # Grounder not invoked for these
+        assert grounder.calls == []
+
+
+@pytest.mark.asyncio
+async def test_unsupported_action_raises() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder()
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore
+
+    with pytest.raises(Exception) as ei:
+        await tool(action="zoom")
+    assert "Unsupported action" in str(ei.value)
+
+
+@pytest.mark.asyncio
+async def test_grounding_failure_propagates_as_error() -> None:
+    client = FakeMCPClient()
+    grounder = FakeGrounder(coords=None)
+    tool = GroundedComputerTool(grounder=grounder, mcp_client=client)  # type: ignore

+    with pytest.raises(Exception) as ei:
+        await tool(action="click", element_description="x", screenshot_b64=_png_b64())
+    assert "Could not locate element" in str(ei.value)
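The fakes above mirror how the real pieces plug together. A sketch of the production wiring (not from the diff; only the constructor keywords and call signature exercised by these tests are taken as given, and grounded_tool.py itself is not shown in this excerpt):

# Sketch only: `mcp_client` stands for any connected hud MCP client.
from hud.tools.grounding import Grounder, GrounderConfig
from hud.tools.grounding.grounded_tool import GroundedComputerTool


async def click_red_button(mcp_client, screenshot_b64: str) -> None:
    grounder = Grounder(GrounderConfig(api_base="...", api_key="...", model="..."))
    tool = GroundedComputerTool(grounder=grounder, mcp_client=mcp_client)

    # The tool grounds "red button" to (x, y) first, then forwards a concrete
    # {"action": "click", "x": ..., "y": ...} call to the "computer" MCP tool.
    blocks = await tool(
        action="click",
        element_description="red button",
        screenshot_b64=screenshot_b64,
        button="left",
    )
    print(blocks)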
hud/tools/tests/test_playwright_tool.py
CHANGED

@@ -52,7 +52,7 @@ class TestPlaywrightTool:
         assert any(isinstance(b, TextContent) for b in blocks)
         # The actual call includes wait_until parameter with a Field object
         mock_page.goto.assert_called_once()
-        args, kwargs = mock_page.goto.call_args
+        args, _kwargs = mock_page.goto.call_args
         assert args[0] == "https://example.com"
         mock_ensure.assert_called_once()

hud/tools/tests/test_tools_init.py
CHANGED

@@ -33,7 +33,7 @@ class TestToolsInit:
         """Test lazy import with invalid attribute name."""
         import hud.tools as tools_module

-        with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidTool'"):
+        with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidTool'"):
            _ = tools_module.InvalidTool

    def test_direct_imports_available(self):
hud/tools/tests/test_utils.py
CHANGED

@@ -58,7 +58,7 @@ class TestRun:
         mock_proc.communicate = AsyncMock(return_value=(b"processed", b""))

         with patch("asyncio.create_subprocess_shell", return_value=mock_proc):
-            return_code, stdout, stderr = await run("cat", input="test input")
+            return_code, stdout, _stderr = await run("cat", input="test input")

         assert return_code == 0
         assert stdout == "processed"

@@ -91,7 +91,7 @@ class TestRun:
     ):
         mock_wait_for.return_value = (b"done", b"")

-        return_code, stdout, stderr = await run("sleep 1", timeout=5.0)
+        _return_code, _stdout, _stderr = await run("sleep 1", timeout=5.0)

         # Check that wait_for was called with the correct timeout
         mock_wait_for.assert_called_once()
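For context, a sketch of the helper these tests exercise (an assumption: `run` lives in hud.tools.utils, as the test module path suggests; the tests above show it accepting `input=` and `timeout=` and returning a (return_code, stdout, stderr) triple):

# Sketch only, not from the diff.
import asyncio

from hud.tools.utils import run


async def main() -> None:
    # Shells out asynchronously; stdout comes back as a decoded string.
    return_code, stdout, _stderr = await run("echo hello", timeout=5.0)
    assert return_code == 0
    print(stdout)


asyncio.run(main())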
hud/types.py
CHANGED

@@ -29,9 +29,9 @@ class MCPToolCall(CallToolRequestParams):

     def __rich__(self) -> str:
         """Rich representation with color formatting."""
-        from hud.utils.
+        from hud.utils.hud_console import hud_console

-        return
+        return hud_console.format_tool_call(self.name, self.arguments)


 class MCPToolResult(CallToolResult):

@@ -74,10 +74,10 @@ class MCPToolResult(CallToolResult):

     def __rich__(self) -> str:
         """Rich representation with color formatting."""
-        from hud.utils.
+        from hud.utils.hud_console import hud_console

         content_summary = self._get_content_summary()
-        return
+        return hud_console.format_tool_result(content_summary, self.isError)


 class AgentResponse(BaseModel):
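Both classes now delegate their Rich rendering to hud.utils.hud_console (the module renamed from hud/utils/design.py in this release). A small sketch of what that enables, assuming only what the hunks above show:

# Sketch only: rich invokes __rich__ automatically when printing the object.
from rich import print as rich_print

from hud.types import MCPToolCall

call = MCPToolCall(name="computer", arguments={"action": "click", "x": 10, "y": 20})
rich_print(call)  # rendered via hud_console.format_tool_call(...)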
hud/utils/__init__.py
CHANGED

hud/utils/agent_factories.py
ADDED

@@ -0,0 +1,86 @@
+"""Factory functions for creating agents compatible with run_dataset."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from hud.agents.grounded_openai import GroundedOpenAIChatAgent
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.tools.grounding import GrounderConfig
+
+
+def create_openai_agent(**kwargs: Any) -> GenericOpenAIChatAgent:
+    """Factory for GenericOpenAIChatAgent with run_dataset compatibility.
+
+    Args:
+        api_key: OpenAI API key
+        base_url: Optional custom API endpoint
+        model_name: Model to use (e.g., "gpt-4o-mini")
+        **kwargs: Additional arguments passed to GenericOpenAIChatAgent
+
+    Returns:
+        Configured GenericOpenAIChatAgent instance
+
+    Example:
+        >>> from hud.datasets import run_dataset
+        >>> from hud.utils.agent_factories import create_openai_agent
+        >>> results = await run_dataset(
+        ...     "My Eval",
+        ...     "hud-evals/SheetBench-50",
+        ...     create_openai_agent,
+        ...     {"api_key": "your-key", "model_name": "gpt-4o-mini"},
+        ... )
+    """
+    api_key = kwargs.pop("api_key", None)
+    base_url = kwargs.pop("base_url", None)
+
+    openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+
+    return GenericOpenAIChatAgent(openai_client=openai_client, **kwargs)
+
+
+def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
+    """Factory for GroundedOpenAIChatAgent with run_dataset compatibility.
+
+    Args:
+        api_key: OpenAI API key for planning model
+        base_url: Optional custom API endpoint for planning model
+        model_name: Planning model to use (e.g., "gpt-4o-mini")
+        grounder_api_key: API key for grounding model
+        grounder_api_base: API base URL for grounding model (default: OpenRouter)
+        grounder_model: Grounding model to use (default: qwen/qwen-2.5-vl-7b-instruct)
+        **kwargs: Additional arguments passed to GroundedOpenAIChatAgent
+
+    Returns:
+        Configured GroundedOpenAIChatAgent instance
+
+    Example:
+        >>> from hud.datasets import run_dataset
+        >>> from hud.utils.agent_factories import create_grounded_agent
+        >>> results = await run_dataset(
+        ...     "Grounded Eval",
+        ...     dataset,
+        ...     create_grounded_agent,
+        ...     {
+        ...         "api_key": "openai-key",
+        ...         "grounder_api_key": "openrouter-key",
+        ...         "model_name": "gpt-4o-mini",
+        ...     },
+        ... )
+    """
+    api_key = kwargs.pop("api_key", None)
+    base_url = kwargs.pop("base_url", None)
+    grounder_api_key = kwargs.pop("grounder_api_key", None)
+    grounder_api_base = kwargs.pop("grounder_api_base", "https://openrouter.ai/api/v1")
+    grounder_model = kwargs.pop("grounder_model", "qwen/qwen-2.5-vl-7b-instruct")
+
+    openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+    grounder_config = GrounderConfig(
+        api_base=grounder_api_base, model=grounder_model, api_key=grounder_api_key
+    )
+
+    return GroundedOpenAIChatAgent(
+        openai_client=openai_client, grounder_config=grounder_config, **kwargs
+    )
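Putting the factory to work end to end looks roughly like this (a sketch assembled from the docstring examples above; the dataset id "hud-evals/SheetBench-50" and the key names come from those examples, the rest is illustrative):

# Sketch only, not from the diff.
import asyncio

from hud.datasets import run_dataset
from hud.utils.agent_factories import create_grounded_agent


async def main() -> None:
    results = await run_dataset(
        "Grounded Eval",
        "hud-evals/SheetBench-50",
        create_grounded_agent,
        {
            "api_key": "openai-key",
            "grounder_api_key": "openrouter-key",
            "model_name": "gpt-4o-mini",
        },
    )
    print(results)


asyncio.run(main())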