hud-python 0.4.20-py3-none-any.whl → 0.4.22-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- hud/__init__.py +7 -0
- hud/agents/base.py +42 -10
- hud/agents/claude.py +24 -14
- hud/agents/grounded_openai.py +280 -0
- hud/agents/tests/test_client.py +11 -27
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/__init__.py +50 -20
- hud/cli/build.py +3 -44
- hud/cli/eval.py +25 -6
- hud/cli/init.py +4 -4
- hud/cli/push.py +3 -1
- hud/cli/tests/test_push.py +6 -6
- hud/cli/utils/interactive.py +1 -1
- hud/clients/__init__.py +3 -2
- hud/clients/base.py +20 -9
- hud/clients/mcp_use.py +44 -22
- hud/datasets/task.py +6 -2
- hud/native/__init__.py +6 -0
- hud/native/comparator.py +546 -0
- hud/native/tests/__init__.py +1 -0
- hud/native/tests/test_comparator.py +539 -0
- hud/native/tests/test_native_init.py +79 -0
- hud/otel/instrumentation.py +0 -2
- hud/server/server.py +9 -2
- hud/settings.py +6 -0
- hud/shared/exceptions.py +204 -31
- hud/shared/hints.py +177 -0
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +385 -144
- hud/tools/__init__.py +2 -0
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +301 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/submit.py +66 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/types.py +33 -5
- hud/utils/agent_factories.py +86 -0
- hud/utils/design.py +57 -0
- hud/utils/mcp.py +6 -0
- hud/utils/pretty_errors.py +68 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/METADATA +2 -4
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/RECORD +54 -37
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
hud/tools/grounding/config.py (new file)
@@ -0,0 +1,54 @@
+"""Configuration for grounding models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+SYSTEM_PROMPT = (
+    "You are a visual grounding model. Given an image and a description, "
+    "return ONLY the center pixel coordinates of the described element as a "
+    "single point in parentheses format: (x, y). Do not return bounding boxes "
+    "or multiple coordinates."
+)
+
+
+@dataclass
+class GrounderConfig:
+    """Configuration for grounding model clients.
+
+    Attributes:
+        api_base: Base URL for the grounding model API endpoint
+        model: Model identifier to use for grounding
+        api_key: API key for authentication (default: "EMPTY" for local models)
+        system_prompt: System prompt to guide the grounding model
+        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
+        parser_regex: Regular expression to parse coordinates from model output
+        resize: Image resizing configuration dictionary
+    """
+
+    api_base: str
+    model: str
+    api_key: str = "EMPTY"
+    system_prompt: str = SYSTEM_PROMPT
+    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
+    parser_regex: str = r"\((\d+),\s*(\d+)\)"
+    resize: dict[str, Any] = field(
+        default_factory=lambda: {
+            "enabled": True,
+            "min_pixels": 3136,
+            "max_pixels": 4096 * 2160,
+            "factor": 28,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
+            raise ValueError(f"Invalid output_format: {self.output_format}")
+
+        if not self.api_base:
+            raise ValueError("api_base is required")
+
+        if not self.model:
+            raise ValueError("model is required")
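For orientation: GrounderConfig only requires api_base and model; the remaining fields fall back to the defaults above, and __post_init__ rejects empty values and unknown output formats at construction time. A minimal sketch follows; the endpoint URL and model name are placeholders, not values shipped with the package:

from hud.tools.grounding.config import GrounderConfig

# Placeholder endpoint and model name for a self-hosted grounding model.
config = GrounderConfig(
    api_base="http://localhost:8000/v1",
    model="my-grounding-model",
    output_format="norm_0_1",  # this model replies with 0-1 normalized coordinates
)

# Validation happens in __post_init__, e.g.:
#   GrounderConfig(api_base="", model="m")                          raises ValueError("api_base is required")
#   GrounderConfig(api_base="x", model="m", output_format="bbox")   raises ValueError("Invalid output_format: bbox")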
hud/tools/grounding/grounded_tool.py (new file)
@@ -0,0 +1,314 @@
+"""Grounded computer tool that resolves element descriptions to coordinates."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from mcp import ErrorData, McpError
+from mcp.types import INVALID_PARAMS, ContentBlock
+
+from hud.clients.base import AgentMCPClient  # noqa: TC001
+from hud.tools.grounding.grounder import Grounder  # noqa: TC001
+from hud.types import MCPToolCall
+
+logger = logging.getLogger(__name__)
+
+
+class GroundedComputerTool:
+    """Computer tool wrapper that grounds element descriptions to coordinates.
+
+    This tool acts as a local wrapper that:
+    1. Accepts natural language element descriptions from the agent
+    2. Calls the environment's computer tool via MCP to take screenshots
+    3. Uses a grounding model to resolve descriptions to coordinates
+    4. Calls the environment's computer tool via MCP with resolved coordinates
+    5. Returns the result to the agent
+
+    This allows the agent to use element descriptions while ensuring all
+    computer actions happen in the correct environment.
+    """
+
+    def __init__(
+        self,
+        *,
+        grounder: Grounder,
+        mcp_client: AgentMCPClient,
+        computer_tool_name: str = "computer",
+    ) -> None:
+        """Initialize the grounded computer tool.
+
+        Args:
+            grounder: Grounder instance for visual grounding
+            mcp_client: MCP client to call the environment's computer tool
+            computer_tool_name: Name of the computer tool in the environment
+        """
+        self._grounder = grounder
+        self._mcp_client = mcp_client
+        self._computer_tool_name = computer_tool_name
+
+    def get_openai_tool_schema(self) -> dict:
+        """Get the OpenAI tool schema for the grounded computer tool.
+
+        Returns:
+            Dictionary containing the tool schema in OpenAI format
+        """
+        return {
+            "type": "function",
+            "function": {
+                "name": "computer",
+                "description": (
+                    "Control a computer by interacting with UI elements. This tool uses "
+                    "element descriptions to locate and interact with UI elements on the "
+                    "screen (e.g., 'red submit button', 'search text field', 'hamburger menu "
+                    "icon', 'close button in top right corner')."
+                ),
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "action": {
+                            "type": "string",
+                            "enum": [
+                                "click",
+                                "double_click",
+                                "move",
+                                "scroll",
+                                "drag",
+                                "type",
+                                "keypress",
+                                "wait",
+                                "screenshot",
+                                "get_current_url",
+                                "get_dimensions",
+                                "get_environment",
+                            ],
+                            "description": "The action to perform",
+                        },
+                        "element_description": {
+                            "type": "string",
+                            "description": (
+                                "Natural language description of the element for "
+                                "click/move/scroll actions"
+                            ),
+                        },
+                        "start_element_description": {
+                            "type": "string",
+                            "description": "Description of the start element for drag actions",
+                        },
+                        "end_element_description": {
+                            "type": "string",
+                            "description": "Description of the end element for drag actions",
+                        },
+                        "text": {"type": "string", "description": "Text to type"},
+                        "keys": {
+                            "type": "array",
+                            "items": {"type": "string"},
+                            "description": "Keys to press (e.g., ['ctrl', 'a'] for Ctrl+A)",
+                        },
+                        "button": {
+                            "type": "string",
+                            "enum": ["left", "right", "middle"],
+                            "description": "Mouse button to use",
+                        },
+                        "scroll_x": {"type": "integer", "description": "Horizontal scroll amount"},
+                        "scroll_y": {"type": "integer", "description": "Vertical scroll amount"},
+                    },
+                    "required": ["action"],
+                },
+            },
+        }
+
+    async def __call__(
+        self,
+        action: str,
+        # Screenshot from conversation
+        screenshot_b64: str | None = None,
+        # Grounding-specific parameters
+        element_description: str | None = None,
+        start_element_description: str | None = None,
+        end_element_description: str | None = None,
+        # Pass-through parameters
+        text: str | None = None,
+        keys: list[str] | None = None,
+        button: str | None = None,
+        scroll_x: int | None = None,
+        scroll_y: int | None = None,
+        **kwargs: Any,
+    ) -> list[ContentBlock]:
+        """Execute a computer action, grounding element descriptions to coordinates first.
+
+        This method calls the environment's computer tool through MCP to ensure
+        actions happen in the correct environment.
+
+        Args:
+            action: The action to perform
+            element_description: Description of element for click/move/scroll actions
+            start_element_description: Start element for drag actions
+            end_element_description: End element for drag actions
+            text: Text to type for type actions
+            keys: Keys to press for keypress actions
+            button: Mouse button (left, right, middle)
+            scroll_x: Horizontal scroll amount
+            scroll_y: Vertical scroll amount
+            **kwargs: Additional arguments
+
+        Returns:
+            List of ContentBlocks with action results from the environment
+        """
+        try:
+            # For actions that don't need grounding, call environment tool directly
+            if action in (
+                "screenshot",
+                "type",
+                "keypress",
+                "wait",
+                "get_current_url",
+                "get_dimensions",
+                "get_environment",
+            ):
+                computer_args: dict[str, Any] = {"action": action}
+                if text is not None:
+                    computer_args["text"] = text
+                if keys is not None:
+                    computer_args["keys"] = keys
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            # For actions that need coordinates, we need to ground element descriptions
+            if action in ("click", "double_click", "move", "scroll"):
+                if not element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=f"element_description is required for {action} action",
+                        )
+                    )
+
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+
+                # Ground the element description to coordinates
+                coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=element_description
+                )
+
+                if not coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate element: '{element_description}'. "
+                                "Try a more specific description or different identifying "
+                                "features (color, position, text, etc.)"
+                            ),
+                        )
+                    )
+
+                x, y = coords
+
+                # Execute action with resolved coordinates
+                computer_args: dict[str, Any] = {"action": action, "x": x, "y": y}
+                if button:
+                    computer_args["button"] = button
+                if scroll_x is not None:
+                    computer_args["scroll_x"] = scroll_x
+                if scroll_y is not None:
+                    computer_args["scroll_y"] = scroll_y
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            elif action == "drag":
+                if not start_element_description or not end_element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                "start_element_description and end_element_description "
+                                "are required for drag action"
+                            ),
+                        )
+                    )
+
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+
+                # Ground both start and end points
+                start_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=start_element_description
+                )
+
+                if not start_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate start element: '{start_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+
+                end_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=end_element_description
+                )
+
+                if not end_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate end element: '{end_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+
+                # Execute drag with resolved coordinates
+                computer_args: dict[str, Any] = {
+                    "action": "drag",
+                    "path": [
+                        (start_coords[0], start_coords[1]),
+                        (end_coords[0], end_coords[1]),
+                    ],
+                }
+                if button:
+                    computer_args["button"] = button
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message=f"Unsupported action: {action}")
+                )
+
+        except McpError:
+            # Re-raise MCP errors
+            raise
+        except Exception as e:
+            logger.error("Grounded tool failed: %s", e)
+            raise McpError(
+                ErrorData(code=INVALID_PARAMS, message=f"Grounding failed: {e!s}")
+            ) from e
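To make the control flow above concrete, here is a rough usage sketch. It assumes an already-connected AgentMCPClient (construction elided), a Grounder built from a GrounderConfig, and a base64 screenshot taken from the conversation; the endpoint, model name, and the click_save helper are illustrative, not part of the package:

from hud.tools.grounding.config import GrounderConfig
from hud.tools.grounding.grounded_tool import GroundedComputerTool
from hud.tools.grounding.grounder import Grounder

async def click_save(mcp_client, screenshot_b64: str):
    # mcp_client: a connected AgentMCPClient; screenshot_b64: base64 PNG of the latest screenshot.
    # Placeholder endpoint and model name -- substitute your own grounding deployment.
    config = GrounderConfig(api_base="http://localhost:8000/v1", model="my-grounding-model")
    tool = GroundedComputerTool(
        grounder=Grounder(config),
        mcp_client=mcp_client,
        computer_tool_name="computer",  # name of the environment's computer tool
    )
    # The description is grounded to (x, y), then the environment's computer tool
    # is called over MCP with the resolved coordinates; ContentBlocks come back.
    return await tool(
        action="click",
        element_description="blue 'Save' button in the toolbar",
        screenshot_b64=screenshot_b64,
    )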
hud/tools/grounding/grounder.py (new file)
@@ -0,0 +1,301 @@
+"""OpenAI-based grounder for visual element detection."""
+
+from __future__ import annotations
+
+import base64
+import io
+import json
+import re
+
+from openai import AsyncOpenAI
+from opentelemetry import trace
+from PIL import Image
+
+from hud import instrument
+from hud.tools.grounding.config import GrounderConfig  # noqa: TC001
+
+
+class Grounder:
+    """Grounder that uses AsyncOpenAI to call vLLM or other model endpoints for visual grounding.
+
+    This class handles:
+    - Image resizing based on configuration
+    - API calls to grounding models via AsyncOpenAI
+    - Coordinate parsing from model outputs
+    - Coordinate format conversion (pixels, normalized)
+    """
+
+    def __init__(self, config: GrounderConfig) -> None:
+        """Initialize the grounder with configuration.
+
+        Args:
+            config: GrounderConfig with API endpoint, model, and parsing settings
+        """
+        self.config = config
+        self.client = AsyncOpenAI(api_key=config.api_key, base_url=config.api_base)
+
+    def _resize_image(self, image_b64: str) -> tuple[str, tuple[int, int], tuple[int, int]]:
+        """Resize image according to configuration.
+
+        Args:
+            image_b64: Base64-encoded image string
+
+        Returns:
+            Tuple of (processed_base64, (original_width, original_height),
+            (processed_width, processed_height))
+        """
+        # Decode image
+        image_bytes = base64.b64decode(image_b64)
+        img = Image.open(io.BytesIO(image_bytes))
+        original_size = (img.width, img.height)
+
+        if not self.config.resize["enabled"]:
+            return image_b64, original_size, original_size
+
+        # Calculate total pixels
+        total_pixels = img.width * img.height
+        min_pixels = self.config.resize["min_pixels"]
+        max_pixels = self.config.resize["max_pixels"]
+        factor = self.config.resize["factor"]
+
+        # Determine if resizing is needed
+        if total_pixels < min_pixels or total_pixels > max_pixels:
+            # Calculate scaling factor
+            if total_pixels < min_pixels:
+                scale = (min_pixels / total_pixels) ** 0.5
+            else:
+                scale = (max_pixels / total_pixels) ** 0.5
+
+            # Round dimensions to nearest factor
+            new_width = int((img.width * scale) // factor) * factor
+            new_height = int((img.height * scale) // factor) * factor
+
+            # Ensure minimum dimensions
+            new_width = max(new_width, factor)
+            new_height = max(new_height, factor)
+
+            # Resize image
+            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+            # Convert back to base64
+            buffer = io.BytesIO()
+            img.save(buffer, format="PNG")
+            resized_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+            return resized_b64, original_size, (new_width, new_height)
+
+        return image_b64, original_size, original_size
+
+    def _parse_coordinates(self, response_text: str) -> tuple[float, float] | None:
+        """Parse coordinates from model response.
+
+        Handles multiple formats:
+        - (x, y) format from configured regex
+        - [x1, y1, x2, y2] bounding box format (returns center point)
+        - [x, y] point format
+
+        Args:
+            response_text: Text output from the grounding model
+
+        Returns:
+            Tuple of (x, y) coordinates or None if parsing fails
+        """
+        # First try the configured regex pattern
+        match = re.search(self.config.parser_regex, response_text)
+        if match:
+            try:
+                x = float(match.group(1))
+                y = float(match.group(2))
+                return (x, y)
+            except (ValueError, IndexError):
+                # If parsing fails, continue to fallback strategies
+                pass
+
+        # Try to parse as a list/array format [x1, y1, x2, y2] or [x, y]
+        # Also handles (x1, y1, x2, y2)
+        # Updated pattern to handle both integers and floats
+        list_pattern = (
+            r"[\[\(](\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?)"
+            r"(?:[,\s]+(\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?))?[\]\)]"
+        )
+        list_match = re.search(list_pattern, response_text)
+        if list_match:
+            x1 = float(list_match.group(1))
+            y1 = float(list_match.group(2))
+
+            # Check if it's a bounding box (4 values) or a point (2 values)
+            if list_match.group(3) and list_match.group(4):
+                # Bounding box format - return center point
+                x2 = float(list_match.group(3))
+                y2 = float(list_match.group(4))
+                center_x = (x1 + x2) / 2
+                center_y = (y1 + y2) / 2
+                return (center_x, center_y)
+            else:
+                # Point format
+                return (x1, y1)
+
+        return None
+
+    def _convert_coordinates(
+        self,
+        coords: tuple[float, float],
+        processed_size: tuple[int, int],
+        original_size: tuple[int, int],
+    ) -> tuple[int, int]:
+        """Convert coordinates based on output format configuration and scale to original size.
+
+        Args:
+            coords: Raw coordinates from model (can be float for normalized formats)
+            processed_size: Dimensions of the processed/resized image (width, height)
+            original_size: Original image dimensions (width, height)
+
+        Returns:
+            Converted coordinates in original image pixels
+        """
+        x, y = coords
+        proc_width, proc_height = processed_size
+        orig_width, orig_height = original_size
+
+        # First convert to pixels in the processed image space
+        if self.config.output_format == "pixels":
+            # Already in pixels of processed image
+            proc_x, proc_y = x, y
+        elif self.config.output_format == "norm_0_1":
+            # Convert from 0-1 normalized to pixels
+            proc_x = x * proc_width
+            proc_y = y * proc_height
+        elif self.config.output_format == "norm_0_999":
+            # Convert from 0-999 normalized to pixels
+            proc_x = x * proc_width / 999
+            proc_y = y * proc_height / 999
+        else:
+            proc_x, proc_y = x, y
+
+        # Scale from processed image coordinates to original image coordinates
+        scale_x = orig_width / proc_width
+        scale_y = orig_height / proc_height
+
+        final_x = int(proc_x * scale_x)
+        final_y = int(proc_y * scale_y)
+
+        return (final_x, final_y)
+
+    @instrument(
+        name="Grounding.predict_click",
+        span_type="agent",
+        record_args=True,
+        record_result=True,
+    )
+    async def predict_click(
+        self, *, image_b64: str, instruction: str, max_retries: int = 3
+    ) -> tuple[int, int] | None:
+        """Predict click coordinates for the given instruction on the image.
+
+        Args:
+            image_b64: Base64-encoded screenshot
+            instruction: Natural language description of the element to click
+            max_retries: Maximum number of retry attempts (default: 3)
+
+        Returns:
+            Tuple of (x, y) pixel coordinates or None if grounding fails
+        """
+
+        # Resize image once outside the retry loop
+        processed_image, original_size, processed_size = self._resize_image(image_b64)
+
+        # Build messages once
+        messages = []
+
+        # Add system prompt if configured
+        if self.config.system_prompt:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": (
+                        self.config.system_prompt
+                        + f" The image resolution is height {processed_size[1]} "
+                        + f"and width {processed_size[0]}."
+                    ),
+                }
+            )
+
+        # Add user message with image and instruction
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{processed_image}"},
+                    },
+                    {"type": "text", "text": instruction},
+                ],
+            }
+        )
+
+        # Retry loop
+        for attempt in range(max_retries):
+            try:
+                # Call the grounding model via AsyncOpenAI
+                response = await self.client.chat.completions.create(
+                    model=self.config.model,
+                    messages=messages,
+                    temperature=0.0,
+                    max_tokens=50,
+                )
+
+                # Extract response text
+                response_text = response.choices[0].message.content
+
+                # Manually record the raw response in the span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.raw_response", json.dumps(response.model_dump()))
+                    span.set_attribute("grounder.attempt", attempt + 1)
+
+                # Parse coordinates from response
+                if response_text is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                coords = self._parse_coordinates(response_text)
+                if coords is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                # Convert coordinates to original image pixels based on output format and scaling
+                pixel_coords = self._convert_coordinates(coords, processed_size, original_size)
+
+                # Validate coordinates are within image bounds
+                x, y = pixel_coords
+                if x < 0 or y < 0 or x >= original_size[0] or y >= original_size[1]:
+                    # Clamp to image bounds
+                    x = max(0, min(x, original_size[0] - 1))
+                    y = max(0, min(y, original_size[1] - 1))
+                    pixel_coords = (x, y)
+
+                # Record successful grounding in span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.success", True)
+                    span.set_attribute(
+                        "grounder.final_coords", f"{pixel_coords[0]},{pixel_coords[1]}"
+                    )
+                    span.set_attribute("grounder.total_attempts", attempt + 1)
+
+                return pixel_coords
+
+            except Exception:
+                if attempt < max_retries - 1:
+                    continue
+
+                # Record failure in span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.success", False)
+                    span.set_attribute("grounder.total_attempts", max_retries)
+                    span.set_attribute("grounder.failure_reason", "All attempts exhausted")
+
+        return None
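The path through Grounder.predict_click is: resize the screenshot into the configured pixel budget, prompt the model with the image and instruction, parse either an "(x, y)" point or a "[x1, y1, x2, y2]" box (reduced to its center) from the reply, convert to pixels of the original image, and clamp to the image bounds, retrying up to max_retries times. A hedged sketch of calling it directly; the endpoint, model name, and the locate wrapper are placeholders, not part of the package:

from hud.tools.grounding.config import GrounderConfig
from hud.tools.grounding.grounder import Grounder

async def locate(screenshot_b64: str) -> tuple[int, int] | None:
    # Placeholder endpoint and model; point these at your own grounding deployment.
    config = GrounderConfig(api_base="http://localhost:8000/v1", model="my-grounding-model")
    grounder = Grounder(config)
    # Returns pixel coordinates in the original screenshot, or None after
    # max_retries failed attempts (empty reply, unparseable output, or API errors).
    return await grounder.predict_click(
        image_b64=screenshot_b64,
        instruction="search text field at the top of the page",
    )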
hud/tools/grounding/tests/__init__.py (new file)
@@ -0,0 +1 @@
+"""Tests for grounding tools."""