hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (54)
  1. hud/__init__.py +7 -0
  2. hud/agents/base.py +42 -10
  3. hud/agents/claude.py +24 -14
  4. hud/agents/grounded_openai.py +280 -0
  5. hud/agents/tests/test_client.py +11 -27
  6. hud/agents/tests/test_grounded_openai_agent.py +155 -0
  7. hud/cli/__init__.py +50 -20
  8. hud/cli/build.py +3 -44
  9. hud/cli/eval.py +25 -6
  10. hud/cli/init.py +4 -4
  11. hud/cli/push.py +3 -1
  12. hud/cli/tests/test_push.py +6 -6
  13. hud/cli/utils/interactive.py +1 -1
  14. hud/clients/__init__.py +3 -2
  15. hud/clients/base.py +20 -9
  16. hud/clients/mcp_use.py +44 -22
  17. hud/datasets/task.py +6 -2
  18. hud/native/__init__.py +6 -0
  19. hud/native/comparator.py +546 -0
  20. hud/native/tests/__init__.py +1 -0
  21. hud/native/tests/test_comparator.py +539 -0
  22. hud/native/tests/test_native_init.py +79 -0
  23. hud/otel/instrumentation.py +0 -2
  24. hud/server/server.py +9 -2
  25. hud/settings.py +6 -0
  26. hud/shared/exceptions.py +204 -31
  27. hud/shared/hints.py +177 -0
  28. hud/shared/requests.py +15 -3
  29. hud/shared/tests/test_exceptions.py +385 -144
  30. hud/tools/__init__.py +2 -0
  31. hud/tools/executors/tests/test_base_executor.py +1 -1
  32. hud/tools/executors/xdo.py +1 -1
  33. hud/tools/grounding/__init__.py +13 -0
  34. hud/tools/grounding/config.py +54 -0
  35. hud/tools/grounding/grounded_tool.py +314 -0
  36. hud/tools/grounding/grounder.py +301 -0
  37. hud/tools/grounding/tests/__init__.py +1 -0
  38. hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  39. hud/tools/submit.py +66 -0
  40. hud/tools/tests/test_playwright_tool.py +1 -1
  41. hud/tools/tests/test_tools_init.py +1 -1
  42. hud/tools/tests/test_utils.py +2 -2
  43. hud/types.py +33 -5
  44. hud/utils/agent_factories.py +86 -0
  45. hud/utils/design.py +57 -0
  46. hud/utils/mcp.py +6 -0
  47. hud/utils/pretty_errors.py +68 -0
  48. hud/utils/tests/test_version.py +1 -1
  49. hud/version.py +1 -1
  50. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/METADATA +2 -4
  51. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/RECORD +54 -37
  52. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
  53. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
  54. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
hud/tools/grounding/config.py
@@ -0,0 +1,54 @@
+"""Configuration for grounding models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+SYSTEM_PROMPT = (
+    "You are a visual grounding model. Given an image and a description, "
+    "return ONLY the center pixel coordinates of the described element as a "
+    "single point in parentheses format: (x, y). Do not return bounding boxes "
+    "or multiple coordinates."
+)
+
+
+@dataclass
+class GrounderConfig:
+    """Configuration for grounding model clients.
+
+    Attributes:
+        api_base: Base URL for the grounding model API endpoint
+        model: Model identifier to use for grounding
+        api_key: API key for authentication (default: "EMPTY" for local models)
+        system_prompt: System prompt to guide the grounding model
+        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
+        parser_regex: Regular expression to parse coordinates from model output
+        resize: Image resizing configuration dictionary
+    """
+
+    api_base: str
+    model: str
+    api_key: str = "EMPTY"
+    system_prompt: str = SYSTEM_PROMPT
+    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
+    parser_regex: str = r"\((\d+),\s*(\d+)\)"
+    resize: dict[str, Any] = field(
+        default_factory=lambda: {
+            "enabled": True,
+            "min_pixels": 3136,
+            "max_pixels": 4096 * 2160,
+            "factor": 28,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
+            raise ValueError(f"Invalid output_format: {self.output_format}")
+
+        if not self.api_base:
+            raise ValueError("api_base is required")
+
+        if not self.model:
+            raise ValueError("model is required")
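
The new GrounderConfig only requires api_base and model; the remaining fields have defaults and are validated in __post_init__. A minimal construction sketch follows (the endpoint URL and model name are placeholders, not values shipped in the wheel):

    from hud.tools.grounding.config import GrounderConfig

    # Hypothetical local vLLM-style endpoint and model name, for illustration only.
    config = GrounderConfig(
        api_base="http://localhost:8000/v1",
        model="my-grounding-model",
        output_format="pixels",  # "norm_0_1" and "norm_0_999" are also accepted
    )

    # An unrecognized output_format fails fast in __post_init__:
    # GrounderConfig(api_base="http://localhost:8000/v1", model="m", output_format="bbox")
    # raises ValueError("Invalid output_format: bbox")
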
hud/tools/grounding/grounded_tool.py
@@ -0,0 +1,314 @@
+"""Grounded computer tool that resolves element descriptions to coordinates."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from mcp import ErrorData, McpError
+from mcp.types import INVALID_PARAMS, ContentBlock
+
+from hud.clients.base import AgentMCPClient  # noqa: TC001
+from hud.tools.grounding.grounder import Grounder  # noqa: TC001
+from hud.types import MCPToolCall
+
+logger = logging.getLogger(__name__)
+
+
+class GroundedComputerTool:
+    """Computer tool wrapper that grounds element descriptions to coordinates.
+
+    This tool acts as a local wrapper that:
+    1. Accepts natural language element descriptions from the agent
+    2. Calls the environment's computer tool via MCP to take screenshots
+    3. Uses a grounding model to resolve descriptions to coordinates
+    4. Calls the environment's computer tool via MCP with resolved coordinates
+    5. Returns the result to the agent
+
+    This allows the agent to use element descriptions while ensuring all
+    computer actions happen in the correct environment.
+    """
+
+    def __init__(
+        self,
+        *,
+        grounder: Grounder,
+        mcp_client: AgentMCPClient,
+        computer_tool_name: str = "computer",
+    ) -> None:
+        """Initialize the grounded computer tool.
+
+        Args:
+            grounder: Grounder instance for visual grounding
+            mcp_client: MCP client to call the environment's computer tool
+            computer_tool_name: Name of the computer tool in the environment
+        """
+        self._grounder = grounder
+        self._mcp_client = mcp_client
+        self._computer_tool_name = computer_tool_name
+
+    def get_openai_tool_schema(self) -> dict:
+        """Get the OpenAI tool schema for the grounded computer tool.
+
+        Returns:
+            Dictionary containing the tool schema in OpenAI format
+        """
+        return {
+            "type": "function",
+            "function": {
+                "name": "computer",
+                "description": (
+                    "Control a computer by interacting with UI elements. This tool uses "
+                    "element descriptions to locate and interact with UI elements on the "
+                    "screen (e.g., 'red submit button', 'search text field', 'hamburger menu "
+                    "icon', 'close button in top right corner')."
+                ),
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "action": {
+                            "type": "string",
+                            "enum": [
+                                "click",
+                                "double_click",
+                                "move",
+                                "scroll",
+                                "drag",
+                                "type",
+                                "keypress",
+                                "wait",
+                                "screenshot",
+                                "get_current_url",
+                                "get_dimensions",
+                                "get_environment",
+                            ],
+                            "description": "The action to perform",
+                        },
+                        "element_description": {
+                            "type": "string",
+                            "description": (
+                                "Natural language description of the element for "
+                                "click/move/scroll actions"
+                            ),
+                        },
+                        "start_element_description": {
+                            "type": "string",
+                            "description": "Description of the start element for drag actions",
+                        },
+                        "end_element_description": {
+                            "type": "string",
+                            "description": "Description of the end element for drag actions",
+                        },
+                        "text": {"type": "string", "description": "Text to type"},
+                        "keys": {
+                            "type": "array",
+                            "items": {"type": "string"},
+                            "description": "Keys to press (e.g., ['ctrl', 'a'] for Ctrl+A)",
+                        },
+                        "button": {
+                            "type": "string",
+                            "enum": ["left", "right", "middle"],
+                            "description": "Mouse button to use",
+                        },
+                        "scroll_x": {"type": "integer", "description": "Horizontal scroll amount"},
+                        "scroll_y": {"type": "integer", "description": "Vertical scroll amount"},
+                    },
+                    "required": ["action"],
+                },
+            },
+        }
+
+    async def __call__(
+        self,
+        action: str,
+        # Screenshot from conversation
+        screenshot_b64: str | None = None,
+        # Grounding-specific parameters
+        element_description: str | None = None,
+        start_element_description: str | None = None,
+        end_element_description: str | None = None,
+        # Pass-through parameters
+        text: str | None = None,
+        keys: list[str] | None = None,
+        button: str | None = None,
+        scroll_x: int | None = None,
+        scroll_y: int | None = None,
+        **kwargs: Any,
+    ) -> list[ContentBlock]:
+        """Execute a computer action, grounding element descriptions to coordinates first.
+
+        This method calls the environment's computer tool through MCP to ensure
+        actions happen in the correct environment.
+
+        Args:
+            action: The action to perform
+            element_description: Description of element for click/move/scroll actions
+            start_element_description: Start element for drag actions
+            end_element_description: End element for drag actions
+            text: Text to type for type actions
+            keys: Keys to press for keypress actions
+            button: Mouse button (left, right, middle)
+            scroll_x: Horizontal scroll amount
+            scroll_y: Vertical scroll amount
+            **kwargs: Additional arguments
+
+        Returns:
+            List of ContentBlocks with action results from the environment
+        """
+        try:
+            # For actions that don't need grounding, call environment tool directly
+            if action in (
+                "screenshot",
+                "type",
+                "keypress",
+                "wait",
+                "get_current_url",
+                "get_dimensions",
+                "get_environment",
+            ):
+                computer_args: dict[str, Any] = {"action": action}
+                if text is not None:
+                    computer_args["text"] = text
+                if keys is not None:
+                    computer_args["keys"] = keys
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            # For actions that need coordinates, we need to ground element descriptions
+            if action in ("click", "double_click", "move", "scroll"):
+                if not element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=f"element_description is required for {action} action",
+                        )
+                    )
+
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+
+                # Ground the element description to coordinates
+                coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=element_description
+                )
+
+                if not coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate element: '{element_description}'. "
+                                "Try a more specific description or different identifying "
+                                "features (color, position, text, etc.)"
+                            ),
+                        )
+                    )
+
+                x, y = coords
+
+                # Execute action with resolved coordinates
+                computer_args: dict[str, Any] = {"action": action, "x": x, "y": y}
+                if button:
+                    computer_args["button"] = button
+                if scroll_x is not None:
+                    computer_args["scroll_x"] = scroll_x
+                if scroll_y is not None:
+                    computer_args["scroll_y"] = scroll_y
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            elif action == "drag":
+                if not start_element_description or not end_element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                "start_element_description and end_element_description "
+                                "are required for drag action"
+                            ),
+                        )
+                    )
+
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+
+                # Ground both start and end points
+                start_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=start_element_description
+                )
+
+                if not start_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate start element: '{start_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+
+                end_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=end_element_description
+                )
+
+                if not end_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate end element: '{end_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+
+                # Execute drag with resolved coordinates
+                computer_args: dict[str, Any] = {
+                    "action": "drag",
+                    "path": [
+                        (start_coords[0], start_coords[1]),
+                        (end_coords[0], end_coords[1]),
+                    ],
+                }
+                if button:
+                    computer_args["button"] = button
+
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message=f"Unsupported action: {action}")
+                )
+
+        except McpError:
+            # Re-raise MCP errors
+            raise
+        except Exception as e:
+            logger.error("Grounded tool failed: %s", e)
+            raise McpError(
+                ErrorData(code=INVALID_PARAMS, message=f"Grounding failed: {e!s}")
+            ) from e
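
GroundedComputerTool never executes actions locally; every call is forwarded to the environment's computer tool over MCP, with element descriptions grounded to coordinates first. A rough wiring sketch, assuming an already-initialized AgentMCPClient and a screenshot captured earlier in the conversation (mcp_client, screenshot_b64, and the grounder endpoint below are illustrative, not part of the package):

    from hud.tools.grounding.config import GrounderConfig
    from hud.tools.grounding.grounded_tool import GroundedComputerTool
    from hud.tools.grounding.grounder import Grounder


    async def demo(mcp_client, screenshot_b64: str) -> None:
        grounder = Grounder(GrounderConfig(api_base="http://localhost:8000/v1", model="my-grounder"))
        tool = GroundedComputerTool(grounder=grounder, mcp_client=mcp_client)

        # Schema shown to the agent (OpenAI function-calling format).
        schema = tool.get_openai_tool_schema()

        # "click" needs grounding: the description plus the screenshot are resolved
        # to (x, y), then a coordinate-based click is sent to the environment.
        blocks = await tool(
            action="click",
            element_description="blue Submit button at the bottom of the form",
            screenshot_b64=screenshot_b64,
        )

        # "type" needs no grounding and is passed straight through over MCP.
        await tool(action="type", text="hello world")
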
hud/tools/grounding/grounder.py
@@ -0,0 +1,301 @@
+"""OpenAI-based grounder for visual element detection."""
+
+from __future__ import annotations
+
+import base64
+import io
+import json
+import re
+
+from openai import AsyncOpenAI
+from opentelemetry import trace
+from PIL import Image
+
+from hud import instrument
+from hud.tools.grounding.config import GrounderConfig  # noqa: TC001
+
+
+class Grounder:
+    """Grounder that uses AsyncOpenAI to call vLLM or other model endpoints for visual grounding.
+
+    This class handles:
+    - Image resizing based on configuration
+    - API calls to grounding models via AsyncOpenAI
+    - Coordinate parsing from model outputs
+    - Coordinate format conversion (pixels, normalized)
+    """
+
+    def __init__(self, config: GrounderConfig) -> None:
+        """Initialize the grounder with configuration.
+
+        Args:
+            config: GrounderConfig with API endpoint, model, and parsing settings
+        """
+        self.config = config
+        self.client = AsyncOpenAI(api_key=config.api_key, base_url=config.api_base)
+
+    def _resize_image(self, image_b64: str) -> tuple[str, tuple[int, int], tuple[int, int]]:
+        """Resize image according to configuration.
+
+        Args:
+            image_b64: Base64-encoded image string
+
+        Returns:
+            Tuple of (processed_base64, (original_width, original_height),
+            (processed_width, processed_height))
+        """
+        # Decode image
+        image_bytes = base64.b64decode(image_b64)
+        img = Image.open(io.BytesIO(image_bytes))
+        original_size = (img.width, img.height)
+
+        if not self.config.resize["enabled"]:
+            return image_b64, original_size, original_size
+
+        # Calculate total pixels
+        total_pixels = img.width * img.height
+        min_pixels = self.config.resize["min_pixels"]
+        max_pixels = self.config.resize["max_pixels"]
+        factor = self.config.resize["factor"]
+
+        # Determine if resizing is needed
+        if total_pixels < min_pixels or total_pixels > max_pixels:
+            # Calculate scaling factor
+            if total_pixels < min_pixels:
+                scale = (min_pixels / total_pixels) ** 0.5
+            else:
+                scale = (max_pixels / total_pixels) ** 0.5
+
+            # Round dimensions to nearest factor
+            new_width = int((img.width * scale) // factor) * factor
+            new_height = int((img.height * scale) // factor) * factor
+
+            # Ensure minimum dimensions
+            new_width = max(new_width, factor)
+            new_height = max(new_height, factor)
+
+            # Resize image
+            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+            # Convert back to base64
+            buffer = io.BytesIO()
+            img.save(buffer, format="PNG")
+            resized_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+            return resized_b64, original_size, (new_width, new_height)
+
+        return image_b64, original_size, original_size
+
+    def _parse_coordinates(self, response_text: str) -> tuple[float, float] | None:
+        """Parse coordinates from model response.
+
+        Handles multiple formats:
+        - (x, y) format from configured regex
+        - [x1, y1, x2, y2] bounding box format (returns center point)
+        - [x, y] point format
+
+        Args:
+            response_text: Text output from the grounding model
+
+        Returns:
+            Tuple of (x, y) coordinates or None if parsing fails
+        """
+        # First try the configured regex pattern
+        match = re.search(self.config.parser_regex, response_text)
+        if match:
+            try:
+                x = float(match.group(1))
+                y = float(match.group(2))
+                return (x, y)
+            except (ValueError, IndexError):
+                # If parsing fails, continue to fallback strategies
+                pass
+
+        # Try to parse as a list/array format [x1, y1, x2, y2] or [x, y]
+        # Also handles (x1, y1, x2, y2)
+        # Updated pattern to handle both integers and floats
+        list_pattern = (
+            r"[\[\(](\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?)"
+            r"(?:[,\s]+(\d+(?:\.\d+)?)[,\s]+(\d+(?:\.\d+)?))?[\]\)]"
+        )
+        list_match = re.search(list_pattern, response_text)
+        if list_match:
+            x1 = float(list_match.group(1))
+            y1 = float(list_match.group(2))
+
+            # Check if it's a bounding box (4 values) or a point (2 values)
+            if list_match.group(3) and list_match.group(4):
+                # Bounding box format - return center point
+                x2 = float(list_match.group(3))
+                y2 = float(list_match.group(4))
+                center_x = (x1 + x2) / 2
+                center_y = (y1 + y2) / 2
+                return (center_x, center_y)
+            else:
+                # Point format
+                return (x1, y1)
+
+        return None
+
+    def _convert_coordinates(
+        self,
+        coords: tuple[float, float],
+        processed_size: tuple[int, int],
+        original_size: tuple[int, int],
+    ) -> tuple[int, int]:
+        """Convert coordinates based on output format configuration and scale to original size.
+
+        Args:
+            coords: Raw coordinates from model (can be float for normalized formats)
+            processed_size: Dimensions of the processed/resized image (width, height)
+            original_size: Original image dimensions (width, height)
+
+        Returns:
+            Converted coordinates in original image pixels
+        """
+        x, y = coords
+        proc_width, proc_height = processed_size
+        orig_width, orig_height = original_size
+
+        # First convert to pixels in the processed image space
+        if self.config.output_format == "pixels":
+            # Already in pixels of processed image
+            proc_x, proc_y = x, y
+        elif self.config.output_format == "norm_0_1":
+            # Convert from 0-1 normalized to pixels
+            proc_x = x * proc_width
+            proc_y = y * proc_height
+        elif self.config.output_format == "norm_0_999":
+            # Convert from 0-999 normalized to pixels
+            proc_x = x * proc_width / 999
+            proc_y = y * proc_height / 999
+        else:
+            proc_x, proc_y = x, y
+
+        # Scale from processed image coordinates to original image coordinates
+        scale_x = orig_width / proc_width
+        scale_y = orig_height / proc_height
+
+        final_x = int(proc_x * scale_x)
+        final_y = int(proc_y * scale_y)
+
+        return (final_x, final_y)
+
+    @instrument(
+        name="Grounding.predict_click",
+        span_type="agent",
+        record_args=True,
+        record_result=True,
+    )
+    async def predict_click(
+        self, *, image_b64: str, instruction: str, max_retries: int = 3
+    ) -> tuple[int, int] | None:
+        """Predict click coordinates for the given instruction on the image.
+
+        Args:
+            image_b64: Base64-encoded screenshot
+            instruction: Natural language description of the element to click
+            max_retries: Maximum number of retry attempts (default: 3)
+
+        Returns:
+            Tuple of (x, y) pixel coordinates or None if grounding fails
+        """
+
+        # Resize image once outside the retry loop
+        processed_image, original_size, processed_size = self._resize_image(image_b64)
+
+        # Build messages once
+        messages = []
+
+        # Add system prompt if configured
+        if self.config.system_prompt:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": (
+                        self.config.system_prompt
+                        + f" The image resolution is height {processed_size[1]} "
+                        + f"and width {processed_size[0]}."
+                    ),
+                }
+            )
+
+        # Add user message with image and instruction
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{processed_image}"},
+                    },
+                    {"type": "text", "text": instruction},
+                ],
+            }
+        )
+
+        # Retry loop
+        for attempt in range(max_retries):
+            try:
+                # Call the grounding model via AsyncOpenAI
+                response = await self.client.chat.completions.create(
+                    model=self.config.model,
+                    messages=messages,
+                    temperature=0.0,
+                    max_tokens=50,
+                )
+
+                # Extract response text
+                response_text = response.choices[0].message.content
+
+                # Manually record the raw response in the span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.raw_response", json.dumps(response.model_dump()))
+                    span.set_attribute("grounder.attempt", attempt + 1)
+
+                # Parse coordinates from response
+                if response_text is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                coords = self._parse_coordinates(response_text)
+                if coords is None:
+                    if attempt < max_retries - 1:
+                        continue
+                    return None
+
+                # Convert coordinates to original image pixels based on output format and scaling
+                pixel_coords = self._convert_coordinates(coords, processed_size, original_size)
+
+                # Validate coordinates are within image bounds
+                x, y = pixel_coords
+                if x < 0 or y < 0 or x >= original_size[0] or y >= original_size[1]:
+                    # Clamp to image bounds
+                    x = max(0, min(x, original_size[0] - 1))
+                    y = max(0, min(y, original_size[1] - 1))
+                    pixel_coords = (x, y)
+
+                # Record successful grounding in span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.success", True)
+                    span.set_attribute(
+                        "grounder.final_coords", f"{pixel_coords[0]},{pixel_coords[1]}"
+                    )
+                    span.set_attribute("grounder.total_attempts", attempt + 1)
+
+                return pixel_coords
+
+            except Exception:
+                if attempt < max_retries - 1:
+                    continue
+
+                # Record failure in span
+                span = trace.get_current_span()
+                if span and span.is_recording():
+                    span.set_attribute("grounder.success", False)
+                    span.set_attribute("grounder.total_attempts", max_retries)
+                    span.set_attribute("grounder.failure_reason", "All attempts exhausted")
+
+        return None
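
Grounder.predict_click wraps the whole pipeline (resize, model call, coordinate parsing, conversion back to original-image pixels) and retries up to max_retries times. A standalone usage sketch, assuming a local OpenAI-compatible grounding endpoint (the file name, URL, and model name are placeholders, not package defaults):

    import asyncio
    import base64

    from hud.tools.grounding.config import GrounderConfig
    from hud.tools.grounding.grounder import Grounder


    async def main() -> None:
        grounder = Grounder(GrounderConfig(api_base="http://localhost:8000/v1", model="my-grounder"))

        with open("screenshot.png", "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("utf-8")

        coords = await grounder.predict_click(
            image_b64=image_b64, instruction="gear-shaped settings icon in the toolbar"
        )
        # coords is (x, y) in the original screenshot's pixel space (clamped to the image
        # bounds), or None if no parsable answer came back within max_retries attempts.
        print(coords)


    asyncio.run(main())
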
hud/tools/grounding/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for grounding tools."""