hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (63)
  1. hud/agents/base.py +37 -37
  2. hud/agents/claude.py +11 -6
  3. hud/agents/grounded_openai.py +282 -0
  4. hud/agents/misc/response_agent.py +3 -2
  5. hud/agents/openai.py +2 -2
  6. hud/agents/openai_chat_generic.py +3 -1
  7. hud/agents/tests/test_client.py +6 -1
  8. hud/agents/tests/test_grounded_openai_agent.py +155 -0
  9. hud/cli/__init__.py +34 -24
  10. hud/cli/analyze.py +27 -26
  11. hud/cli/build.py +50 -46
  12. hud/cli/debug.py +7 -7
  13. hud/cli/dev.py +107 -99
  14. hud/cli/eval.py +33 -31
  15. hud/cli/hf.py +53 -53
  16. hud/cli/init.py +28 -28
  17. hud/cli/list_func.py +22 -22
  18. hud/cli/pull.py +36 -36
  19. hud/cli/push.py +76 -74
  20. hud/cli/remove.py +42 -40
  21. hud/cli/rl/__init__.py +2 -2
  22. hud/cli/rl/init.py +41 -41
  23. hud/cli/rl/pod.py +97 -91
  24. hud/cli/rl/ssh.py +42 -40
  25. hud/cli/rl/train.py +75 -73
  26. hud/cli/rl/utils.py +10 -10
  27. hud/cli/tests/test_analyze.py +1 -1
  28. hud/cli/tests/test_analyze_metadata.py +2 -2
  29. hud/cli/tests/test_pull.py +45 -45
  30. hud/cli/tests/test_push.py +31 -29
  31. hud/cli/tests/test_registry.py +15 -15
  32. hud/cli/utils/environment.py +11 -11
  33. hud/cli/utils/interactive.py +18 -18
  34. hud/cli/utils/logging.py +12 -12
  35. hud/cli/utils/metadata.py +12 -12
  36. hud/cli/utils/registry.py +5 -5
  37. hud/cli/utils/runner.py +23 -23
  38. hud/cli/utils/server.py +16 -16
  39. hud/settings.py +6 -0
  40. hud/shared/hints.py +7 -7
  41. hud/tools/executors/tests/test_base_executor.py +1 -1
  42. hud/tools/executors/xdo.py +1 -1
  43. hud/tools/grounding/__init__.py +13 -0
  44. hud/tools/grounding/config.py +54 -0
  45. hud/tools/grounding/grounded_tool.py +314 -0
  46. hud/tools/grounding/grounder.py +302 -0
  47. hud/tools/grounding/tests/__init__.py +1 -0
  48. hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  49. hud/tools/tests/test_playwright_tool.py +1 -1
  50. hud/tools/tests/test_tools_init.py +1 -1
  51. hud/tools/tests/test_utils.py +2 -2
  52. hud/types.py +4 -4
  53. hud/utils/__init__.py +3 -3
  54. hud/utils/agent_factories.py +86 -0
  55. hud/utils/{design.py → hud_console.py} +39 -33
  56. hud/utils/pretty_errors.py +6 -6
  57. hud/utils/tests/test_version.py +1 -1
  58. hud/version.py +1 -1
  59. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/METADATA +3 -1
  60. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/RECORD +63 -54
  61. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/licenses/LICENSE +0 -0
hud/cli/utils/server.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any
7
7
 
8
8
  from fastmcp import FastMCP
9
9
 
10
- from hud.utils.design import HUDDesign
10
+ from hud.utils.hud_console import HUDConsole
11
11
 
12
12
  from .docker import generate_container_name, remove_container
13
13
 
@@ -24,7 +24,7 @@ class MCPServerManager:
24
24
  """
25
25
  self.image = image
26
26
  self.docker_args = docker_args or []
27
- self.design = HUDDesign()
27
+ self.console = HUDConsole()
28
28
  self.container_name = self._generate_container_name()
29
29
 
30
30
  def _generate_container_name(self) -> str:
@@ -155,7 +155,7 @@ class MCPServerManager:
155
155
  pass # Normal cancellation
156
156
  except Exception as e:
157
157
  if verbose:
158
- self.design.error(f"Server error: {e}")
158
+ self.console.error(f"Server error: {e}")
159
159
  raise
160
160
 
161
161
 
@@ -174,16 +174,16 @@ async def run_server_with_interactive(
174
174
  from .interactive import run_interactive_mode
175
175
  from .logging import find_free_port
176
176
 
177
- design = HUDDesign()
177
+ hud_console = HUDConsole()
178
178
 
179
179
  # Find available port
180
180
  actual_port = find_free_port(port)
181
181
  if actual_port is None:
182
- design.error(f"No available ports found starting from {port}")
182
+ hud_console.error(f"No available ports found starting from {port}")
183
183
  return
184
184
 
185
185
  if actual_port != port:
186
- design.warning(f"Port {port} in use, using port {actual_port} instead")
186
+ hud_console.warning(f"Port {port} in use, using port {actual_port} instead")
187
187
 
188
188
  # Clean up any existing container
189
189
  server_manager.cleanup_container()
@@ -198,16 +198,16 @@ async def run_server_with_interactive(
198
198
  proxy = server_manager.create_proxy(config, f"HUD Interactive - {server_manager.image}")
199
199
 
200
200
  # Show header
201
- design.info("") # Empty line
202
- design.header("HUD MCP Server - Interactive Mode", icon="🎮")
201
+ hud_console.info("") # Empty line
202
+ hud_console.header("HUD MCP Server - Interactive Mode", icon="🎮")
203
203
 
204
204
  # Show configuration
205
- design.section_title("Server Information")
206
- design.info(f"Image: {server_manager.image}")
207
- design.info(f"Port: {actual_port}")
208
- design.info(f"URL: http://localhost:{actual_port}/mcp")
209
- design.info(f"Container: {server_manager.container_name}")
210
- design.info("")
205
+ hud_console.section_title("Server Information")
206
+ hud_console.info(f"Image: {server_manager.image}")
207
+ hud_console.info(f"Port: {actual_port}")
208
+ hud_console.info(f"URL: http://localhost:{actual_port}/mcp")
209
+ hud_console.info(f"Container: {server_manager.container_name}")
210
+ hud_console.info("")
211
211
 
212
212
  # Create event to signal server is ready
213
213
  server_ready = asyncio.Event()
@@ -236,7 +236,7 @@ async def run_server_with_interactive(
236
236
  await run_interactive_mode(server_url, verbose=verbose)
237
237
 
238
238
  except KeyboardInterrupt:
239
- design.info("\n👋 Shutting down...")
239
+ hud_console.info("\n👋 Shutting down...")
240
240
  finally:
241
241
  # Cancel server task
242
242
  if server_task and not server_task.done():
@@ -244,7 +244,7 @@ async def run_server_with_interactive(
244
244
  try:
245
245
  await server_task
246
246
  except asyncio.CancelledError:
247
- design.error("Server task cancelled")
247
+ hud_console.error("Server task cancelled")
248
248
 
249
249
  # Clean up container
250
250
  server_manager.cleanup_container()
hud/settings.py CHANGED
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
44
44
  validation_alias="OPENAI_API_KEY",
45
45
  )
46
46
 
47
+ openrouter_api_key: str | None = Field(
48
+ default=None,
49
+ description="API key for OpenRouter models",
50
+ validation_alias="OPENROUTER_API_KEY",
51
+ )
52
+
47
53
  wandb_api_key: str | None = Field(
48
54
  default=None,
49
55
  description="API key for Weights & Biases",
hud/shared/hints.py CHANGED
@@ -144,9 +144,9 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
144
144
 
145
145
  try:
146
146
  if design is None:
147
- from hud.utils.design import design as default_design # lazy import
147
+ from hud.utils.hud_console import hud_console as default_design # lazy import
148
148
 
149
- design = default_design
149
+ hud_console = default_design
150
150
  except Exception:
151
151
  # If design is unavailable (non-CLI contexts), silently skip rendering
152
152
  return
@@ -155,23 +155,23 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
155
155
  try:
156
156
  # Compact rendering - skip title if same as message
157
157
  if hint.title and hint.title != hint.message:
158
- design.warning(f"{hint.title}: {hint.message}")
158
+ hud_console.warning(f"{hint.title}: {hint.message}")
159
159
  else:
160
- design.warning(hint.message)
160
+ hud_console.warning(hint.message)
161
161
 
162
162
  # Tips as bullet points
163
163
  if hint.tips:
164
164
  for tip in hint.tips:
165
- design.info(f" • {tip}")
165
+ hud_console.info(f" • {tip}")
166
166
 
167
167
  # Only show command examples if provided
168
168
  if hint.command_examples:
169
169
  for cmd in hint.command_examples:
170
- design.command_example(cmd)
170
+ hud_console.command_example(cmd)
171
171
 
172
172
  # Only show docs URL if provided
173
173
  if hint.docs_url:
174
- design.link(hint.docs_url)
174
+ hud_console.link(hint.docs_url)
175
175
  except Exception:
176
176
  logger.warning("Failed to render hint: %s", hint)
177
177
  continue
@@ -361,5 +361,5 @@ class TestLazyImports:
361
361
  """Test lazy import with invalid attribute name."""
362
362
  import hud.tools.executors as executors_module
363
363
 
364
- with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
364
+ with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
365
365
  _ = executors_module.InvalidExecutor
@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
175
175
 
176
176
  screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
177
 
178
- returncode, _, stderr = await run(screenshot_cmd)
178
+ returncode, _, _stderr = await run(screenshot_cmd)
179
179
 
180
180
  if returncode == 0 and screenshot_path.exists():
181
181
  try:
@@ -0,0 +1,13 @@
1
+ """Grounding module for visual element detection and coordinate resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .config import GrounderConfig
6
+ from .grounded_tool import GroundedComputerTool
7
+ from .grounder import Grounder
8
+
9
+ __all__ = [
10
+ "GroundedComputerTool",
11
+ "Grounder",
12
+ "GrounderConfig",
13
+ ]
@@ -0,0 +1,54 @@
1
+ """Configuration for grounding models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
# Default system prompt sent to the grounding model; it asks for a single
# "(x, y)" point, which is the shape the default ``parser_regex`` matches.
SYSTEM_PROMPT = (
    "You are a visual grounding model. Given an image and a description, "
    "return ONLY the center pixel coordinates of the described element as a "
    "single point in parentheses format: (x, y). Do not return bounding boxes "
    "or multiple coordinates."
)


@dataclass
class GrounderConfig:
    """Configuration for grounding model clients.

    Attributes:
        api_base: Base URL for the grounding model API endpoint
        model: Model identifier to use for grounding
        api_key: API key for authentication (default: "EMPTY" for local models)
        system_prompt: System prompt to guide the grounding model
        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
        parser_regex: Regular expression to parse coordinates from model output
        resize: Image resizing configuration dictionary

    Raises:
        ValueError: If ``output_format`` is not one of the supported values, if
            ``api_base`` or ``model`` is empty, or if ``parser_regex`` is not a
            valid regular expression.
    """

    api_base: str
    model: str
    api_key: str = "EMPTY"
    system_prompt: str = SYSTEM_PROMPT
    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
    parser_regex: str = r"\((\d+),\s*(\d+)\)"
    # default_factory gives every instance its own dict, so mutating one
    # config's resize settings never leaks into another.
    resize: dict[str, Any] = field(
        default_factory=lambda: {
            "enabled": True,
            "min_pixels": 3136,
            "max_pixels": 4096 * 2160,
            "factor": 28,
        }
    )

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        # Function-scope import keeps this block self-contained.
        import re

        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
            raise ValueError(f"Invalid output_format: {self.output_format}")

        if not self.api_base:
            raise ValueError("api_base is required")

        if not self.model:
            raise ValueError("model is required")

        # Fail at construction time rather than on the first model response:
        # a pattern that does not compile can never parse coordinates.
        try:
            re.compile(self.parser_regex)
        except re.error as exc:
            raise ValueError(f"Invalid parser_regex: {exc}") from exc
@@ -0,0 +1,314 @@
1
+ """Grounded computer tool that resolves element descriptions to coordinates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from mcp import ErrorData, McpError
9
+ from mcp.types import INVALID_PARAMS, ContentBlock
10
+
11
+ from hud.clients.base import AgentMCPClient # noqa: TC001
12
+ from hud.tools.grounding.grounder import Grounder # noqa: TC001
13
+ from hud.types import MCPToolCall
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class GroundedComputerTool:
19
+ """Computer tool wrapper that grounds element descriptions to coordinates.
20
+
21
+ This tool acts as a local wrapper that:
22
+ 1. Accepts natural language element descriptions from the agent
23
+ 2. Calls the environment's computer tool via MCP to take screenshots
24
+ 3. Uses a grounding model to resolve descriptions to coordinates
25
+ 4. Calls the environment's computer tool via MCP with resolved coordinates
26
+ 5. Returns the result to the agent
27
+
28
+ This allows the agent to use element descriptions while ensuring all
29
+ computer actions happen in the correct environment.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ *,
35
+ grounder: Grounder,
36
+ mcp_client: AgentMCPClient,
37
+ computer_tool_name: str = "computer",
38
+ ) -> None:
39
+ """Initialize the grounded computer tool.
40
+
41
+ Args:
42
+ grounder: Grounder instance for visual grounding
43
+ mcp_client: MCP client to call the environment's computer tool
44
+ computer_tool_name: Name of the computer tool in the environment
45
+ """
46
+ self._grounder = grounder
47
+ self._mcp_client = mcp_client
48
+ self._computer_tool_name = computer_tool_name
49
+
50
+ def get_openai_tool_schema(self) -> dict:
51
+ """Get the OpenAI tool schema for the grounded computer tool.
52
+
53
+ Returns:
54
+ Dictionary containing the tool schema in OpenAI format
55
+ """
56
+ return {
57
+ "type": "function",
58
+ "function": {
59
+ "name": "computer",
60
+ "description": (
61
+ "Control a computer by interacting with UI elements. This tool uses "
62
+ "element descriptions to locate and interact with UI elements on the "
63
+ "screen (e.g., 'red submit button', 'search text field', 'hamburger menu "
64
+ "icon', 'close button in top right corner')."
65
+ ),
66
+ "parameters": {
67
+ "type": "object",
68
+ "properties": {
69
+ "action": {
70
+ "type": "string",
71
+ "enum": [
72
+ "click",
73
+ "double_click",
74
+ "move",
75
+ "scroll",
76
+ "drag",
77
+ "type",
78
+ "keypress",
79
+ "wait",
80
+ "screenshot",
81
+ "get_current_url",
82
+ "get_dimensions",
83
+ "get_environment",
84
+ ],
85
+ "description": "The action to perform",
86
+ },
87
+ "element_description": {
88
+ "type": "string",
89
+ "description": (
90
+ "Natural language description of the element for "
91
+ "click/move/scroll actions"
92
+ ),
93
+ },
94
+ "start_element_description": {
95
+ "type": "string",
96
+ "description": "Description of the start element for drag actions",
97
+ },
98
+ "end_element_description": {
99
+ "type": "string",
100
+ "description": "Description of the end element for drag actions",
101
+ },
102
+ "text": {"type": "string", "description": "Text to type"},
103
+ "keys": {
104
+ "type": "array",
105
+ "items": {"type": "string"},
106
+ "description": "Keys to press (e.g., ['ctrl', 'a'] for Ctrl+A)",
107
+ },
108
+ "button": {
109
+ "type": "string",
110
+ "enum": ["left", "right", "middle"],
111
+ "description": "Mouse button to use",
112
+ },
113
+ "scroll_x": {"type": "integer", "description": "Horizontal scroll amount"},
114
+ "scroll_y": {"type": "integer", "description": "Vertical scroll amount"},
115
+ },
116
+ "required": ["action"],
117
+ },
118
+ },
119
+ }
120
+
121
+ async def __call__(
122
+ self,
123
+ action: str,
124
+ # Screenshot from conversation
125
+ screenshot_b64: str | None = None,
126
+ # Grounding-specific parameters
127
+ element_description: str | None = None,
128
+ start_element_description: str | None = None,
129
+ end_element_description: str | None = None,
130
+ # Pass-through parameters
131
+ text: str | None = None,
132
+ keys: list[str] | None = None,
133
+ button: str | None = None,
134
+ scroll_x: int | None = None,
135
+ scroll_y: int | None = None,
136
+ **kwargs: Any,
137
+ ) -> list[ContentBlock]:
138
+ """Execute a computer action, grounding element descriptions to coordinates first.
139
+
140
+ This method calls the environment's computer tool through MCP to ensure
141
+ actions happen in the correct environment.
142
+
143
+ Args:
144
+ action: The action to perform
145
+ element_description: Description of element for click/move/scroll actions
146
+ start_element_description: Start element for drag actions
147
+ end_element_description: End element for drag actions
148
+ text: Text to type for type actions
149
+ keys: Keys to press for keypress actions
150
+ button: Mouse button (left, right, middle)
151
+ scroll_x: Horizontal scroll amount
152
+ scroll_y: Vertical scroll amount
153
+ **kwargs: Additional arguments
154
+
155
+ Returns:
156
+ List of ContentBlocks with action results from the environment
157
+ """
158
+ try:
159
+ # For actions that don't need grounding, call environment tool directly
160
+ if action in (
161
+ "screenshot",
162
+ "type",
163
+ "keypress",
164
+ "wait",
165
+ "get_current_url",
166
+ "get_dimensions",
167
+ "get_environment",
168
+ ):
169
+ computer_args: dict[str, Any] = {"action": action}
170
+ if text is not None:
171
+ computer_args["text"] = text
172
+ if keys is not None:
173
+ computer_args["keys"] = keys
174
+
175
+ result = await self._mcp_client.call_tool(
176
+ MCPToolCall(
177
+ name=self._computer_tool_name, arguments={**computer_args, **kwargs}
178
+ )
179
+ )
180
+ return result.content
181
+
182
+ # For actions that need coordinates, we need to ground element descriptions
183
+ if action in ("click", "double_click", "move", "scroll"):
184
+ if not element_description:
185
+ raise McpError(
186
+ ErrorData(
187
+ code=INVALID_PARAMS,
188
+ message=f"element_description is required for {action} action",
189
+ )
190
+ )
191
+
192
+ if not screenshot_b64:
193
+ raise McpError(
194
+ ErrorData(
195
+ code=INVALID_PARAMS, message="No screenshot available for grounding"
196
+ )
197
+ )
198
+
199
+ # Ground the element description to coordinates
200
+ coords = await self._grounder.predict_click(
201
+ image_b64=screenshot_b64, instruction=element_description
202
+ )
203
+
204
+ if not coords:
205
+ raise McpError(
206
+ ErrorData(
207
+ code=INVALID_PARAMS,
208
+ message=(
209
+ f"Could not locate element: '{element_description}'. "
210
+ "Try a more specific description or different identifying "
211
+ "features (color, position, text, etc.)"
212
+ ),
213
+ )
214
+ )
215
+
216
+ x, y = coords
217
+
218
+ # Execute action with resolved coordinates
219
+ computer_args: dict[str, Any] = {"action": action, "x": x, "y": y}
220
+ if button:
221
+ computer_args["button"] = button
222
+ if scroll_x is not None:
223
+ computer_args["scroll_x"] = scroll_x
224
+ if scroll_y is not None:
225
+ computer_args["scroll_y"] = scroll_y
226
+
227
+ result = await self._mcp_client.call_tool(
228
+ MCPToolCall(
229
+ name=self._computer_tool_name, arguments={**computer_args, **kwargs}
230
+ )
231
+ )
232
+ return result.content
233
+
234
+ elif action == "drag":
235
+ if not start_element_description or not end_element_description:
236
+ raise McpError(
237
+ ErrorData(
238
+ code=INVALID_PARAMS,
239
+ message=(
240
+ "start_element_description and end_element_description "
241
+ "are required for drag action"
242
+ ),
243
+ )
244
+ )
245
+
246
+ if not screenshot_b64:
247
+ raise McpError(
248
+ ErrorData(
249
+ code=INVALID_PARAMS, message="No screenshot available for grounding"
250
+ )
251
+ )
252
+
253
+ # Ground both start and end points
254
+ start_coords = await self._grounder.predict_click(
255
+ image_b64=screenshot_b64, instruction=start_element_description
256
+ )
257
+
258
+ if not start_coords:
259
+ raise McpError(
260
+ ErrorData(
261
+ code=INVALID_PARAMS,
262
+ message=(
263
+ f"Could not locate start element: '{start_element_description}'. "
264
+ "Try a more specific description or different identifying features."
265
+ ),
266
+ )
267
+ )
268
+
269
+ end_coords = await self._grounder.predict_click(
270
+ image_b64=screenshot_b64, instruction=end_element_description
271
+ )
272
+
273
+ if not end_coords:
274
+ raise McpError(
275
+ ErrorData(
276
+ code=INVALID_PARAMS,
277
+ message=(
278
+ f"Could not locate end element: '{end_element_description}'. "
279
+ "Try a more specific description or different identifying features."
280
+ ),
281
+ )
282
+ )
283
+
284
+ # Execute drag with resolved coordinates
285
+ computer_args: dict[str, Any] = {
286
+ "action": "drag",
287
+ "path": [
288
+ (start_coords[0], start_coords[1]),
289
+ (end_coords[0], end_coords[1]),
290
+ ],
291
+ }
292
+ if button:
293
+ computer_args["button"] = button
294
+
295
+ result = await self._mcp_client.call_tool(
296
+ MCPToolCall(
297
+ name=self._computer_tool_name, arguments={**computer_args, **kwargs}
298
+ )
299
+ )
300
+ return result.content
301
+
302
+ else:
303
+ raise McpError(
304
+ ErrorData(code=INVALID_PARAMS, message=f"Unsupported action: {action}")
305
+ )
306
+
307
+ except McpError:
308
+ # Re-raise MCP errors
309
+ raise
310
+ except Exception as e:
311
+ logger.error("Grounded tool failed: %s", e)
312
+ raise McpError(
313
+ ErrorData(code=INVALID_PARAMS, message=f"Grounding failed: {e!s}")
314
+ ) from e