hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (63)
  1. hud/agents/base.py +37 -37
  2. hud/agents/claude.py +11 -6
  3. hud/agents/grounded_openai.py +282 -0
  4. hud/agents/misc/response_agent.py +3 -2
  5. hud/agents/openai.py +2 -2
  6. hud/agents/openai_chat_generic.py +3 -1
  7. hud/agents/tests/test_client.py +6 -1
  8. hud/agents/tests/test_grounded_openai_agent.py +155 -0
  9. hud/cli/__init__.py +34 -24
  10. hud/cli/analyze.py +27 -26
  11. hud/cli/build.py +50 -46
  12. hud/cli/debug.py +7 -7
  13. hud/cli/dev.py +107 -99
  14. hud/cli/eval.py +33 -31
  15. hud/cli/hf.py +53 -53
  16. hud/cli/init.py +28 -28
  17. hud/cli/list_func.py +22 -22
  18. hud/cli/pull.py +36 -36
  19. hud/cli/push.py +76 -74
  20. hud/cli/remove.py +42 -40
  21. hud/cli/rl/__init__.py +2 -2
  22. hud/cli/rl/init.py +41 -41
  23. hud/cli/rl/pod.py +97 -91
  24. hud/cli/rl/ssh.py +42 -40
  25. hud/cli/rl/train.py +75 -73
  26. hud/cli/rl/utils.py +10 -10
  27. hud/cli/tests/test_analyze.py +1 -1
  28. hud/cli/tests/test_analyze_metadata.py +2 -2
  29. hud/cli/tests/test_pull.py +45 -45
  30. hud/cli/tests/test_push.py +31 -29
  31. hud/cli/tests/test_registry.py +15 -15
  32. hud/cli/utils/environment.py +11 -11
  33. hud/cli/utils/interactive.py +18 -18
  34. hud/cli/utils/logging.py +12 -12
  35. hud/cli/utils/metadata.py +12 -12
  36. hud/cli/utils/registry.py +5 -5
  37. hud/cli/utils/runner.py +23 -23
  38. hud/cli/utils/server.py +16 -16
  39. hud/settings.py +6 -0
  40. hud/shared/hints.py +7 -7
  41. hud/tools/executors/tests/test_base_executor.py +1 -1
  42. hud/tools/executors/xdo.py +1 -1
  43. hud/tools/grounding/__init__.py +13 -0
  44. hud/tools/grounding/config.py +54 -0
  45. hud/tools/grounding/grounded_tool.py +314 -0
  46. hud/tools/grounding/grounder.py +302 -0
  47. hud/tools/grounding/tests/__init__.py +1 -0
  48. hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  49. hud/tools/tests/test_playwright_tool.py +1 -1
  50. hud/tools/tests/test_tools_init.py +1 -1
  51. hud/tools/tests/test_utils.py +2 -2
  52. hud/types.py +4 -4
  53. hud/utils/__init__.py +3 -3
  54. hud/utils/agent_factories.py +86 -0
  55. hud/utils/{design.py → hud_console.py} +39 -33
  56. hud/utils/pretty_errors.py +6 -6
  57. hud/utils/tests/test_version.py +1 -1
  58. hud/version.py +1 -1
  59. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/METADATA +3 -1
  60. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/RECORD +63 -54
  61. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py CHANGED
@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
11
11
  import mcp.types as types
12
12
 
13
13
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
14
- from hud.utils.design import HUDDesign
14
+ from hud.utils.hud_console import HUDConsole
15
15
  from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
16
16
 
17
17
  if TYPE_CHECKING:
@@ -37,7 +37,7 @@ class MCPAgent(ABC):
37
37
  and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
38
38
  - Messaging: system prompt handling, optional inclusion of setup output on
39
39
  the first turn, and control over initial screenshots.
40
- - Telemetry & UX: standardized logging/printing via `HUDDesign` and optional
40
+ - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
41
41
  automatic tracing (`auto_trace`).
42
42
 
43
43
  Subclasses implement provider-specific formatting and response fetching
@@ -92,11 +92,11 @@ class MCPAgent(ABC):
92
92
  self._auto_created_client = False # Track if we created the client
93
93
 
94
94
  self.model_name = model_name
95
- self.design = HUDDesign(logger=logger)
95
+ self.console = HUDConsole(logger=logger)
96
96
 
97
97
  # Set verbose mode if requested
98
98
  if verbose:
99
- self.design.set_verbose(True)
99
+ self.console.set_verbose(True)
100
100
 
101
101
  # Filtering
102
102
  self.allowed_tools = allowed_tools
@@ -131,7 +131,7 @@ class MCPAgent(ABC):
131
131
 
132
132
  self.mcp_client = MCPClient(mcp_config=task.mcp_config)
133
133
  self._auto_created_client = True
134
- self.design.info_log("Auto-created MCPClient from task.mcp_config")
134
+ self.console.info_log("Auto-created MCPClient from task.mcp_config")
135
135
 
136
136
  # Ensure we have a client
137
137
  if self.mcp_client is None:
@@ -168,7 +168,7 @@ class MCPAgent(ABC):
168
168
  await self._filter_tools()
169
169
 
170
170
  num_tools = len(self._available_tools)
171
- self.design.success_log(
171
+ self.console.success_log(
172
172
  f"Agent initialized with {num_tools} available tools (after filtering)"
173
173
  )
174
174
 
@@ -243,7 +243,7 @@ class MCPAgent(ABC):
243
243
 
244
244
  # Execute the setup tool and append the initial observation to the context
245
245
  if task.setup_tool is not None:
246
- self.design.progress_log(f"Setting up tool phase: {task.setup_tool}")
246
+ self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
247
247
  results = await self.call_tools(task.setup_tool)
248
248
  if any(result.isError for result in results):
249
249
  raise RuntimeError(f"{results}")
@@ -257,7 +257,7 @@ class MCPAgent(ABC):
257
257
  prompt_result = await self._run_context(start_context, max_steps=max_steps)
258
258
 
259
259
  except Exception as e:
260
- self.design.error_log(f"Task execution failed: {e}")
260
+ self.console.error_log(f"Task execution failed: {e}")
261
261
  # Create an error result but don't return yet - we still want to evaluate
262
262
  prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
263
263
  prompt_result.populate_from_context()
@@ -265,7 +265,7 @@ class MCPAgent(ABC):
265
265
  # Always evaluate if we have a prompt result and evaluate tool
266
266
  if prompt_result is not None and task.evaluate_tool is not None:
267
267
  try:
268
- self.design.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
268
+ self.console.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
269
269
  results = await self.call_tools(task.evaluate_tool)
270
270
 
271
271
  if any(result.isError for result in results):
@@ -288,7 +288,7 @@ class MCPAgent(ABC):
288
288
  prompt_result.content = eval_content
289
289
 
290
290
  except Exception as e:
291
- self.design.error_log(f"Evaluation phase failed: {e}")
291
+ self.console.error_log(f"Evaluation phase failed: {e}")
292
292
  # Continue with the prompt result even if evaluation failed
293
293
 
294
294
  return (
@@ -319,21 +319,21 @@ class MCPAgent(ABC):
319
319
 
320
320
  # Add initial context
321
321
  messages.extend(await self.format_message(context))
322
- self.design.debug(f"Messages: {messages}")
322
+ self.console.debug(f"Messages: {messages}")
323
323
 
324
324
  step_count = 0
325
325
  while max_steps == -1 or step_count < max_steps:
326
326
  step_count += 1
327
327
  if max_steps == -1:
328
- self.design.debug(f"Step {step_count} (unlimited)")
328
+ self.console.debug(f"Step {step_count} (unlimited)")
329
329
  else:
330
- self.design.debug(f"Step {step_count}/{max_steps}")
330
+ self.console.debug(f"Step {step_count}/{max_steps}")
331
331
 
332
332
  try:
333
333
  # 1. Get model response
334
334
  response = await self.get_response(messages)
335
335
 
336
- self.design.debug(f"Agent:\n{response}")
336
+ self.console.debug(f"Agent:\n{response}")
337
337
 
338
338
  # Check if we should stop
339
339
  if response.done or not response.tool_calls:
@@ -345,16 +345,16 @@ class MCPAgent(ABC):
345
345
  response.content
346
346
  )
347
347
  except Exception as e:
348
- self.design.warning_log(f"ResponseAgent failed: {e}")
348
+ self.console.warning_log(f"ResponseAgent failed: {e}")
349
349
  if decision == "STOP":
350
350
  # Try to submit response through lifecycle tool
351
351
  await self._maybe_submit_response(response, messages)
352
352
 
353
- self.design.debug("Stopping execution")
353
+ self.console.debug("Stopping execution")
354
354
  final_response = response
355
355
  break
356
356
  else:
357
- self.design.debug("Continuing execution")
357
+ self.console.debug("Continuing execution")
358
358
  messages.extend(await self.format_message(decision))
359
359
  continue
360
360
 
@@ -376,21 +376,21 @@ class MCPAgent(ABC):
376
376
  for call, result in zip(tool_calls, tool_results, strict=False):
377
377
  step_info += f"\n{call}\n{result}"
378
378
 
379
- self.design.info_log(step_info)
379
+ self.console.info_log(step_info)
380
380
 
381
381
  except Exception as e:
382
- self.design.error_log(f"Step failed: {e}")
382
+ self.console.error_log(f"Step failed: {e}")
383
383
  error = str(e)
384
384
  break
385
385
 
386
386
  except KeyboardInterrupt:
387
- self.design.warning_log("Agent execution interrupted by user")
387
+ self.console.warning_log("Agent execution interrupted by user")
388
388
  error = "Interrupted by user"
389
389
  except asyncio.CancelledError:
390
- self.design.warning_log("Agent execution cancelled")
390
+ self.console.warning_log("Agent execution cancelled")
391
391
  error = "Cancelled"
392
392
  except Exception as e:
393
- self.design.error_log(f"Unexpected error: {e}")
393
+ self.console.error_log(f"Unexpected error: {e}")
394
394
  error = str(e)
395
395
 
396
396
  # Build result
@@ -431,17 +431,17 @@ class MCPAgent(ABC):
431
431
  results: list[MCPToolResult] = []
432
432
  for tc in tool_call:
433
433
  try:
434
- self.design.debug(f"Calling tool: {tc}")
434
+ self.console.debug(f"Calling tool: {tc}")
435
435
  results.append(await self.mcp_client.call_tool(tc))
436
436
  except TimeoutError as e:
437
- self.design.error_log(f"Tool execution timed out: {e}")
437
+ self.console.error_log(f"Tool execution timed out: {e}")
438
438
  try:
439
439
  await self.mcp_client.shutdown()
440
440
  except Exception as close_err:
441
- self.design.debug(f"Failed to close MCP client cleanly: {close_err}")
441
+ self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
442
442
  raise
443
443
  except Exception as e:
444
- self.design.error_log(f"Tool execution failed: {e}")
444
+ self.console.error_log(f"Tool execution failed: {e}")
445
445
  results.append(_format_error_result(str(e)))
446
446
  return results
447
447
 
@@ -573,7 +573,7 @@ class MCPAgent(ABC):
573
573
 
574
574
  # Add to lifecycle tools if found
575
575
  if response_tool_name and response_tool_name not in self.lifecycle_tools:
576
- self.design.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
576
+ self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
577
577
  self.response_tool_name = response_tool_name
578
578
  self.lifecycle_tools.append(response_tool_name)
579
579
 
@@ -597,7 +597,7 @@ class MCPAgent(ABC):
597
597
  messages: The current message history (will be modified in-place)
598
598
  """
599
599
  if self.response_tool_name:
600
- self.design.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
600
+ self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
601
601
  try:
602
602
  # Call the response tool with the agent's response
603
603
  response_tool_call = MCPToolCall(
@@ -612,9 +612,9 @@ class MCPAgent(ABC):
612
612
  messages.extend(response_messages)
613
613
 
614
614
  # Mark the task as done
615
- self.design.debug("Response lifecycle tool executed, marking task as done")
615
+ self.console.debug("Response lifecycle tool executed, marking task as done")
616
616
  except Exception as e:
617
- self.design.error_log(f"Response lifecycle tool failed: {e}")
617
+ self.console.error_log(f"Response lifecycle tool failed: {e}")
618
618
 
619
619
  async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
620
620
  """Inject metadata into the metadata of the initialize request."""
@@ -668,9 +668,9 @@ class MCPAgent(ABC):
668
668
  if self._auto_trace_cm:
669
669
  try:
670
670
  self._auto_trace_cm.__exit__(None, None, None)
671
- self.design.debug("Closed auto-created trace")
671
+ self.console.debug("Closed auto-created trace")
672
672
  except Exception as e:
673
- self.design.warning_log(f"Failed to close auto-created trace: {e}")
673
+ self.console.warning_log(f"Failed to close auto-created trace: {e}")
674
674
  finally:
675
675
  self._auto_trace_cm = None
676
676
 
@@ -678,9 +678,9 @@ class MCPAgent(ABC):
678
678
  if self._auto_created_client and self.mcp_client:
679
679
  try:
680
680
  await self.mcp_client.shutdown()
681
- self.design.debug("Closed auto-created MCPClient")
681
+ self.console.debug("Closed auto-created MCPClient")
682
682
  except Exception as e:
683
- self.design.warning_log(f"Failed to close auto-created client: {e}")
683
+ self.console.warning_log(f"Failed to close auto-created client: {e}")
684
684
  finally:
685
685
  self.mcp_client = None
686
686
  self._auto_created_client = False
@@ -713,13 +713,13 @@ class MCPAgent(ABC):
713
713
  if self._is_connection_error(e):
714
714
  msg = self._get_connection_error_message(e)
715
715
  # Always show connection errors, not just when logging is enabled
716
- self.design.error(f"❌ {msg}")
717
- self.design.info("💡 Make sure the MCP server is started before running the agent.")
716
+ self.console.error(f"❌ {msg}")
717
+ self.console.info("💡 Make sure the MCP server is started before running the agent.")
718
718
 
719
719
  # For localhost, provide specific instructions
720
720
  error_str = str(e).lower()
721
721
  if "localhost" in error_str or "127.0.0.1" in error_str:
722
- self.design.info(" Run 'hud dev' in another terminal to start the MCP server")
722
+ self.console.info(" Run 'hud dev' in another terminal to start the MCP server")
723
723
 
724
724
  raise RuntimeError(msg) from e
725
725
  raise
hud/agents/claude.py CHANGED
@@ -364,16 +364,21 @@ class ClaudeAgent(MCPAgent):
364
364
  messages_cached = copy.deepcopy(messages)
365
365
 
366
366
  # Mark last user message with cache control
367
- if messages_cached and messages_cached[-1].get("role") == "user":
367
+ if (
368
+ messages_cached
369
+ and isinstance(messages_cached[-1], dict)
370
+ and messages_cached[-1].get("role") == "user"
371
+ ):
368
372
  last_content = messages_cached[-1]["content"]
369
373
  # Content is formatted to be list of ContentBlock in format_blocks and format_message
370
374
  if isinstance(last_content, list):
371
375
  for block in last_content:
372
- # Only add cache control to block types that support it
373
- block_type = block.get("type")
374
- if block_type in ["text", "image", "tool_use", "tool_result"]:
375
- cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
376
- block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
376
+ # Only add cache control to dict-like block types that support it
377
+ if isinstance(block, dict):
378
+ block_type = block.get("type")
379
+ if block_type in ["text", "image", "tool_use", "tool_result"]:
380
+ cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
381
+ block["cache_control"] = cache_control # type: ignore[reportGeneralTypeIssues]
377
382
 
378
383
  return messages_cached
379
384
 
hud/agents/grounded_openai.py ADDED
@@ -0,0 +1,282 @@
1
+ """Grounded OpenAI agent that separates visual grounding from reasoning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any, ClassVar
7
+
8
+ from hud import instrument
9
+ from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
10
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult
11
+
12
+ from .openai_chat_generic import GenericOpenAIChatAgent
13
+
14
+
15
+ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
16
+ """OpenAI agent that uses a separate grounding model for element detection.
17
+
18
+ This agent:
19
+ - Exposes only a synthetic "computer" tool to the planning model
20
+ - Intercepts tool calls to ground element descriptions to coordinates
21
+ - Converts grounded results to real computer tool calls
22
+ - Maintains screenshot state for grounding operations
23
+
24
+ The architecture separates concerns:
25
+ - Planning model (GPT-4o etc) focuses on high-level reasoning
26
+ - Grounding model (Qwen2-VL etc) handles visual element detection
27
+ """
28
+
29
+ metadata: ClassVar[dict[str, Any]] = {}
30
+
31
+ def __init__(
32
+ self,
33
+ *,
34
+ grounder_config: GrounderConfig,
35
+ model_name: str = "gpt-4o-mini",
36
+ allowed_tools: list[str] | None = None,
37
+ append_setup_output: bool = False,
38
+ system_prompt: str | None = None,
39
+ **kwargs: Any,
40
+ ) -> None:
41
+ """Initialize the grounded OpenAI agent.
42
+
43
+ Args:
44
+ grounder_config: Configuration for the grounding model
45
+ openai_client: OpenAI client for the planning model
46
+ model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
47
+ real_computer_tool_name: Name of the actual computer tool to execute
48
+ **kwargs: Additional arguments passed to GenericOpenAIChatAgent
49
+ """
50
+ # Set defaults for grounded agent
51
+ if allowed_tools is None:
52
+ allowed_tools = ["computer"]
53
+
54
+ if system_prompt is None:
55
+ system_prompt = (
56
+ "You are a helpful AI assistant that can control the computer "
57
+ "through visual interaction.\n\n"
58
+ "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
59
+ "1. First, describe what you see on the screen\n"
60
+ "2. Explain what you plan to do and why\n"
61
+ "3. Then use the computer tool with natural language descriptions\n\n"
62
+ "For example:\n"
63
+ "- 'I can see a login form with username and password fields. "
64
+ "I need to click on the username field first.'\n"
65
+ "- 'There's a blue submit button at the bottom. "
66
+ "I'll click on it to submit the form.'\n"
67
+ "- 'I notice a red close button in the top right corner. "
68
+ "I'll click it to close this dialog.'\n\n"
69
+ "Use descriptive element descriptions like:\n"
70
+ "- Colors: 'red button', 'blue link', 'green checkmark'\n"
71
+ "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
72
+ "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
73
+ "- Element type: 'text field', 'dropdown menu', 'checkbox'"
74
+ )
75
+
76
+ super().__init__(
77
+ model_name=model_name,
78
+ allowed_tools=allowed_tools,
79
+ append_setup_output=append_setup_output,
80
+ system_prompt=system_prompt,
81
+ **kwargs,
82
+ )
83
+
84
+ self.grounder = Grounder(grounder_config)
85
+ self.grounded_tool = None
86
+
87
+ async def initialize(self, task: Any = None) -> None:
88
+ """Initialize the agent and create the grounded tool with mcp_client."""
89
+ # Call parent initialization first
90
+ await super().initialize(task)
91
+
92
+ if self.mcp_client is None:
93
+ raise ValueError("mcp_client must be initialized before creating grounded tool")
94
+ self.grounded_tool = GroundedComputerTool(
95
+ grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
96
+ )
97
+
98
+ def get_tool_schemas(self) -> list[Any]:
99
+ """Override to expose only the synthetic grounded tool.
100
+
101
+ The planning model only sees the synthetic "computer" tool,
102
+ which is provided by the grounded tool itself.
103
+
104
+ Returns:
105
+ List containing only the grounded computer tool schema
106
+ """
107
+ if self.grounded_tool is None:
108
+ return []
109
+ return [self.grounded_tool.get_openai_tool_schema()]
110
+
111
+ @instrument(
112
+ span_type="agent",
113
+ record_args=False,
114
+ record_result=True,
115
+ )
116
+ async def get_response(self, messages: Any) -> AgentResponse:
117
+ """Get response from the planning model and handle grounded tool calls.
118
+
119
+ This method:
120
+ 1. Calls the planning model with the grounded tool schema
121
+ 2. Executes any tool calls directly through the grounded tool
122
+ 3. Returns the response
123
+
124
+ Args:
125
+ messages: Conversation messages
126
+
127
+ Returns:
128
+ AgentResponse with either content or tool calls for MCP execution
129
+ """
130
+ tool_schemas = self.get_tool_schemas()
131
+
132
+ # Take initial screenshot and add to messages if this is the first turn
133
+ has_image = any(
134
+ isinstance(m.get("content"), list)
135
+ and any(
136
+ block.get("type") == "image_url"
137
+ for block in m["content"]
138
+ if isinstance(block, dict)
139
+ )
140
+ for m in messages
141
+ if isinstance(m.get("content"), list)
142
+ )
143
+
144
+ if not has_image:
145
+ if self.mcp_client is None:
146
+ raise ValueError("mcp_client is not initialized")
147
+ screenshot_result = await self.mcp_client.call_tool(
148
+ MCPToolCall(name="computer", arguments={"action": "screenshot"})
149
+ )
150
+
151
+ for block in screenshot_result.content:
152
+ # Check for ImageContent type from MCP
153
+ if hasattr(block, "data") and hasattr(block, "mimeType"):
154
+ mime_type = getattr(block, "mimeType", "image/png")
155
+ data = getattr(block, "data", "")
156
+ messages.append(
157
+ {
158
+ "role": "user",
159
+ "content": [
160
+ {
161
+ "type": "image_url",
162
+ "image_url": {"url": f"data:{mime_type};base64,{data}"},
163
+ }
164
+ ],
165
+ }
166
+ )
167
+ break
168
+
169
+ protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
170
+ extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
171
+
172
+ response = await self.oai.chat.completions.create(
173
+ model=self.model_name,
174
+ messages=messages,
175
+ tools=tool_schemas,
176
+ parallel_tool_calls=False,
177
+ **extra,
178
+ )
179
+
180
+ choice = response.choices[0]
181
+ msg = choice.message
182
+
183
+ assistant_msg: dict[str, Any] = {"role": "assistant"}
184
+ if msg.content:
185
+ assistant_msg["content"] = msg.content
186
+ if msg.tool_calls:
187
+ assistant_msg["tool_calls"] = msg.tool_calls
188
+
189
+ messages.append(assistant_msg)
190
+
191
+ self.conversation_history = messages.copy()
192
+
193
+ if not msg.tool_calls:
194
+ return AgentResponse(
195
+ content=msg.content or "",
196
+ tool_calls=[],
197
+ done=choice.finish_reason in ("stop", "length"),
198
+ raw=response,
199
+ )
200
+
201
+ tc = msg.tool_calls[0]
202
+
203
+ if tc.function.name != "computer":
204
+ return AgentResponse(
205
+ content=f"Error: Model called unexpected tool '{tc.function.name}'",
206
+ tool_calls=[],
207
+ done=True,
208
+ raw=response,
209
+ )
210
+
211
+ # Parse the arguments
212
+ try:
213
+ args = json.loads(tc.function.arguments or "{}")
214
+ except json.JSONDecodeError:
215
+ return AgentResponse(
216
+ content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
217
+ )
218
+
219
+ tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
220
+
221
+ return AgentResponse(
222
+ content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
223
+ )
224
+
225
+ async def call_tools(
226
+ self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
227
+ ) -> list[MCPToolResult]:
228
+ """Override call_tools to intercept computer tool calls.
229
+
230
+ Execute them through grounded tool.
231
+ """
232
+ if tool_call is None:
233
+ return []
234
+
235
+ if isinstance(tool_call, MCPToolCall):
236
+ tool_call = [tool_call]
237
+
238
+ results: list[MCPToolResult] = []
239
+ for tc in tool_call:
240
+ if tc.name == "computer":
241
+ # Execute through grounded tool instead of MCP
242
+ try:
243
+ # Extract latest screenshot from conversation history
244
+ screenshot_b64 = None
245
+ for m in reversed(self.conversation_history):
246
+ if m.get("role") == "user" and isinstance(m.get("content"), list):
247
+ for block in m["content"]:
248
+ if (
249
+ isinstance(block, dict)
250
+ and block.get("type") == "image_url"
251
+ and isinstance(block.get("image_url"), dict)
252
+ ):
253
+ url = block["image_url"].get("url", "")
254
+ if url.startswith("data:"):
255
+ screenshot_b64 = (
256
+ url.split(",", 1)[1] if "," in url else None
257
+ )
258
+ break
259
+ if screenshot_b64:
260
+ break
261
+
262
+ # Pass screenshot to grounded tool
263
+ args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
264
+ if screenshot_b64:
265
+ args_with_screenshot["screenshot_b64"] = screenshot_b64
266
+
267
+ if self.grounded_tool is None:
268
+ raise ValueError("Grounded tool is not initialized")
269
+ content_blocks = await self.grounded_tool(**args_with_screenshot)
270
+ results.append(MCPToolResult(content=content_blocks, isError=False))
271
+ except Exception as e:
272
+ # Create error result
273
+ from mcp.types import TextContent
274
+
275
+ error_content = TextContent(text=str(e), type="text")
276
+ results.append(MCPToolResult(content=[error_content], isError=True))
277
+ else:
278
+ # For non-computer tools, use parent implementation
279
+ parent_results = await super().call_tools(tc)
280
+ results.extend(parent_results)
281
+
282
+ return results
hud/agents/misc/response_agent.py CHANGED
@@ -16,7 +16,7 @@ class ResponseAgent:
16
16
  based on the agent's final response message.
17
17
  """
18
18
 
19
- def __init__(self, api_key: str | None = None) -> None:
19
+ def __init__(self, api_key: str | None = None, model: str = "gpt-4o") -> None:
20
20
  self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
21
21
  if not self.api_key:
22
22
  raise ValueError(
@@ -24,6 +24,7 @@ class ResponseAgent:
24
24
  )
25
25
 
26
26
  self.client = AsyncOpenAI(api_key=self.api_key)
27
+ self.model = model
27
28
 
28
29
  self.system_prompt = """
29
30
  You are an assistant that helps determine the appropriate response to an agent's message.
@@ -54,7 +55,7 @@ class ResponseAgent:
54
55
  """
55
56
  try:
56
57
  response = await self.client.chat.completions.create(
57
- model="gpt-5-nano",
58
+ model=self.model,
58
59
  messages=[
59
60
  {"role": "system", "content": self.system_prompt},
60
61
  {
hud/agents/openai.py CHANGED
@@ -204,7 +204,7 @@ class OperatorAgent(MCPAgent):
204
204
  break
205
205
 
206
206
  if not latest_screenshot:
207
- self.design.warning_log("No screenshot provided for response to action")
207
+ self.console.warning_log("No screenshot provided for response to action")
208
208
  return AgentResponse(
209
209
  content="No screenshot available for next action",
210
210
  tool_calls=[],
@@ -327,7 +327,7 @@ class OperatorAgent(MCPAgent):
327
327
  for content in result.content:
328
328
  if isinstance(content, types.TextContent):
329
329
  # Don't add error text as input_text, just track it
330
- self.design.error_log(f"Tool error: {content.text}")
330
+ self.console.error_log(f"Tool error: {content.text}")
331
331
  elif isinstance(content, types.ImageContent):
332
332
  # Even error results might have images
333
333
  latest_screenshot = content.data
hud/agents/openai_chat_generic.py CHANGED
@@ -17,7 +17,7 @@ from __future__ import annotations
17
17
 
18
18
  import json
19
19
  import logging
20
- from typing import TYPE_CHECKING, Any, cast
20
+ from typing import TYPE_CHECKING, Any, ClassVar, cast
21
21
 
22
22
  import mcp.types as types
23
23
 
@@ -36,6 +36,8 @@ logger = logging.getLogger(__name__)
36
36
  class GenericOpenAIChatAgent(MCPAgent):
37
37
  """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
38
38
 
39
+ metadata: ClassVar[dict[str, Any]] = {}
40
+
39
41
  def __init__(
40
42
  self,
41
43
  *,
hud/agents/tests/test_client.py CHANGED
@@ -200,7 +200,12 @@ class TestMCPClient:
200
200
  # Calling a non-existent tool should return an error result
201
201
  result = await client.call_tool(name="nonexistent", arguments={})
202
202
  assert result.isError is True
203
- assert "Tool 'nonexistent' not found" in result.content[0].text
203
+ # Check that the error message is in the text content
204
+ text_content = ""
205
+ for content in result.content:
206
+ if isinstance(content, types.TextContent):
207
+ text_content += content.text
208
+ assert "Tool 'nonexistent' not found" in text_content
204
209
 
205
210
  @pytest.mark.asyncio
206
211
  async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):