hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (130)
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/agents/openai.py CHANGED
@@ -1,352 +1,352 @@
"""OpenAI MCP Agent implementation."""

from __future__ import annotations

import logging
from typing import Any, ClassVar, Literal

import mcp.types as types
from openai import AsyncOpenAI
from openai.types.responses import (
    ResponseComputerToolCall,
    ResponseInputMessageContentListParam,
    ResponseInputParam,
    ResponseOutputMessage,
    ResponseOutputText,
    ToolParam,
)

import hud
from hud.settings import settings
from hud.tools.computer.settings import computer_settings
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace

from .base import MCPAgent

logger = logging.getLogger(__name__)


class OperatorAgent(MCPAgent):
    """
    Operator agent that uses MCP servers for tool execution.

    This agent uses OpenAI's Computer Use API format but executes
    tools through MCP servers instead of a direct implementation.
    """

    metadata: ClassVar[dict[str, Any]] = {
        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
    }

    def __init__(
        self,
        model_client: AsyncOpenAI | None = None,
        model: str = "computer-use-preview",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        **kwargs: Any,
    ) -> None:
        """
        Initialize Operator MCP agent.

        Args:
            model_client: AsyncOpenAI client (created if not provided)
            model: OpenAI model to use
            environment: Environment type for computer use
            **kwargs: Additional arguments passed to MCPAgent
        """
        super().__init__(**kwargs)

        # Initialize client if not provided
        if model_client is None:
            api_key = settings.openai_api_key
            if not api_key:
                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
            model_client = AsyncOpenAI(api_key=api_key)

        self.openai_client = model_client
        self.model = model
        self.environment = environment

        # State tracking for OpenAI's stateful API
        self.last_response_id: str | None = None
        self.pending_call_id: str | None = None
        self.pending_safety_checks: list[Any] = []

        self.model_name = "openai-" + self.model

        # Base system prompt for autonomous operation
        self.system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
""".strip()  # noqa: E501

    async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
        """
        Run the agent with the given prompt or task.

        Override to reset OpenAI-specific state.
        """
        # Reset state for new run
        self.last_response_id = None
        self.pending_call_id = None
        self.pending_safety_checks = []

        # Use base implementation
        return await super()._run_context(context, max_steps=max_steps)

    async def get_system_messages(self) -> list[Any]:
        """
        Create initial messages for OpenAI.

        OpenAI uses a different message format - we'll store the prompt
        and screenshot for use in get_response.
        """
        return []

    async def format_blocks(
        self, blocks: list[types.ContentBlock]
    ) -> ResponseInputMessageContentListParam:
        """
        Format blocks for OpenAI input format.

        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
        """  # noqa: E501
        formatted = []
        for block in blocks:
            if isinstance(block, types.TextContent):
                formatted.append({"type": "input_text", "text": block.text})
            elif isinstance(block, types.ImageContent):
                mime_type = getattr(block, "mimeType", "image/png")
                formatted.append(
                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
                )
        return formatted

    @hud.instrument(
        span_type="agent",
        record_args=False,  # Messages can be large
        record_result=True,
    )
    async def get_response(self, messages: ResponseInputMessageContentListParam) -> AgentResponse:
        """Get response from OpenAI including any tool calls."""
        # OpenAI's API is stateful, so we handle messages differently

        # Check if we have computer tools available
        computer_tool_name = None
        for tool in self._available_tools:
            if tool.name in ["openai_computer", "computer"]:
                computer_tool_name = tool.name
                break

        if not computer_tool_name:
            # No computer tools available, just return a text response
            return AgentResponse(
                content="No computer use tools available",
                tool_calls=[],
                done=True,
            )

        # Define the computer use tool
        computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
            "type": "computer_use_preview",
            "display_width": self.metadata["display_width"],
            "display_height": self.metadata["display_height"],
            "environment": self.environment,
        }

        # Build the request based on whether this is the first step or a follow-up
        if self.pending_call_id is None and self.last_response_id is None:
            # First step - messages are already formatted dicts from format_blocks,
            # which returns ResponseInputMessageContentListParam (a list of dicts)
            input_content: ResponseInputMessageContentListParam = []

            input_content.extend(messages)

            # If no content was added, add empty text to avoid an empty request
            if not input_content:
                input_content.append({"type": "input_text", "text": ""})

            input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]

            response = await self.openai_client.responses.create(
                model=self.model,
                tools=[computer_tool],
                input=input_param,
                instructions=self.system_prompt,
                truncation="auto",
                reasoning={"summary": "auto"},  # type: ignore[arg-type]
            )
        else:
            # Follow-up step - check if this is user input or a tool result
            latest_message = messages[-1] if messages else {}

            if latest_message.get("type") == "input_text":
                # User provided input in conversation mode
                user_text = latest_message.get("text", "")
                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
                    {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
                ]
                # Reset pending_call_id since this is user input, not a tool response
                self.pending_call_id = None
            else:
                # Tool result - need a screenshot from the processed results
                latest_screenshot = None
                for msg in reversed(messages):
                    if isinstance(msg, dict) and "image_url" in msg:
                        latest_screenshot = msg["image_url"]  # type: ignore
                        break

                if not latest_screenshot:
                    logger.warning("No screenshot provided for response to action")
                    return AgentResponse(
                        content="No screenshot available for next action",
                        tool_calls=[],
                        done=True,
                    )

                # Create response to previous action
                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
                    {  # type: ignore[reportAssignmentType]
                        "call_id": self.pending_call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "input_image",
                            "image_url": latest_screenshot,
                        },
                        "acknowledged_safety_checks": self.pending_safety_checks,
                    }
                ]

            self.pending_safety_checks = []

            response = await self.openai_client.responses.create(
                model=self.model,
                previous_response_id=self.last_response_id,
                tools=[computer_tool],
                input=input_param_followup,
                instructions=self.system_prompt,
                truncation="auto",
                reasoning={"summary": "auto"},  # type: ignore[arg-type]
            )

        # Store response ID for next call
        self.last_response_id = response.id

        # Process response
        result = AgentResponse(
            content="",
            tool_calls=[],
            done=False,  # Will be set to True only if no tool calls
        )

        self.pending_call_id = None

        # Check for computer calls
        computer_calls = [
            item
            for item in response.output
            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
        ]

        if computer_calls:
            # Process computer calls
            result.done = False
            for computer_call in computer_calls:
                self.pending_call_id = computer_call.call_id
                self.pending_safety_checks = computer_call.pending_safety_checks

                # Convert OpenAI action to MCP tool call
                action = computer_call.action.model_dump()

                # Create MCPToolCall object with OpenAI metadata as extra fields
                # Pyright will complain but the tool class accepts extra fields
                tool_call = MCPToolCall(
                    name=computer_tool_name,
                    arguments=action,
                    id=computer_call.call_id,  # type: ignore
                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
                )
                result.tool_calls.append(tool_call)
        else:
            # No computer calls, check for a text response
            for item in response.output:
                if isinstance(item, ResponseOutputMessage) and item.type == "message":
                    # Extract text from content blocks
                    text_parts = [
                        content.text
                        for content in item.content
                        if isinstance(content, ResponseOutputText)
                    ]
                    if text_parts:
                        result.content = "".join(text_parts)
                    break

        # Extract reasoning if present
        reasoning_text = ""
        for item in response.output:
            if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
                reasoning_text += f"Thinking: {item.summary[0].text}\n"

        if reasoning_text:
            result.content = reasoning_text + result.content if result.content else reasoning_text

        # Set done=True if no tool calls (task complete or waiting for user)
        if not result.tool_calls:
            result.done = True

        return result

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> ResponseInputMessageContentListParam:
        """
        Format tool results for OpenAI's stateful API.

        Tool result content is a list of ContentBlock objects.
        We need to extract the latest screenshot from the tool results.

        This assumes that you only care about computer tool results for your agent loop.
        If you need to add other content, you can do so by adding a new ContentBlock object to the list.

        Returns formatted dicts with tool result data, preserving screenshots.
        """  # noqa: E501
        formatted_results = []
        latest_screenshot = None

        # Extract all content from tool results
        for result in tool_results:
            if result.isError:
                # If it's an error, the error details are in the content
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        # Don't add error text as input_text, just track it
                        logger.error("Tool error: %s", content.text)
                    elif isinstance(content, types.ImageContent):
                        # Even error results might have images
                        latest_screenshot = content.data
            else:
                # Extract content from successful results
                for content in result.content:
                    if isinstance(content, types.ImageContent):
                        latest_screenshot = content.data
                        break

        # Return a dict with the latest screenshot for the follow-up step
        if latest_screenshot:
            formatted_results.append(
                {"type": "input_image", "image_url": f"data:image/png;base64,{latest_screenshot}"}
            )

        return formatted_results
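
For orientation, a minimal sketch of driving the agent above. This is not from the package itself: it assumes OPENAI_API_KEY is set, an MCP server exposing a "computer" or "openai_computer" tool is wired up through the MCPAgent kwargs, and MCPAgent provides a run() entry point; the task string is hypothetical.

import asyncio

from hud.agents.openai import OperatorAgent


async def main() -> None:
    # With no model_client passed, __init__ above builds an AsyncOpenAI
    # client from settings.openai_api_key.
    agent = OperatorAgent(environment="browser")

    # Hypothetical task prompt: each step feeds a computer_call_output
    # (screenshot) back to the stateful Responses API until the model
    # returns a message with no tool calls.
    result = await agent.run("Find the pricing page and report the cheapest plan")
    print(result)


if __name__ == "__main__":
    asyncio.run(main())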