hud-python 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +17 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +379 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +354 -0
  45. hud/clients/fastmcp.py +202 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -414
  87. hud/tools/computer/hud.py +376 -328
  88. hud/tools/computer/openai.py +295 -286
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.0.dist-info/METADATA +474 -0
  126. hud_python-0.4.0.dist-info/RECORD +132 -0
  127. hud_python-0.4.0.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.4.dist-info/METADATA +0 -284
  190. hud_python-0.3.4.dist-info/RECORD +0 -120
  191. /hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/WHEEL +0 -0
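
The largest single change is the consolidation of the old hud/agent and hud/mcp packages into hud/agents; the full diff of hud/{mcp → agents}/openai.py (entry 9 above) follows below. As a rough before/after sketch of what the rename means for callers (a sketch only: import paths and constructor arguments are read off the file list and the diff below, not verified against the released wheel):

```python
# Hypothetical before/after usage sketch based on this diff.
# Module paths and re-exports are assumptions, not verified API docs.

# 0.3.4 - the OpenAI computer-use agent lived under hud.mcp:
#
#   from hud.mcp.openai import OpenAIMCPAgent
#   agent = OpenAIMCPAgent(
#       model="computer-use-preview",
#       display_width=1024,   # display size was a constructor argument
#       display_height=768,
#   )

# 0.4.0 - renamed to OperatorAgent and moved under hud.agents:
from hud.agents.openai import OperatorAgent

agent = OperatorAgent(
    model="computer-use-preview",
    environment="linux",
    # Display size now comes from computer_settings
    # (OPENAI_COMPUTER_WIDTH / OPENAI_COMPUTER_HEIGHT) via the class
    # metadata, not from constructor arguments.
)
```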
hud/{mcp → agents}/openai.py
@@ -1,334 +1,352 @@
-"""OpenAI MCP Agent implementation."""
-
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING, Any, Literal
-
-import mcp.types as types
-from mcp.types import CallToolRequestParams as MCPToolCall
-from mcp.types import CallToolResult as MCPToolResult
-from openai import AsyncOpenAI
-from openai.types.responses import (
-    ResponseComputerToolCall,
-    ResponseInputParam,
-    ResponseOutputMessage,
-    ResponseOutputText,
-    ToolParam,
-)
-
-from hud.settings import settings
-
-from .base import AgentResult, BaseMCPAgent, ModelResponse
-
-if TYPE_CHECKING:
-    from hud.datasets import TaskConfig
-
-logger = logging.getLogger(__name__)
-
-
-class OpenAIMCPAgent(BaseMCPAgent):
-    """
-    OpenAI agent that uses MCP servers for tool execution.
-
-    This agent uses OpenAI's Computer Use API format but executes
-    tools through MCP servers instead of direct implementation.
-    """
-
-    def __init__(
-        self,
-        model_client: AsyncOpenAI | None = None,
-        model: str = "computer-use-preview",
-        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
-        display_width: int = 1024,
-        display_height: int = 768,
-        **kwargs: Any,
-    ) -> None:
-        """
-        Initialize OpenAI MCP agent.
-
-        Args:
-            client: AsyncOpenAI client (created if not provided)
-            model: OpenAI model to use
-            environment: Environment type for computer use
-            display_width: Display width for computer use
-            display_height: Display height for computer use
-            **kwargs: Additional arguments passed to BaseMCPAgent
-        """
-        super().__init__(**kwargs)
-
-        # Initialize client if not provided
-        if model_client is None:
-            api_key = settings.openai_api_key
-            if not api_key:
-                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
-            model_client = AsyncOpenAI(api_key=api_key)
-
-        self.openai_client = model_client
-        self.model = model
-        self.environment = environment
-        self.display_width = display_width
-        self.display_height = display_height
-
-        # State tracking for OpenAI's stateful API
-        self.last_response_id: str | None = None
-        self.pending_call_id: str | None = None
-        self.pending_safety_checks: list[Any] = []
-
-        self.model_name = "openai-" + self.model
-
-        # Base system prompt for autonomous operation
-        self.base_system_prompt = """
-You are an autonomous computer-using agent. Follow these guidelines:
-
-1. NEVER ask for confirmation. Complete all tasks autonomously.
-2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
-3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
-4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
-5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
-6. The user has already given you permission by running this agent. No further confirmation is needed.
-7. Be decisive and action-oriented. Complete the requested task fully.
-
-Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-"""  # noqa: E501
-
-    async def run(self, prompt_or_task: str | TaskConfig, max_steps: int = 10) -> AgentResult:
-        """
-        Run the agent with the given prompt or task.
-
-        Override to reset OpenAI-specific state.
-        """
-        # Reset state for new run
-        self.last_response_id = None
-        self.pending_call_id = None
-        self.pending_safety_checks = []
-
-        # Use base implementation
-        return await super().run(prompt_or_task, max_steps)
-
-    async def create_initial_messages(
-        self, prompt: str, screenshot: str | None = None
-    ) -> list[Any]:
-        """
-        Create initial messages for OpenAI.
-
-        OpenAI uses a different message format - we'll store the prompt
-        and screenshot for use in get_model_response.
-        """
-        # For OpenAI, we don't create messages upfront, we build them in get_model_response
-        # Just return a list with the prompt and screenshot
-        return [{"prompt": prompt, "screenshot": screenshot}]
-
-    async def get_model_response(self, messages: list[Any]) -> ModelResponse:
-        """Get response from OpenAI including any tool calls."""
-        # OpenAI's API is stateful, so we handle messages differently
-
-        # Check if we have computer tools available
-        computer_tool_name = None
-        for tool in self._available_tools:
-            if tool.name in ["computer_openai", "computer"]:
-                computer_tool_name = tool.name
-                break
-
-        if not computer_tool_name:
-            # No computer tools available, just return a text response
-            return ModelResponse(
-                content="No computer use tools available",
-                tool_calls=[],
-                done=True,
-            )
-
-        # Define the computer use tool
-        computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
-            "type": "computer_use_preview",
-            "display_width": self.display_width,
-            "display_height": self.display_height,
-            "environment": self.environment,
-        }
-
-        # Build the request based on whether this is first step or follow-up
-        if self.pending_call_id is None and self.last_response_id is None:
-            # First step - extract prompt and screenshot from messages
-            initial_data = messages[0]  # Our custom format from create_initial_messages
-            prompt_text = initial_data.get("prompt", "")
-            screenshot = initial_data.get("screenshot")
-
-            # Create the initial request
-            input_content: list[dict[str, Any]] = [{"type": "input_text", "text": prompt_text}]
-
-            if screenshot:
-                input_content.append(
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{screenshot}",
-                    }
-                )
-
-            input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]
-
-            # Combine base system prompt with any custom system prompt
-            full_instructions = self.base_system_prompt
-            if self.custom_system_prompt:
-                full_instructions = f"{self.custom_system_prompt}\n\n{full_instructions}"
-
-            response = await self.openai_client.responses.create(
-                model=self.model,
-                tools=[computer_tool],
-                input=input_param,
-                instructions=full_instructions,
-                truncation="auto",
-                reasoning={"summary": "auto"},
-            )
-        else:
-            # Follow-up step - check if this is user input or tool result
-            latest_message = messages[-1] if messages else {}
-
-            if latest_message.get("type") == "user_input":
-                # User provided input in conversation mode
-                user_text = latest_message.get("text", "")
-                input_param_followup: ResponseInputParam = [
-                    {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
-                ]
-                # Reset pending_call_id since this is user input, not a tool response
-                self.pending_call_id = None
-            else:
-                # Tool result - need screenshot from processed results
-                latest_screenshot = None
-                for msg in reversed(messages):
-                    if isinstance(msg, dict) and "screenshot" in msg:
-                        latest_screenshot = msg["screenshot"]
-                        break
-
-                if not latest_screenshot:
-                    logger.warning("No screenshot provided for response to action")
-                    return ModelResponse(
-                        content="No screenshot available for next action",
-                        tool_calls=[],
-                        done=True,
-                    )
-
-                # Create response to previous action
-                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
-                    {  # type: ignore[reportAssignmentType]
-                        "call_id": self.pending_call_id,
-                        "type": "computer_call_output",
-                        "output": {
-                            "type": "input_image",
-                            "image_url": f"data:image/png;base64,{latest_screenshot}",
-                        },
-                        "acknowledged_safety_checks": self.pending_safety_checks,
-                    }
-                ]
-
-                self.pending_safety_checks = []
-
-            response = await self.openai_client.responses.create(
-                model=self.model,
-                previous_response_id=self.last_response_id,
-                tools=[computer_tool],
-                input=input_param_followup,
-                truncation="auto",
-            )
-
-        # Store response ID for next call
-        self.last_response_id = response.id
-
-        # Process response
-        result = ModelResponse(
-            content="",
-            tool_calls=[],
-            done=False,  # Will be set to True only if no tool calls
-        )
-
-        self.pending_call_id = None
-
-        # Check for computer calls
-        computer_calls = [
-            item
-            for item in response.output
-            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
-        ]
-
-        if computer_calls:
-            # Process computer calls
-            result.done = False
-            for computer_call in computer_calls:
-                self.pending_call_id = computer_call.call_id
-                self.pending_safety_checks = computer_call.pending_safety_checks
-
-                # Convert OpenAI action to MCP tool call
-                action = computer_call.action.model_dump()
-
-                # Create MCPToolCall object with OpenAI metadata as extra fields
-                # Pyright will complain but the tool class accepts extra fields
-                tool_call = MCPToolCall(
-                    name=computer_tool_name,
-                    arguments=action,
-                    call_id=computer_call.call_id,  # type: ignore
-                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
-                )
-                result.tool_calls.append(tool_call)
-        else:
-            # No computer calls, check for text response
-            for item in response.output:
-                if isinstance(item, ResponseOutputMessage) and item.type == "message":
-                    # Extract text from content blocks
-                    text_parts = [
-                        content.text
-                        for content in item.content
-                        if isinstance(content, ResponseOutputText)
-                    ]
-                    if text_parts:
-                        result.content = "".join(text_parts)
-                    break
-
-        # Extract reasoning if present
-        reasoning_text = ""
-        for item in response.output:
-            if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
-                reasoning_text += f"Thinking: {item.summary[0].text}\n"
-
-        if reasoning_text:
-            result.content = reasoning_text + result.content if result.content else reasoning_text
-
-        # Set done=True if no tool calls (task complete or waiting for user)
-        if not result.tool_calls:
-            result.done = True
-
-        return result
-
-    async def format_tool_results(
-        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
-    ) -> list[Any]:
-        """
-        Format tool results for OpenAI's stateful API.
-
-        OpenAI doesn't use a traditional message format - we just need to
-        preserve the screenshot for the next step.
-        """
-        # Extract latest screenshot from results
-        latest_screenshot = None
-        for result in tool_results:
-            if not result.isError:
-                for content in result.content:
-                    if isinstance(content, types.ImageContent):
-                        latest_screenshot = content.data
-
-        # Return a simple dict that get_model_response can use
-        return [
-            {
-                "type": "tool_result",
-                "screenshot": latest_screenshot,
-            }
-        ]
-
-    async def create_user_message(self, text: str) -> dict[str, Any]:
-        """
-        Create a user message for OpenAI's stateful API.
-
-        Since OpenAI maintains conversation state server-side,
-        we just need to track that we're expecting user input.
-        """
-        # For OpenAI, we'll handle this in get_model_response
-        # by including the user's text in the next input
-        return {"type": "user_input", "text": text}
+"""OpenAI MCP Agent implementation."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, ClassVar, Literal
+
+import mcp.types as types
+from openai import AsyncOpenAI
+from openai.types.responses import (
+    ResponseComputerToolCall,
+    ResponseInputMessageContentListParam,
+    ResponseInputParam,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ToolParam,
+)
+
+import hud
+from hud.settings import settings
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+
+from .base import MCPAgent
+
+logger = logging.getLogger(__name__)
+
+
+class OperatorAgent(MCPAgent):
+    """
+    Operator agent that uses MCP servers for tool execution.
+
+    This agent uses OpenAI's Computer Use API format but executes
+    tools through MCP servers instead of direct implementation.
+    """
+
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
+
+    def __init__(
+        self,
+        model_client: AsyncOpenAI | None = None,
+        model: str = "computer-use-preview",
+        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize Operator MCP agent.
+
+        Args:
+            client: AsyncOpenAI client (created if not provided)
+            model: OpenAI model to use
+            environment: Environment type for computer use
+            display_width: Display width for computer use
+            display_height: Display height for computer use
+            **kwargs: Additional arguments passed to MCPAgent
+        """
+        super().__init__(**kwargs)
+
+        # Initialize client if not provided
+        if model_client is None:
+            api_key = settings.openai_api_key
+            if not api_key:
+                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
+            model_client = AsyncOpenAI(api_key=api_key)
+
+        self.openai_client = model_client
+        self.model = model
+        self.environment = environment
+
+        # State tracking for OpenAI's stateful API
+        self.last_response_id: str | None = None
+        self.pending_call_id: str | None = None
+        self.pending_safety_checks: list[Any] = []
+
+        self.model_name = "openai-" + self.model
+
+        # Base system prompt for autonomous operation
+        self.system_prompt = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+""".strip()  # noqa: E501
+
+    async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
+        """
+        Run the agent with the given prompt or task.
+
+        Override to reset OpenAI-specific state.
+        """
+        # Reset state for new run
+        self.last_response_id = None
+        self.pending_call_id = None
+        self.pending_safety_checks = []
+
+        # Use base implementation
+        return await super()._run_context(context, max_steps=max_steps)
+
+    async def get_system_messages(self) -> list[Any]:
+        """
+        Create initial messages for OpenAI.
+
+        OpenAI uses a different message format - we'll store the prompt
+        and screenshot for use in get_model_response.
+        """
+        return []
+
+    async def format_blocks(
+        self, blocks: list[types.ContentBlock]
+    ) -> ResponseInputMessageContentListParam:
+        """
+        Format blocks for OpenAI input format.
+
+        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
+        """  # noqa: E501
+        formatted = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                formatted.append({"type": "input_text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                mime_type = getattr(block, "mimeType", "image/png")
+                formatted.append(
+                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
+                )
+        return formatted
+
+    @hud.instrument(
+        span_type="agent",
+        record_args=False,  # Messages can be large
+        record_result=True,
+    )
+    async def get_response(self, messages: ResponseInputMessageContentListParam) -> AgentResponse:
+        """Get response from OpenAI including any tool calls."""
+        # OpenAI's API is stateful, so we handle messages differently
+
+        # Check if we have computer tools available
+        computer_tool_name = None
+        for tool in self._available_tools:
+            if tool.name in ["openai_computer", "computer"]:
+                computer_tool_name = tool.name
+                break
+
+        if not computer_tool_name:
+            # No computer tools available, just return a text response
+            return AgentResponse(
+                content="No computer use tools available",
+                tool_calls=[],
+                done=True,
+            )
+
+        # Define the computer use tool
+        computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
+            "type": "computer_use_preview",
+            "display_width": self.metadata["display_width"],
+            "display_height": self.metadata["display_height"],
+            "environment": self.environment,
+        }
+
+        # Build the request based on whether this is first step or follow-up
+        if self.pending_call_id is None and self.last_response_id is None:
+            # First step - messages are already formatted dicts from format_blocks
+            # format_blocks returns type ResponseInputMessageContentListParam, which is a list of dicts  # noqa: E501
+            input_content: ResponseInputMessageContentListParam = []
+
+            input_content.extend(messages)
+
+            # If no content was added, add empty text to avoid empty request
+            if not input_content:
+                input_content.append({"type": "input_text", "text": ""})
+
+            input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]
+
+            response = await self.openai_client.responses.create(
+                model=self.model,
+                tools=[computer_tool],
+                input=input_param,
+                instructions=self.system_prompt,
+                truncation="auto",
+                reasoning={"summary": "auto"},  # type: ignore[arg-type]
+            )
+        else:
+            # Follow-up step - check if this is user input or tool result
+            latest_message = messages[-1] if messages else {}
+
+            if latest_message.get("type") == "input_text":
+                # User provided input in conversation mode
+                user_text = latest_message.get("text", "")
+                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
+                    {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
+                ]
+                # Reset pending_call_id since this is user input, not a tool response
+                self.pending_call_id = None
+            else:
+                # Tool result - need screenshot from processed results
+                latest_screenshot = None
+                for msg in reversed(messages):
+                    if isinstance(msg, dict) and "image_url" in msg:
+                        latest_screenshot = msg["image_url"]  # type: ignore
+                        break
+
+                if not latest_screenshot:
+                    logger.warning("No screenshot provided for response to action")
+                    return AgentResponse(
+                        content="No screenshot available for next action",
+                        tool_calls=[],
+                        done=True,
+                    )
+
+                # Create response to previous action
+                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
+                    {  # type: ignore[reportAssignmentType]
+                        "call_id": self.pending_call_id,
+                        "type": "computer_call_output",
+                        "output": {
+                            "type": "input_image",
+                            "image_url": latest_screenshot,
+                        },
+                        "acknowledged_safety_checks": self.pending_safety_checks,
+                    }
+                ]
+
+                self.pending_safety_checks = []
+
+            response = await self.openai_client.responses.create(
+                model=self.model,
+                previous_response_id=self.last_response_id,
+                tools=[computer_tool],
+                input=input_param_followup,
+                instructions=self.system_prompt,
+                truncation="auto",
+                reasoning={"summary": "auto"},  # type: ignore[arg-type]
+            )
+
+        # Store response ID for next call
+        self.last_response_id = response.id
+
+        # Process response
+        result = AgentResponse(
+            content="",
+            tool_calls=[],
+            done=False,  # Will be set to True only if no tool calls
+        )
+
+        self.pending_call_id = None
+
+        # Check for computer calls
+        computer_calls = [
+            item
+            for item in response.output
+            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
+        ]
+
+        if computer_calls:
+            # Process computer calls
+            result.done = False
+            for computer_call in computer_calls:
+                self.pending_call_id = computer_call.call_id
+                self.pending_safety_checks = computer_call.pending_safety_checks
+
+                # Convert OpenAI action to MCP tool call
+                action = computer_call.action.model_dump()
+
+                # Create MCPToolCall object with OpenAI metadata as extra fields
+                # Pyright will complain but the tool class accepts extra fields
+                tool_call = MCPToolCall(
+                    name=computer_tool_name,
+                    arguments=action,
+                    id=computer_call.call_id,  # type: ignore
+                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
+                )
+                result.tool_calls.append(tool_call)
+        else:
+            # No computer calls, check for text response
+            for item in response.output:
+                if isinstance(item, ResponseOutputMessage) and item.type == "message":
+                    # Extract text from content blocks
+                    text_parts = [
+                        content.text
+                        for content in item.content
+                        if isinstance(content, ResponseOutputText)
+                    ]
+                    if text_parts:
+                        result.content = "".join(text_parts)
+                    break
+
+        # Extract reasoning if present
+        reasoning_text = ""
+        for item in response.output:
+            if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
+                reasoning_text += f"Thinking: {item.summary[0].text}\n"
+
+        if reasoning_text:
+            result.content = reasoning_text + result.content if result.content else reasoning_text
+
+        # Set done=True if no tool calls (task complete or waiting for user)
+        if not result.tool_calls:
+            result.done = True
+
+        return result
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> ResponseInputMessageContentListParam:
+        """
+        Format tool results for OpenAI's stateful API.
+
+        Tool result content is a list of ContentBlock objects.
+        We need to extract the latest screenshot from the tool results.
+
+        This assumes that you only care about computer tool results for your agent loop.
+        If you need to add other content, you can do so by adding a new ContentBlock object to the list.
+
+        Returns formatted dicts with tool result data, preserving screenshots.
+        """  # noqa: E501
+        formatted_results = []
+        latest_screenshot = None
+
+        # Extract all content from tool results
+        for result in tool_results:
+            if result.isError:
+                # If it's an error, the error details are in the content
+                for content in result.content:
+                    if isinstance(content, types.TextContent):
+                        # Don't add error text as input_text, just track it
+                        logger.error("Tool error: %s", content.text)
+                    elif isinstance(content, types.ImageContent):
+                        # Even error results might have images
+                        latest_screenshot = content.data
+            else:
+                # Extract content from successful results
+                for content in result.content:
+                    if isinstance(content, types.ImageContent):
+                        latest_screenshot = content.data
+                        break
+
+        # Return a dict with the latest screenshot for the follow-up step
+        if latest_screenshot:
+            formatted_results.append(
+                {"type": "input_image", "image_url": f"data:image/png;base64,{latest_screenshot}"}
+            )
+
+        return formatted_results
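
For readers skimming the diff above: the first responses.create call sends the user's content, and every later call answers the model's pending computer_call by call_id with a computer_call_output screenshot, threaded through previous_response_id. Stripped of the hud plumbing, the control flow is roughly the sketch below (against the public OpenAI Responses API; execute_action is a hypothetical stand-in for the MCP tool execution and screenshot capture that hud performs):

```python
# Minimal sketch of the stateful computer-use loop implemented above.
# `execute_action` is a hypothetical stand-in for hud's MCP tool execution.
from openai import AsyncOpenAI


async def computer_use_loop(client: AsyncOpenAI, prompt: str, execute_action) -> str:
    tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "linux",
    }
    # First request: send the user's content
    response = await client.responses.create(
        model="computer-use-preview",
        tools=[tool],
        input=[{"role": "user", "content": [{"type": "input_text", "text": prompt}]}],
        truncation="auto",
    )
    while True:
        calls = [item for item in response.output if item.type == "computer_call"]
        if not calls:
            # No pending action: the model finished (or is waiting for user input)
            return response.output_text
        call = calls[0]
        screenshot_b64 = await execute_action(call.action)  # returns a base64 PNG
        # Follow-up request: answer the pending call with a screenshot,
        # threading server-side state through previous_response_id
        response = await client.responses.create(
            model="computer-use-preview",
            previous_response_id=response.id,
            tools=[tool],
            input=[
                {
                    "call_id": call.call_id,
                    "type": "computer_call_output",
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_b64}",
                    },
                    "acknowledged_safety_checks": call.pending_safety_checks,
                }
            ],
            truncation="auto",
        )
```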