hud-python 0.2.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (64)
  1. hud/__init__.py +14 -5
  2. hud/env/docker_client.py +1 -1
  3. hud/env/environment.py +13 -8
  4. hud/env/local_docker_client.py +1 -1
  5. hud/env/remote_client.py +1 -1
  6. hud/env/remote_docker_client.py +2 -2
  7. hud/exceptions.py +2 -1
  8. hud/mcp_agent/__init__.py +15 -0
  9. hud/mcp_agent/base.py +723 -0
  10. hud/mcp_agent/claude.py +316 -0
  11. hud/mcp_agent/langchain.py +231 -0
  12. hud/mcp_agent/openai.py +318 -0
  13. hud/mcp_agent/tests/__init__.py +1 -0
  14. hud/mcp_agent/tests/test_base.py +437 -0
  15. hud/settings.py +14 -2
  16. hud/task.py +4 -0
  17. hud/telemetry/__init__.py +11 -7
  18. hud/telemetry/_trace.py +82 -71
  19. hud/telemetry/context.py +9 -27
  20. hud/telemetry/exporter.py +6 -5
  21. hud/telemetry/instrumentation/mcp.py +174 -410
  22. hud/telemetry/mcp_models.py +13 -74
  23. hud/telemetry/tests/test_context.py +9 -6
  24. hud/telemetry/tests/test_trace.py +92 -61
  25. hud/tools/__init__.py +21 -0
  26. hud/tools/base.py +65 -0
  27. hud/tools/bash.py +137 -0
  28. hud/tools/computer/__init__.py +13 -0
  29. hud/tools/computer/anthropic.py +411 -0
  30. hud/tools/computer/hud.py +315 -0
  31. hud/tools/computer/openai.py +283 -0
  32. hud/tools/edit.py +290 -0
  33. hud/tools/executors/__init__.py +13 -0
  34. hud/tools/executors/base.py +331 -0
  35. hud/tools/executors/pyautogui.py +585 -0
  36. hud/tools/executors/tests/__init__.py +1 -0
  37. hud/tools/executors/tests/test_base_executor.py +338 -0
  38. hud/tools/executors/tests/test_pyautogui_executor.py +162 -0
  39. hud/tools/executors/xdo.py +503 -0
  40. hud/tools/helper/README.md +56 -0
  41. hud/tools/helper/__init__.py +9 -0
  42. hud/tools/helper/mcp_server.py +78 -0
  43. hud/tools/helper/server_initialization.py +115 -0
  44. hud/tools/helper/utils.py +58 -0
  45. hud/tools/playwright_tool.py +373 -0
  46. hud/tools/tests/__init__.py +3 -0
  47. hud/tools/tests/test_bash.py +152 -0
  48. hud/tools/tests/test_computer.py +52 -0
  49. hud/tools/tests/test_computer_actions.py +34 -0
  50. hud/tools/tests/test_edit.py +233 -0
  51. hud/tools/tests/test_init.py +27 -0
  52. hud/tools/tests/test_playwright_tool.py +183 -0
  53. hud/tools/tests/test_tools.py +154 -0
  54. hud/tools/tests/test_utils.py +156 -0
  55. hud/tools/utils.py +50 -0
  56. hud/types.py +10 -1
  57. hud/utils/tests/test_init.py +21 -0
  58. hud/utils/tests/test_version.py +1 -1
  59. hud/version.py +1 -1
  60. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/METADATA +9 -6
  61. hud_python-0.3.0.dist-info/RECORD +124 -0
  62. hud_python-0.2.9.dist-info/RECORD +0 -85
  63. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/WHEEL +0 -0
  64. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/licenses/LICENSE +0 -0
hud/mcp_agent/openai.py
@@ -0,0 +1,318 @@
+ """OpenAI MCP Agent implementation."""
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any, Literal
+
+ from openai import AsyncOpenAI
+ from openai.types.responses import (
+     ResponseComputerToolCall,
+     ResponseInputParam,
+     ResponseOutputMessage,
+     ResponseOutputText,
+     ToolParam,
+ )
+
+ from hud.settings import settings
+
+ from .base import BaseMCPAgent
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIMCPAgent(BaseMCPAgent):
+     """
+     OpenAI agent that uses MCP servers for tool execution.
+
+     This agent uses OpenAI's Computer Use API format but executes
+     tools through MCP servers instead of direct implementation.
+     """
+
+     def __init__(
+         self,
+         model_client: AsyncOpenAI | None = None,
+         model: str = "computer-use-preview",
+         environment: Literal["windows", "mac", "linux", "browser"] = "linux",
+         display_width: int = 1024,
+         display_height: int = 768,
+         **kwargs: Any,
+     ) -> None:
+         """
+         Initialize OpenAI MCP agent.
+
+         Args:
+             model_client: AsyncOpenAI client (created if not provided)
+             model: OpenAI model to use
+             environment: Environment type for computer use
+             display_width: Display width for computer use
+             display_height: Display height for computer use
+             **kwargs: Additional arguments passed to BaseMCPAgent
+         """
+         super().__init__(**kwargs)
+
+         # Initialize client if not provided
+         if model_client is None:
+             api_key = settings.openai_api_key
+             if not api_key:
+                 raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
+             model_client = AsyncOpenAI(api_key=api_key)
+
+         self.openai_client = model_client
+         self.model = model
+         self.environment = environment
+         self.display_width = display_width
+         self.display_height = display_height
+
+         # State tracking for OpenAI's stateful API
+         self.last_response_id: str | None = None
+         self.pending_call_id: str | None = None
+         self.pending_safety_checks: list[Any] = []
+
+         # Base system prompt for autonomous operation
+         self.base_system_prompt = """
+ You are an autonomous computer-using agent. Follow these guidelines:
+
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+ 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+ 6. The user has already given you permission by running this agent. No further confirmation is needed.
+ 7. Be decisive and action-oriented. Complete the requested task fully.
+
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+ """  # noqa: E501
+
+     async def run(
+         self, prompt: str, max_steps: int = 10, conversation_mode: bool = False
+     ) -> dict[str, Any]:
+         """
+         Run the agent with the given prompt.
+
+         Override to reset OpenAI-specific state.
+         """
+         # Reset state for new run
+         self.last_response_id = None
+         self.pending_call_id = None
+         self.pending_safety_checks = []
+
+         # Use base implementation
+         return await super().run(prompt, max_steps, conversation_mode)
+
+     async def create_initial_messages(self, prompt: str, screenshot: str | None) -> list[Any]:
+         """
+         Create initial messages for OpenAI.
+
+         OpenAI uses a different message format - we'll store the prompt
+         and screenshot for use in get_model_response.
+         """
+         # For OpenAI, we don't create messages upfront, we build them in get_model_response
+         # Just return a list with the prompt and screenshot
+         return [{"prompt": prompt, "screenshot": screenshot}]
+
+     async def get_model_response(self, messages: list[Any], step: int) -> dict[str, Any]:
+         """Get response from OpenAI including any tool calls."""
+         # OpenAI's API is stateful, so we handle messages differently
+
+         # Check if we have computer tools available
+         computer_tool_name = None
+         for tool in self._available_tools:
+             if tool.name in ["computer_openai", "computer"]:
+                 computer_tool_name = tool.name
+                 break
+
+         if not computer_tool_name:
+             # No computer tools available, just return a text response
+             return {
+                 "content": "No computer use tools available",
+                 "tool_calls": [],
+                 "done": True,
+             }
+
+         # Define the computer use tool
+         computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
+             "type": "computer_use_preview",
+             "display_width": self.display_width,
+             "display_height": self.display_height,
+             "environment": self.environment,
+         }
+
+         # Build the request based on whether this is first step or follow-up
+         if self.pending_call_id is None and self.last_response_id is None:
+             # First step - extract prompt and screenshot from messages
+             initial_data = messages[0]  # Our custom format from create_initial_messages
+             prompt_text = initial_data.get("prompt", "")
+             screenshot = initial_data.get("screenshot")
+
+             # Create the initial request
+             input_content: list[dict[str, Any]] = [{"type": "input_text", "text": prompt_text}]
+
+             if screenshot:
+                 input_content.append(
+                     {
+                         "type": "input_image",
+                         "image_url": f"data:image/png;base64,{screenshot}",
+                     }
+                 )
+
+             input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]
+
+             # Combine base system prompt with any custom system prompt
+             full_instructions = self.base_system_prompt
+             if self.custom_system_prompt:
+                 full_instructions = f"{self.custom_system_prompt}\n\n{full_instructions}"
+
+             response = await self.openai_client.responses.create(
+                 model=self.model,
+                 tools=[computer_tool],
+                 input=input_param,
+                 instructions=full_instructions,
+                 truncation="auto",
+                 reasoning={"summary": "auto"},
+             )
+         else:
+             # Follow-up step - check if this is user input or tool result
+             latest_message = messages[-1] if messages else {}
+
+             if latest_message.get("type") == "user_input":
+                 # User provided input in conversation mode
+                 user_text = latest_message.get("text", "")
+                 input_param_followup: ResponseInputParam = [
+                     {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
+                 ]
+                 # Reset pending_call_id since this is user input, not a tool response
+                 self.pending_call_id = None
+             else:
+                 # Tool result - need screenshot from processed results
+                 latest_screenshot = None
+                 for msg in reversed(messages):
+                     if isinstance(msg, dict) and "screenshot" in msg:
+                         latest_screenshot = msg["screenshot"]
+                         break
+
+                 if not latest_screenshot:
+                     logger.warning("No screenshot provided for response to action")
+                     return {
+                         "content": "No screenshot available for next action",
+                         "tool_calls": [],
+                         "done": True,
+                     }
+
+                 # Create response to previous action
+                 input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
+                     {  # type: ignore[reportAssignmentType]
+                         "call_id": self.pending_call_id,
+                         "type": "computer_call_output",
+                         "output": {
+                             "type": "input_image",
+                             "image_url": f"data:image/png;base64,{latest_screenshot}",
+                         },
+                         "acknowledged_safety_checks": self.pending_safety_checks,
+                     }
+                 ]
+
+                 self.pending_safety_checks = []
+
+             response = await self.openai_client.responses.create(
+                 model=self.model,
+                 previous_response_id=self.last_response_id,
+                 tools=[computer_tool],
+                 input=input_param_followup,
+                 truncation="auto",
+             )
+
+         # Store response ID for next call
+         self.last_response_id = response.id
+
+         # Process response
+         result = {
+             "content": "",
+             "tool_calls": [],
+             "done": False,  # Will be set to True only if no tool calls
+             "raw_response": response.model_dump(),  # For debugging
+         }
+
+         self.pending_call_id = None
+
+         # Check for computer calls
+         computer_calls = [
+             item
+             for item in response.output
+             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
+         ]
+
+         if computer_calls:
+             # Process computer calls
+             result["done"] = False
+             for computer_call in computer_calls:
+                 self.pending_call_id = computer_call.call_id
+                 self.pending_safety_checks = computer_call.pending_safety_checks
+
+                 # Convert OpenAI action to MCP tool call
+                 action = computer_call.action.model_dump()
+
+                 # Map OpenAI action to MCP tool call format
+                 tool_call = {
+                     "name": computer_tool_name,
+                     "arguments": action,
+                     "call_id": computer_call.call_id,  # Store for reference
+                 }
+                 result["tool_calls"].append(tool_call)
+         else:
+             # No computer calls, check for text response
+             for item in response.output:
+                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
+                     # Extract text from content blocks
+                     text_parts = [
+                         content.text
+                         for content in item.content
+                         if isinstance(content, ResponseOutputText)
+                     ]
+                     if text_parts:
+                         result["content"] = "".join(text_parts)
+                     break
+
+         # Extract reasoning if present
+         reasoning_text = ""
+         for item in response.output:
+             if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
+                 reasoning_text += f"Thinking: {item.summary[0].text}\n"
+
+         if reasoning_text:
+             result["content"] = reasoning_text + result["content"]
+
+         # Set done=True if no tool calls (task complete or waiting for user)
+         if not result["tool_calls"]:
+             result["done"] = True
+
+         return result
+
+     async def format_tool_results(
+         self, processed_results: dict[str, Any], tool_calls: list[dict]
+     ) -> list[Any]:
+         """
+         Format tool results for OpenAI's stateful API.
+
+         OpenAI doesn't use a traditional message format - we just need to
+         preserve the screenshot for the next step.
+         """
+         # For OpenAI, we just need to track the latest screenshot
+         # Return a simple dict that get_model_response can use
+         return [
+             {
+                 "type": "tool_result",
+                 "screenshot": processed_results.get("screenshot"),
+             }
+         ]
+
+     async def create_user_message(self, text: str) -> dict[str, Any]:
+         """
+         Create a user message for OpenAI's stateful API.
+
+         Since OpenAI maintains conversation state server-side,
+         we just need to track that we're expecting user input.
+         """
+         # For OpenAI, we'll handle this in get_model_response
+         # by including the user's text in the next input
+         return {"type": "user_input", "text": text}
hud/mcp_agent/tests/__init__.py
@@ -0,0 +1 @@
+ """Tests for MCP Agent module."""