hud-python 0.2.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (64)
  1. hud/__init__.py +14 -5
  2. hud/env/docker_client.py +1 -1
  3. hud/env/environment.py +13 -8
  4. hud/env/local_docker_client.py +1 -1
  5. hud/env/remote_client.py +1 -1
  6. hud/env/remote_docker_client.py +2 -2
  7. hud/exceptions.py +2 -1
  8. hud/mcp_agent/__init__.py +15 -0
  9. hud/mcp_agent/base.py +723 -0
  10. hud/mcp_agent/claude.py +316 -0
  11. hud/mcp_agent/langchain.py +231 -0
  12. hud/mcp_agent/openai.py +318 -0
  13. hud/mcp_agent/tests/__init__.py +1 -0
  14. hud/mcp_agent/tests/test_base.py +437 -0
  15. hud/settings.py +14 -2
  16. hud/task.py +4 -0
  17. hud/telemetry/__init__.py +11 -7
  18. hud/telemetry/_trace.py +82 -71
  19. hud/telemetry/context.py +9 -27
  20. hud/telemetry/exporter.py +6 -5
  21. hud/telemetry/instrumentation/mcp.py +174 -410
  22. hud/telemetry/mcp_models.py +13 -74
  23. hud/telemetry/tests/test_context.py +9 -6
  24. hud/telemetry/tests/test_trace.py +92 -61
  25. hud/tools/__init__.py +21 -0
  26. hud/tools/base.py +65 -0
  27. hud/tools/bash.py +137 -0
  28. hud/tools/computer/__init__.py +13 -0
  29. hud/tools/computer/anthropic.py +411 -0
  30. hud/tools/computer/hud.py +315 -0
  31. hud/tools/computer/openai.py +283 -0
  32. hud/tools/edit.py +290 -0
  33. hud/tools/executors/__init__.py +13 -0
  34. hud/tools/executors/base.py +331 -0
  35. hud/tools/executors/pyautogui.py +585 -0
  36. hud/tools/executors/tests/__init__.py +1 -0
  37. hud/tools/executors/tests/test_base_executor.py +338 -0
  38. hud/tools/executors/tests/test_pyautogui_executor.py +162 -0
  39. hud/tools/executors/xdo.py +503 -0
  40. hud/tools/helper/README.md +56 -0
  41. hud/tools/helper/__init__.py +9 -0
  42. hud/tools/helper/mcp_server.py +78 -0
  43. hud/tools/helper/server_initialization.py +115 -0
  44. hud/tools/helper/utils.py +58 -0
  45. hud/tools/playwright_tool.py +373 -0
  46. hud/tools/tests/__init__.py +3 -0
  47. hud/tools/tests/test_bash.py +152 -0
  48. hud/tools/tests/test_computer.py +52 -0
  49. hud/tools/tests/test_computer_actions.py +34 -0
  50. hud/tools/tests/test_edit.py +233 -0
  51. hud/tools/tests/test_init.py +27 -0
  52. hud/tools/tests/test_playwright_tool.py +183 -0
  53. hud/tools/tests/test_tools.py +154 -0
  54. hud/tools/tests/test_utils.py +156 -0
  55. hud/tools/utils.py +50 -0
  56. hud/types.py +10 -1
  57. hud/utils/tests/test_init.py +21 -0
  58. hud/utils/tests/test_version.py +1 -1
  59. hud/version.py +1 -1
  60. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/METADATA +9 -6
  61. hud_python-0.3.0.dist-info/RECORD +124 -0
  62. hud_python-0.2.9.dist-info/RECORD +0 -85
  63. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/WHEEL +0 -0
  64. {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/licenses/LICENSE +0 -0
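
The bulk of this release is two new subpackages: hud.mcp_agent (a base MCP agent plus Claude, OpenAI, and LangChain variants) and hud.tools (bash, edit, playwright, and computer-control tools backed by pluggable executors). A minimal sketch of the new import surface, limited to the two classes exercised by the new test suite below (the provider-specific agent class names are not shown in this diff):

# Sketch only: these import paths are taken from the new test file below.
from hud.mcp_agent.base import BaseMCPAgent
from hud.tools.executors.base import BaseExecutor

# Per the tests, BaseExecutor runs in a simulated mode out of the box.
executor = BaseExecutor()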
hud/mcp_agent/tests/test_base.py ADDED
@@ -0,0 +1,437 @@
+"""Tests for BaseMCPAgent using simulated actions."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from unittest.mock import MagicMock
+
+import pytest
+from mcp import types
+
+from hud.mcp_agent.base import BaseMCPAgent
+from hud.tools.executors.base import BaseExecutor
+
+if TYPE_CHECKING:
+    from hud.task import Task
+
+
+class MockMCPAgent(BaseMCPAgent):
+    """Concrete implementation of BaseMCPAgent for testing."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.executor = BaseExecutor()  # Use simulated executor
+        self._messages = []
+
+    async def run(self, task: Task) -> list[dict[str, Any]]:
+        """Mock run method."""
+        return self._messages
+
+    def create_initial_messages(
+        self, prompt: str, screenshot: str | None = None
+    ) -> list[dict[str, Any]]:
+        """Mock create initial messages."""
+        messages = [{"role": "user", "content": prompt}]
+        if screenshot:
+            messages.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
+        return messages
+
+    def get_model_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Mock get model response."""
+        return {"role": "assistant", "content": "Mock response"}
+
+    def format_tool_results(
+        self,
+        results: list[tuple[str, Any]],
+        screenshot: str | None = None,
+        assistant_msg: dict[str, Any] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Mock format tool results."""
+        formatted = []
+        for tool_name, result in results:
+            formatted.append({"role": "tool", "name": tool_name, "content": str(result)})
+        if screenshot:
+            formatted.append({"role": "screenshot", "content": screenshot})
+        return formatted
+
+    async def create_user_message(self, text: str) -> Any:
+        """Mock create user message."""
+        return {"role": "user", "content": text}
+
+
+class TestBaseMCPAgent:
+    """Tests for BaseMCPAgent with simulated actions."""
+
+    def test_init_defaults(self):
+        """Test initialization with default values."""
+        agent = MockMCPAgent()
+
+        assert agent.client is not None
+        assert agent.allowed_tools is None
+        assert agent.disallowed_tools == []
+        assert agent.initial_screenshot is False
+        assert agent.max_screenshot_history == 3
+        assert agent.append_tool_system_prompt is True
+        assert agent.custom_system_prompt is None
+        assert agent.lifecycle_tools == {"setup": "setup", "evaluate": "evaluate"}
+
+    def test_init_with_params(self):
+        """Test initialization with custom parameters."""
+        client = MagicMock()
+        agent = MockMCPAgent(
+            client=client,
+            allowed_tools=["tool1", "tool2"],
+            disallowed_tools=["bad_tool"],
+            initial_screenshot=True,
+            max_screenshot_history=5,
+            append_tool_system_prompt=False,
+            custom_system_prompt="Custom prompt",
+            lifecycle_tools={"setup": "custom_setup", "evaluate": "custom_eval"},
+        )
+
+        assert agent.client == client
+        assert agent.allowed_tools == ["tool1", "tool2"]
+        assert agent.disallowed_tools == ["bad_tool"]
+        assert agent.initial_screenshot is True
+        assert agent.max_screenshot_history == 5
+        assert agent.append_tool_system_prompt is False
+        assert agent.custom_system_prompt == "Custom prompt"
+        assert agent.lifecycle_tools == {"setup": "custom_setup", "evaluate": "custom_eval"}
+
+    @pytest.mark.asyncio
+    async def test_initialize_no_client(self):
+        """Test initialize fails without client."""
+        agent = MockMCPAgent()
+        agent.client = None
+
+        with pytest.raises(ValueError, match="Client is not initialized"):
+            await agent.initialize()
+
+    @pytest.mark.asyncio
+    async def test_initialize_with_sessions(self):
+        """Test initialize with existing sessions."""
+        agent = MockMCPAgent()
+
+        # Create proper async mock for session
+        mock_session = MagicMock()
+
+        # Set up the connector and client_session structure
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        # Mock list_tools on the client_session
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+                    types.Tool(
+                        name="setup", description="Setup tool", inputSchema={"type": "object"}
+                    ),
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Check available tools were populated (excludes lifecycle tools)
+        tools = agent.get_available_tools()
+        assert len(tools) == 2  # tool1 and tool2 (setup is excluded as lifecycle tool)
+
+        # Check tool map was populated (includes all tools)
+        tool_map = agent.get_tool_map()
+        assert len(tool_map) == 3
+        assert "tool1" in tool_map
+        assert "tool2" in tool_map
+        assert "setup" in tool_map
+
+    @pytest.mark.asyncio
+    async def test_initialize_with_filtering(self):
+        """Test initialize with tool filtering."""
+        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
+
+        # Create proper async mock for session
+        mock_session = MagicMock()
+
+        # Set up the connector and client_session structure
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+                    types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
+                    types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Check filtering worked - get_available_tools excludes lifecycle tools
+        tools = agent.get_available_tools()
+        tool_names = [t.name for t in tools]
+        assert len(tools) == 1  # Only tool1 (setup is excluded as lifecycle tool)
+        assert "tool1" in tool_names
+        assert "setup" not in tool_names  # Lifecycle tool excluded from available tools
+        assert "tool2" not in tool_names  # Not in allowed list
+        assert "tool3" not in tool_names  # In disallowed list
+
+    @pytest.mark.asyncio
+    async def test_call_tool_success(self):
+        """Test successful tool call."""
+        agent = MockMCPAgent()
+
+        # Initialize with a tool
+        mock_session = MagicMock()
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        # Mock the call_tool method on the client session
+        mock_result = types.CallToolResult(
+            content=[types.TextContent(type="text", text="Tool result")], isError=False
+        )
+
+        async def mock_call_tool(name, args):
+            return mock_result
+
+        mock_session.connector.client_session.call_tool = mock_call_tool
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        agent.client.get_session = MagicMock(return_value=mock_session)
+
+        await agent.initialize()
+
+        # Call the tool
+        result = await agent.call_tool({"name": "test_tool", "arguments": {"param": "value"}})
+
+        assert result == mock_result
+        assert not result.isError
+
+    @pytest.mark.asyncio
+    async def test_call_tool_not_found(self):
+        """Test calling non-existent tool."""
+        agent = MockMCPAgent()
+
+        # Initialize without tools
+        mock_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(tools=[])
+
+        mock_session.list_tools = mock_list_tools
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Try to call unknown tool
+        with pytest.raises(ValueError, match="Tool 'unknown_tool' not found"):
+            await agent.call_tool({"name": "unknown_tool", "arguments": {}})
+
+    @pytest.mark.asyncio
+    async def test_call_tool_no_name(self):
+        """Test calling tool without name."""
+        agent = MockMCPAgent()
+
+        with pytest.raises(ValueError, match="Tool call must have a 'name' field"):
+            await agent.call_tool({"arguments": {}})
+
+    def test_get_system_prompt_default(self):
+        """Test get_system_prompt with default settings."""
+        agent = MockMCPAgent()
+
+        # Add some tools
+        agent._available_tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+        ]
+
+        prompt = agent.get_system_prompt()
+
+        # Should include ALL tool descriptions (including lifecycle tools)
+        assert "tool1" in prompt
+        assert "Tool 1" in prompt
+        assert "setup" in prompt
+        assert "Setup" in prompt
+
+    def test_get_system_prompt_custom(self):
+        """Test get_system_prompt with custom prompt."""
+        agent = MockMCPAgent(
+            custom_system_prompt="My custom prompt", append_tool_system_prompt=False
+        )
+
+        prompt = agent.get_system_prompt()
+        assert prompt == "My custom prompt"
+
+    def test_has_computer_tools(self):
+        """Test checking for computer tools."""
+        agent = MockMCPAgent()
+
+        # No tools
+        assert not agent.has_computer_tools()
+
+        # With computer tool
+        agent._available_tools = [
+            types.Tool(name="computer", description="Computer", inputSchema={"type": "object"})
+        ]
+        assert agent.has_computer_tools()
+
+        # With screenshot tool
+        agent._available_tools = [
+            types.Tool(name="screenshot", description="Screenshot", inputSchema={"type": "object"})
+        ]
+        assert agent.has_computer_tools()
+
+    def test_get_tool_schemas(self):
+        """Test getting tool schemas."""
+        agent = MockMCPAgent()
+
+        agent._available_tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+        ]
+
+        schemas = agent.get_tool_schemas()
+
+        # Should include non-lifecycle tools
+        assert len(schemas) == 1
+        assert schemas[0]["name"] == "tool1"
+
+    @pytest.mark.asyncio
+    async def test_capture_screenshot_no_tool(self):
+        """Test screenshot capture without screenshot tool."""
+        agent = MockMCPAgent()
+
+        screenshot = await agent.capture_screenshot()
+        assert screenshot is None
+
+    @pytest.mark.asyncio
+    async def test_capture_screenshot_with_tool(self):
+        """Test screenshot capture with screenshot tool."""
+        agent = MockMCPAgent()
+
+        # Set up screenshot tool
+        mock_session = MagicMock()
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(
+                        name="screenshot", description="Screenshot", inputSchema={"type": "object"}
+                    )
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        # Mock screenshot result
+        mock_result = types.CallToolResult(
+            content=[
+                types.ImageContent(type="image", data="base64imagedata", mimeType="image/png")
+            ],
+            isError=False,
+        )
+
+        async def mock_call_tool(name, args):
+            return mock_result
+
+        mock_session.connector.client_session.call_tool = mock_call_tool
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        agent.client.get_session = MagicMock(return_value=mock_session)
+
+        await agent.initialize()
+
+        screenshot = await agent.capture_screenshot()
+        assert screenshot == "base64imagedata"
+
+    def test_process_tool_results_extracts_text(self):
+        """Test processing tool results extracts text content."""
+        agent = MockMCPAgent()
+
+        # Create a proper CallToolResult object
+        result = types.CallToolResult(
+            content=[
+                types.TextContent(type="text", text="Result text"),
+                types.ImageContent(type="image", data="imagedata", mimeType="image/png"),
+            ],
+            isError=False,
+        )
+
+        tool_results = [{"tool_name": "test_tool", "result": result}]
+
+        processed = agent.process_tool_results(tool_results)
+
+        assert "text" in processed
+        assert "Result text" in processed["text"]
+        assert "results" in processed
+        assert len(processed["results"]) == 1
+
+    def test_get_tools_by_server(self):
+        """Test getting tools grouped by server."""
+        agent = MockMCPAgent()
+
+        # Set up tools from different servers
+        tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
+        tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})
+
+        agent._available_tools = [tool1, tool2]
+        agent._tool_map = {
+            "tool1": ("server1", tool1),
+            "tool2": ("server2", tool2),
+        }
+
+        tools_by_server = agent.get_tools_by_server()
+
+        assert len(tools_by_server) == 2
+        assert "server1" in tools_by_server
+        assert "server2" in tools_by_server
+        assert tools_by_server["server1"] == [tool1]
+        assert tools_by_server["server2"] == [tool2]
+
+    @pytest.mark.asyncio
+    async def test_executor_integration(self):
+        """Test integration with BaseExecutor for simulated actions."""
+        agent = MockMCPAgent()
+
+        # Test various executor actions
+        click_result = await agent.executor.click(100, 200, take_screenshot=False)
+        assert click_result.output is not None
+        assert "[SIMULATED] Click at (100, 200)" in click_result.output
+
+        type_result = await agent.executor.type("Test input", take_screenshot=False)
+        assert type_result.output is not None
+        assert "[SIMULATED] Type 'Test input'" in type_result.output
+
+        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
+        assert scroll_result.output is not None
+        assert "[SIMULATED] Scroll" in scroll_result.output
+
+        # Test screenshot
+        screenshot = await agent.executor.screenshot()
+        assert isinstance(screenshot, str)
+        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
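
The tests above pin down the simulated contract of BaseExecutor: each action returns a result whose output field carries a "[SIMULATED] ..." message, and screenshot() returns a base64-encoded PNG string. A hedged sketch of that contract (argument names are taken from the test calls; the exact result type is not shown in this diff):

import asyncio

from hud.tools.executors.base import BaseExecutor

async def main() -> None:
    executor = BaseExecutor()
    # Per the tests, these calls return objects with an `output` string.
    click = await executor.click(100, 200, take_screenshot=False)
    print(click.output)  # e.g. "[SIMULATED] Click at (100, 200)"
    typed = await executor.type("Test input", take_screenshot=False)
    print(typed.output)
    # screenshot() returns a base64 PNG string (starts with "iVBORw0KGgo").
    png_b64 = await executor.screenshot()
    print(png_b64[:12])

asyncio.run(main())
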
hud/settings.py CHANGED
@@ -41,13 +41,25 @@ class Settings(BaseSettings):
     telemetry_enabled: bool = Field(
         default=True,
         description="Enable telemetry for the HUD SDK",
-        validation_alias="TELEMETRY_ENABLED",
+        validation_alias="HUD_TELEMETRY_ENABLED",
     )
 
     fancy_logging: bool = Field(
         default=True,
         description="Enable fancy logging for the HUD SDK",
-        validation_alias="FANCY_LOGGING",
+        validation_alias="HUD_FANCY_LOGGING",
+    )
+
+    log_stream: str = Field(
+        default="stdout",
+        description="Stream to use for logging output: 'stdout' or 'stderr'",
+        validation_alias="HUD_LOG_STREAM",
+    )
+
+    display: str = Field(
+        default=":0",
+        description="Display to use for the HUD SDK",
+        validation_alias="HUD_DISPLAY",
     )
 
 
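
All settings aliases now carry the HUD_ prefix (the old TELEMETRY_ENABLED and FANCY_LOGGING names are replaced), and two new fields, log_stream and display, are added. A minimal sketch of configuring them through the environment, assuming the usual pydantic-settings behaviour of reading validation_alias names from environment variables (implied by the Field definitions, not shown directly in this diff):

import os

# New alias names from this diff; the values are illustrative.
os.environ["HUD_TELEMETRY_ENABLED"] = "false"
os.environ["HUD_FANCY_LOGGING"] = "true"
os.environ["HUD_LOG_STREAM"] = "stderr"  # 'stdout' (default) or 'stderr'
os.environ["HUD_DISPLAY"] = ":0"

from hud.settings import Settings  # class shown in this diff

settings = Settings()
print(settings.telemetry_enabled, settings.log_stream, settings.display)
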
hud/task.py CHANGED
@@ -74,6 +74,8 @@ class Task(BaseModel):
     # Description of the task, for extra information about its purpose and context
     description: str | None = None
 
+    gold_file_url: str | None = None
+
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> Task:
         return cls(**data)
@@ -110,6 +112,7 @@ class Task(BaseModel):
             description=data.get("description"),
             sensitive_data=data.get("sensitive_data", {}),
             metadata=data.get("metadata", {}),
+            gold_file_url=data.get("gold_file_url"),
         )
 
     @classmethod
@@ -221,4 +224,5 @@ class Task(BaseModel):
             "gym": parsed_gym,
             "sensitive_data": self.sensitive_data,
             "metadata": self.metadata,
+            "gold_file_url": self.gold_file_url,
         }
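
Task gains an optional gold_file_url field that round-trips through from_dict and the dict serialization shown above. A small sketch using only fields visible in this diff (other Task fields, and whether any are required, are not shown here, so treat the payload as hypothetical):

from hud.task import Task

# Hypothetical payload; only `description` and `gold_file_url` appear in this diff.
task = Task.from_dict(
    {
        "description": "Reconcile the spreadsheet against the gold file",
        "gold_file_url": "https://example.com/gold.xlsx",
    }
)
print(task.gold_file_url)
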
hud/telemetry/__init__.py CHANGED
@@ -1,21 +1,25 @@
 """
-HUD telemetry module for capturing and reporting telemetry data from MCP calls.
+HUD Telemetry module.
 
-This module provides functionality to trace MCP calls and export telemetry data
-to the HUD platform for analysis.
+Provides context managers and utilities for capturing MCP telemetry data.
 """
 
 from __future__ import annotations
 
-from hud.telemetry._trace import init_telemetry, register_trace, trace
-from hud.telemetry.context import get_current_task_run_id, set_current_task_run_id
+# Main trace functions
+from hud.telemetry._trace import init_telemetry, trace, trace_open
+from hud.telemetry.context import flush_buffer, get_current_task_run_id
 from hud.telemetry.exporter import flush
 
 __all__ = [
+    # Management
     "flush",
+    "flush_buffer",
+    # Context management
     "get_current_task_run_id",
+    # Management
    "init_telemetry",
-    "register_trace",
-    "set_current_task_run_id",
+    # Trace functions
     "trace",
+    "trace_open",
 ]
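
The public telemetry surface is reshaped: register_trace and set_current_task_run_id are removed, while trace, trace_open, and flush_buffer are added. The new module docstring describes these as context managers, so usage presumably looks roughly like the following sketch; the actual signatures of trace and trace_open are not part of this diff and the calls below are assumptions:

import hud.telemetry as telemetry

telemetry.init_telemetry()

# Assuming `trace` is usable as a context manager, per the new module docstring.
with telemetry.trace():
    run_id = telemetry.get_current_task_run_id()
    print("current task run:", run_id)

telemetry.flush()         # export any pending telemetry
telemetry.flush_buffer()  # name from __all__; exact behaviour assumed
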