hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (130) hide show
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -1,742 +1,742 @@
1
- """Tests for BaseMCPAgent using simulated actions."""
2
-
3
- from __future__ import annotations
4
-
5
- from typing import Any, ClassVar
6
- from unittest.mock import MagicMock
7
-
8
- # Import AsyncMock from unittest.mock if available (Python 3.8+)
9
- try:
10
- from unittest.mock import AsyncMock
11
- except ImportError:
12
- # Fallback for older Python versions
13
- from unittest.mock import MagicMock as AsyncMock
14
-
15
- import pytest
16
- from mcp import types
17
-
18
- from hud.agents import MCPAgent
19
- from hud.datasets import Task
20
- from hud.tools.executors.base import BaseExecutor
21
- from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
22
-
23
-
24
- class MockMCPAgent(MCPAgent):
25
- """Concrete implementation of BaseMCPAgent for testing."""
26
-
27
- metadata: ClassVar[dict[str, Any]] = {} # Optional metadata for MCP config
28
-
29
- def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
30
- if mcp_client is None:
31
- # Create a mock client if none provided
32
- mcp_client = MagicMock()
33
- mcp_client.get_available_tools = MagicMock(return_value=[])
34
- mcp_client.initialize = AsyncMock()
35
- mcp_client.list_tools = AsyncMock(return_value=[])
36
- mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
37
- super().__init__(mcp_client=mcp_client, **kwargs)
38
- self.executor = BaseExecutor() # Use simulated executor
39
- self._messages = []
40
-
41
- async def run(self, task: Task) -> list[dict[str, Any]]:
42
- """Mock run method."""
43
- return self._messages
44
-
45
- async def create_initial_messages(
46
- self, prompt: str, initial_screenshot: bool = False
47
- ) -> list[dict[str, Any]]:
48
- """Mock create initial messages."""
49
- messages = [{"role": "user", "content": prompt}]
50
- if initial_screenshot:
51
- messages.append({"role": "assistant", "content": "Screenshot: mock_screenshot"})
52
- return messages
53
-
54
- async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
55
- """Mock get response."""
56
- return AgentResponse(content="Mock response", tool_calls=[], done=True)
57
-
58
- async def format_tool_results(
59
- self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
60
- ) -> list[dict[str, Any]]:
61
- """Mock format tool results."""
62
- formatted = []
63
- for tool_call, result in zip(tool_calls, tool_results):
64
- formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
65
- return formatted
66
-
67
- async def create_user_message(self, text: str) -> Any:
68
- """Mock create user message."""
69
- return {"role": "user", "content": text}
70
-
71
- async def get_system_messages(self) -> list[Any]:
72
- """Mock get system messages."""
73
- return []
74
-
75
- async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
76
- """Mock format blocks."""
77
- formatted = []
78
- for block in blocks:
79
- if isinstance(block, types.TextContent):
80
- formatted.append({"type": "text", "text": block.text})
81
- elif isinstance(block, types.ImageContent):
82
- formatted.append({"type": "image", "data": block.data})
83
- elif hasattr(block, "type"):
84
- formatted.append({"type": getattr(block, "type", "unknown")})
85
- return formatted
86
-
87
-
88
- class TestBaseMCPAgent:
89
- """Tests for BaseMCPAgent with simulated actions."""
90
-
91
- def test_init_defaults(self):
92
- """Test initialization with default values."""
93
- agent = MockMCPAgent()
94
-
95
- assert agent.mcp_client is not None
96
- assert agent.allowed_tools is None
97
- assert agent.disallowed_tools == []
98
- assert agent.initial_screenshot is True
99
- assert agent.system_prompt is not None # Default system prompt is set
100
- assert agent.lifecycle_tools == []
101
-
102
- def test_init_with_params(self):
103
- """Test initialization with custom parameters."""
104
- client = MagicMock()
105
- agent = MockMCPAgent(
106
- mcp_client=client,
107
- allowed_tools=["tool1", "tool2"],
108
- disallowed_tools=["bad_tool"],
109
- initial_screenshot=True,
110
- system_prompt="Custom prompt",
111
- lifecycle_tools=["custom_setup", "custom_eval"],
112
- )
113
-
114
- assert agent.mcp_client == client
115
- assert agent.allowed_tools == ["tool1", "tool2"]
116
- assert agent.disallowed_tools == ["bad_tool"]
117
- assert agent.initial_screenshot is True
118
- assert agent.system_prompt == "Custom prompt"
119
- assert agent.lifecycle_tools == ["custom_setup", "custom_eval"]
120
-
121
- @pytest.mark.asyncio
122
- async def test_init_no_client_no_task(self):
123
- """Test initialize fails without client and without task."""
124
-
125
- # Create a minimal concrete implementation to test the ValueError
126
- class TestAgent(MCPAgent):
127
- async def create_initial_messages(
128
- self, prompt: str, initial_screenshot: bool = False
129
- ) -> list[dict[str, Any]]:
130
- return []
131
-
132
- async def format_tool_results(
133
- self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
134
- ) -> list[dict[str, Any]]:
135
- return []
136
-
137
- async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
138
- return {"content": "test"}
139
-
140
- async def get_system_messages(self) -> list[Any]:
141
- return []
142
-
143
- async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
144
- return []
145
-
146
- # Agent can be created with None client
147
- agent = TestAgent(mcp_client=None)
148
-
149
- # But initialize should fail without client or task
150
- with pytest.raises(ValueError, match="No MCPClient"):
151
- await agent.initialize()
152
-
153
- @pytest.mark.asyncio
154
- async def test_initialize_with_sessions(self):
155
- """Test initialize with existing sessions."""
156
- agent = MockMCPAgent()
157
-
158
- # Create proper async mock for session
159
- mock_session = MagicMock()
160
-
161
- # Set up the connector and client_session structure
162
- mock_session.connector = MagicMock()
163
- mock_session.connector.client_session = MagicMock()
164
-
165
- # Mock list_tools on the client_session
166
- async def mock_list_tools():
167
- return types.ListToolsResult(
168
- tools=[
169
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
170
- types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
171
- types.Tool(
172
- name="setup", description="Setup tool", inputSchema={"type": "object"}
173
- ),
174
- ]
175
- )
176
-
177
- mock_session.connector.client_session.list_tools = mock_list_tools
178
-
179
- assert agent.mcp_client is not None
180
-
181
- # Mock the list_tools method on mcp_client to return the tools
182
- agent.mcp_client.list_tools = AsyncMock(
183
- return_value=[
184
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
185
- types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
186
- types.Tool(name="setup", description="Setup tool", inputSchema={"type": "object"}),
187
- ]
188
- )
189
-
190
- await agent.initialize()
191
-
192
- # Check available tools were populated (excludes lifecycle tools)
193
- tools = agent.get_available_tools()
194
- assert len(tools) == 3 # All tools (setup is not in default lifecycle tools)
195
-
196
- # Ensure names exist in available tools
197
- names = {t.name for t in tools}
198
- assert {"tool1", "tool2", "setup"} <= names
199
-
200
- @pytest.mark.asyncio
201
- async def test_initialize_with_filtering(self):
202
- """Test initialize with tool filtering."""
203
- agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
204
-
205
- # Create proper async mock for session
206
- mock_session = MagicMock()
207
-
208
- # Set up the connector and client_session structure
209
- mock_session.connector = MagicMock()
210
- mock_session.connector.client_session = MagicMock()
211
-
212
- async def mock_list_tools():
213
- return types.ListToolsResult(
214
- tools=[
215
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
216
- types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
217
- types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
218
- types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
219
- ]
220
- )
221
-
222
- mock_session.connector.client_session.list_tools = mock_list_tools
223
-
224
- assert agent.mcp_client is not None
225
-
226
- # Mock the list_tools method on mcp_client to return the tools
227
- agent.mcp_client.list_tools = AsyncMock(
228
- return_value=[
229
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
230
- types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
231
- types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
232
- types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
233
- ]
234
- )
235
-
236
- await agent.initialize()
237
-
238
- # Check filtering worked - get_available_tools excludes lifecycle tools
239
- tools = agent.get_available_tools()
240
- tool_names = [t.name for t in tools]
241
- assert len(tools) == 1 # Only tool1 (tool2 and tool3 are filtered out)
242
- assert "tool1" in tool_names
243
- assert "setup" not in tool_names # Lifecycle tool excluded from available tools
244
- assert "tool2" not in tool_names # Not in allowed list
245
- assert "tool3" not in tool_names # In disallowed list
246
-
247
- @pytest.mark.asyncio
248
- async def test_call_tool_success(self):
249
- """Test successful tool call."""
250
- agent = MockMCPAgent()
251
-
252
- # Initialize with a tool
253
- mock_session = MagicMock()
254
- mock_session.connector = MagicMock()
255
- mock_session.connector.client_session = MagicMock()
256
-
257
- async def mock_list_tools():
258
- return types.ListToolsResult(
259
- tools=[
260
- types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
261
- ]
262
- )
263
-
264
- mock_session.connector.client_session.list_tools = mock_list_tools
265
-
266
- # Mock the call_tool method on the client session
267
- mock_result = types.CallToolResult(
268
- content=[types.TextContent(type="text", text="Tool result")], isError=False
269
- )
270
-
271
- async def mock_call_tool(name, args):
272
- return mock_result
273
-
274
- mock_session.connector.client_session.call_tool = mock_call_tool
275
-
276
- assert agent.mcp_client is not None
277
-
278
- # Mock the client's call_tool method directly
279
- agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
280
-
281
- # Mock the list_tools method to return the test tool
282
- agent.mcp_client.list_tools = AsyncMock(
283
- return_value=[
284
- types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
285
- ]
286
- )
287
-
288
- await agent.initialize()
289
-
290
- # Call the tool
291
- tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
292
- results = await agent.call_tools(tool_call)
293
-
294
- assert len(results) == 1
295
- assert results[0] == mock_result
296
- assert not results[0].isError
297
-
298
- @pytest.mark.asyncio
299
- async def test_call_tool_not_found(self):
300
- """Test calling non-existent tool."""
301
- agent = MockMCPAgent()
302
-
303
- # Initialize without tools
304
- mock_session = MagicMock()
305
-
306
- async def mock_list_tools():
307
- return types.ListToolsResult(tools=[])
308
-
309
- mock_session.list_tools = mock_list_tools
310
- assert agent.mcp_client is not None
311
-
312
- await agent.initialize()
313
-
314
- # Try to call unknown tool - call_tools doesn't raise for unknown tools
315
- tool_call = MCPToolCall(name="unknown_tool", arguments={})
316
- await agent.call_tools(tool_call)
317
-
318
- @pytest.mark.asyncio
319
- async def test_call_tool_no_name(self):
320
- """Test calling tool without name."""
321
- # MCPToolCall accepts empty names
322
- agent = MockMCPAgent()
323
- tool_call = MCPToolCall(name="", arguments={})
324
-
325
- # call_tools doesn't validate empty names, it will return error
326
- await agent.call_tools(tool_call)
327
-
328
- def test_get_tool_schemas(self):
329
- """Test getting tool schemas."""
330
- agent = MockMCPAgent()
331
-
332
- # Add setup to lifecycle tools to test filtering
333
- agent.lifecycle_tools = ["setup"]
334
-
335
- agent._available_tools = [
336
- types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
337
- types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
338
- ]
339
-
340
- schemas = agent.get_tool_schemas()
341
-
342
- # Should include non-lifecycle tools
343
- assert len(schemas) == 1
344
- assert schemas[0]["name"] == "tool1"
345
-
346
- def test_get_tools_by_server(self):
347
- """Test getting tools grouped by server."""
348
- agent = MockMCPAgent()
349
-
350
- # Set up tools from different servers
351
- tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
352
- tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})
353
-
354
- agent._available_tools = [tool1, tool2]
355
- tools = agent.get_available_tools()
356
- assert {t.name for t in tools} == {"tool1", "tool2"}
357
-
358
- @pytest.mark.asyncio
359
- async def test_executor_integration(self):
360
- """Test integration with BaseExecutor for simulated actions."""
361
- agent = MockMCPAgent()
362
-
363
- # Test various executor actions
364
- click_result = await agent.executor.click(100, 200, take_screenshot=False)
365
- assert click_result.output is not None
366
- assert "[SIMULATED] Click at (100, 200)" in click_result.output
367
-
368
- type_result = await agent.executor.write("Test input", take_screenshot=False)
369
- assert type_result.output is not None
370
- assert "[SIMULATED] Type 'Test input'" in type_result.output
371
-
372
- scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
373
- assert scroll_result.output is not None
374
- assert "[SIMULATED] Scroll" in scroll_result.output
375
-
376
- # Test screenshot
377
- screenshot = await agent.executor.screenshot()
378
- assert isinstance(screenshot, str)
379
- assert screenshot.startswith("iVBORw0KGgo") # PNG header
380
-
381
-
382
- class MockAgentExtended(MCPAgent):
383
- """Mock agent for testing with predefined responses."""
384
-
385
- metadata: ClassVar[dict[str, Any]] = {} # Optional metadata for MCP config
386
-
387
- def __init__(self, responses=None, **kwargs):
388
- super().__init__(**kwargs)
389
- self.responses = responses or []
390
- self.call_count = 0
391
-
392
- async def create_initial_messages(
393
- self, prompt: str, initial_screenshot: bool = False
394
- ) -> list[dict[str, Any]]:
395
- """Create initial messages."""
396
- messages = [{"role": "user", "content": prompt}]
397
- if initial_screenshot:
398
- # capture_screenshot doesn't exist, just mock it
399
- screenshot = "mock_screenshot_data"
400
- messages.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
401
- return messages
402
-
403
- async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
404
- """Return predefined responses - must be async."""
405
- if self.call_count < len(self.responses):
406
- response_dict = self.responses[self.call_count]
407
- self.call_count += 1
408
- # Convert dict to AgentResponse
409
- return AgentResponse(
410
- content=response_dict.get("content", ""),
411
- tool_calls=response_dict.get("tool_calls", []),
412
- done=response_dict.get("done", not bool(response_dict.get("tool_calls"))),
413
- )
414
- return AgentResponse(content="Done", tool_calls=[], done=True)
415
-
416
- async def format_tool_results(
417
- self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
418
- ) -> list[dict[str, Any]]:
419
- """Format tool results."""
420
- formatted = []
421
- for tool_call, result in zip(tool_calls, tool_results):
422
- formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
423
- return formatted
424
-
425
- async def create_user_message(self, text: str) -> Any:
426
- """Create user message."""
427
- return {"role": "user", "content": text}
428
-
429
- async def get_system_messages(self) -> list[Any]:
430
- """Mock get system messages."""
431
- return []
432
-
433
- async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
434
- """Mock format blocks."""
435
- formatted = []
436
- for block in blocks:
437
- if isinstance(block, types.TextContent):
438
- formatted.append({"type": "text", "text": block.text})
439
- elif isinstance(block, types.ImageContent):
440
- formatted.append({"type": "image", "data": block.data})
441
- elif hasattr(block, "type"):
442
- formatted.append({"type": getattr(block, "type", "unknown")})
443
- return formatted
444
-
445
-
446
- class TestMCPAgentExtended:
447
- """Extended tests for MCPAgent."""
448
-
449
- @pytest.fixture
450
- def mock_client(self):
451
- """Create a mock MCP client."""
452
- client = MagicMock()
453
- client.get_all_active_sessions = MagicMock(return_value={})
454
- client.initialize = AsyncMock()
455
- client.list_tools = AsyncMock(return_value=[])
456
- client.call_tool = AsyncMock(
457
- return_value=types.CallToolResult(
458
- content=[types.TextContent(type="text", text="Success")],
459
- isError=False,
460
- )
461
- )
462
- return client
463
-
464
- @pytest.fixture
465
- def agent_with_tools(self, mock_client):
466
- """Create agent with mock tools."""
467
- mock_client.list_tools = AsyncMock(
468
- return_value=[
469
- types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
470
- types.Tool(name="click", description="Click at coordinates", inputSchema={}),
471
- types.Tool(name="type", description="Type text", inputSchema={}),
472
- types.Tool(name="bad_tool", description="A tool that fails", inputSchema={}),
473
- ]
474
- )
475
- return MockAgentExtended(mcp_client=mock_client)
476
-
477
- @pytest.mark.asyncio
478
- async def test_run_with_task_object(self, agent_with_tools):
479
- """Test running agent with Task object."""
480
- from hud.types import MCPToolResult
481
-
482
- task = Task(
483
- id="test_task",
484
- prompt="Click the button",
485
- mcp_config={"test_server": {"url": "http://localhost:8080"}},
486
- setup_tool={"name": "navigate", "arguments": {"url": "https://example.com"}}, # type: ignore[arg-type]
487
- evaluate_tool={"name": "check_result", "arguments": {}}, # type: ignore[arg-type]
488
- )
489
-
490
- # Set up responses
491
- agent_with_tools.responses = [
492
- {
493
- "role": "assistant",
494
- "content": "I'll click the button",
495
- "tool_calls": [MCPToolCall(name="click", arguments={"x": 100, "y": 200})],
496
- }
497
- ]
498
-
499
- # Mock the evaluation to return a reward
500
- agent_with_tools.mcp_client.call_tool = AsyncMock(
501
- side_effect=[
502
- # Setup tool
503
- MCPToolResult(
504
- content=[types.TextContent(type="text", text="Navigated")],
505
- isError=False,
506
- ),
507
- # Click tool
508
- MCPToolResult(
509
- content=[types.TextContent(type="text", text="Clicked")],
510
- isError=False,
511
- ),
512
- # Evaluate tool with reward
513
- MCPToolResult(
514
- content=[types.TextContent(type="text", text="Success")],
515
- isError=False,
516
- structuredContent={"reward": 1.0},
517
- ),
518
- ]
519
- )
520
-
521
- result = await agent_with_tools.run(task)
522
-
523
- assert isinstance(result, Trace)
524
- assert result.reward == 1.0
525
- assert not result.isError
526
- assert result.done
527
-
528
- @pytest.mark.asyncio
529
- async def test_run_with_setup_error(self, agent_with_tools):
530
- """Test task execution with setup phase error."""
531
- from hud.types import MCPToolResult
532
-
533
- task = Task(
534
- id="test_task",
535
- prompt="Do something",
536
- mcp_config={"test_server": {"url": "http://localhost:8080"}},
537
- setup_tool={"name": "bad_setup", "arguments": {}}, # type: ignore[arg-type]
538
- )
539
-
540
- # Mock setup tool to fail
541
- agent_with_tools.mcp_client.call_tool = AsyncMock(
542
- return_value=MCPToolResult(
543
- content=[types.TextContent(type="text", text="Setup failed")],
544
- isError=True,
545
- )
546
- )
547
-
548
- result = await agent_with_tools.run(task)
549
-
550
- assert isinstance(result, Trace)
551
- assert result.isError
552
- # Error content is the string representation of the MCPToolResult list
553
- assert result.content is not None
554
- assert "Setup failed" in result.content
555
- assert "MCPToolResult" in result.content
556
-
557
- @pytest.mark.asyncio
558
- async def test_run_with_multiple_setup_tools(self, agent_with_tools):
559
- """Test task with multiple setup tools."""
560
-
561
- task = Task(
562
- id="test_task",
563
- prompt="Test multiple setup",
564
- mcp_config={"test_server": {"url": "http://localhost:8080"}},
565
- setup_tool=[
566
- MCPToolCall(name="setup1", arguments={}),
567
- MCPToolCall(name="setup2", arguments={}),
568
- ],
569
- )
570
-
571
- agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
572
-
573
- setup_calls = []
574
- agent_with_tools.mcp_client.call_tool = AsyncMock(
575
- side_effect=lambda tool_call: setup_calls.append(tool_call)
576
- or MCPToolResult(
577
- content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
578
- isError=False,
579
- )
580
- )
581
-
582
- result = await agent_with_tools.run(task)
583
-
584
- # Check that the tool names match
585
- setup_names = [call.name for call in setup_calls]
586
- assert "setup1" in setup_names
587
- assert "setup2" in setup_names
588
- assert not result.isError
589
-
590
- @pytest.mark.asyncio
591
- async def test_allowed_tools_filtering(self, mock_client):
592
- """Test that allowed_tools filters available tools."""
593
- mock_client.list_tools = AsyncMock(
594
- return_value=[
595
- types.Tool(name="tool1", description="Tool 1", inputSchema={}),
596
- types.Tool(name="tool2", description="Tool 2", inputSchema={}),
597
- types.Tool(name="tool3", description="Tool 3", inputSchema={}),
598
- ]
599
- )
600
-
601
- agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
602
- await agent.initialize("test")
603
-
604
- available_names = [tool.name for tool in agent._available_tools]
605
- assert "tool1" in available_names
606
- assert "tool3" in available_names
607
- assert "tool2" not in available_names
608
-
609
- @pytest.mark.asyncio
610
- async def test_disallowed_tools_filtering(self, mock_client):
611
- """Test that disallowed_tools filters available tools."""
612
- mock_client.list_tools = AsyncMock(
613
- return_value=[
614
- types.Tool(name="tool1", description="Tool 1", inputSchema={}),
615
- types.Tool(name="tool2", description="Tool 2", inputSchema={}),
616
- types.Tool(name="tool3", description="Tool 3", inputSchema={}),
617
- ]
618
- )
619
-
620
- agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
621
- await agent.initialize("test")
622
-
623
- available_names = [tool.name for tool in agent._available_tools]
624
- assert "tool1" in available_names
625
- assert "tool3" in available_names
626
- assert "tool2" not in available_names
627
-
628
- @pytest.mark.asyncio
629
- async def test_lifecycle_tools(self, mock_client):
630
- """Test lifecycle tools are called in run_prompt."""
631
- # Lifecycle tools are specified by name, not as objects
632
- agent = MockAgentExtended(
633
- mcp_client=mock_client,
634
- lifecycle_tools=["screenshot"], # Use tool name
635
- responses=[{"role": "assistant", "content": "Done", "tool_calls": []}],
636
- )
637
-
638
- # Add screenshot tool to available tools
639
- mock_client.list_tools = AsyncMock(
640
- return_value=[
641
- types.Tool(name="screenshot", description="Take screenshot", inputSchema={})
642
- ]
643
- )
644
-
645
- # Initialize to make tools available
646
- await agent.initialize()
647
-
648
- result = await agent.run("Test lifecycle", max_steps=1)
649
- assert not result.isError
650
-
651
- # This test is commented out as screenshot history management may have changed
652
- # @pytest.mark.asyncio
653
- # async def test_screenshot_history_management(self, agent_with_tools):
654
- # """Test screenshot history is maintained."""
655
- # agent_with_tools.initial_screenshot = True
656
-
657
- # # Set up responses with tool calls
658
- # agent_with_tools.responses = [
659
- # {
660
- # "role": "assistant",
661
- # "content": "Action 1",
662
- # "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
663
- # },
664
- # {
665
- # "role": "assistant",
666
- # "content": "Action 2",
667
- # "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
668
- # },
669
- # {
670
- # "role": "assistant",
671
- # "content": "Action 3",
672
- # "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
673
- # },
674
- # ]
675
-
676
- # await agent_with_tools.run("Test screenshots", max_steps=3)
677
-
678
- # # Should have screenshots in history
679
- # assert len(agent_with_tools.screenshot_history) > 0
680
-
681
- @pytest.mark.asyncio
682
- async def test_run_with_invalid_prompt_type(self, agent_with_tools):
683
- """Test run with invalid prompt type raises TypeError."""
684
- with pytest.raises(TypeError, match="prompt_or_task must be str or Task"):
685
- await agent_with_tools.run(123) # Invalid type
686
-
687
@pytest.mark.asyncio
async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
    """Run a task with two evaluation tools and check calls and reward."""
    from hud.types import MCPToolResult

    evaluators = [
        MCPToolCall(name="eval1", arguments={}),
        MCPToolCall(name="eval2", arguments={"reward": True}),
    ]
    task = Task(
        id="test_task",
        prompt="Test evaluation",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        evaluate_tool=evaluators,
    )

    agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

    eval_calls = []

    def record_and_score(tool_call):
        # Remember every call, then answer with a per-tool reward.
        eval_calls.append(tool_call)
        payload = {"reward": 0.5} if tool_call.name == "eval1" else {"reward": 1.0}
        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
            isError=False,
            structuredContent=payload,
        )

    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=record_and_score)

    result = await agent_with_tools.run(task)

    # Check that the tool names match
    seen = [call.name for call in eval_calls]
    assert "eval1" in seen
    assert "eval2" in seen
    assert result.reward == 0.5  # From eval1 (first evaluation tool)
721
-
722
@pytest.mark.asyncio
async def test_trace_population_on_error(self, agent_with_tools):
    """Test that trace is populated on task execution error.

    The client's ``call_tool`` is replaced by a mock that raises, so the
    setup tool fails with an exception; the returned result must be flagged
    as an error, carry the exception text, and be marked done.
    """
    task = Task(
        id="test_task",
        prompt="Test error",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        setup_tool={"name": "failing_setup", "arguments": {}},  # type: ignore[arg-type]
    )

    # Make setup fail with exception
    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=Exception("Setup explosion"))

    result = await agent_with_tools.run(task)

    assert result.isError
    # Guard first: a missing content should fail as an assertion, not as a
    # TypeError raised by the ``in`` checks below.
    assert result.content is not None
    # Error content is the string representation of the MCPToolResult list
    assert "Setup explosion" in result.content
    assert "MCPToolResult" in result.content
    assert result.done
1
+ """Tests for BaseMCPAgent using simulated actions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, ClassVar
6
+ from unittest.mock import MagicMock
7
+
8
+ # Import AsyncMock from unittest.mock if available (Python 3.8+)
9
+ try:
10
+ from unittest.mock import AsyncMock
11
+ except ImportError:
12
+ # Fallback for older Python versions
13
+ from unittest.mock import MagicMock as AsyncMock
14
+
15
+ import pytest
16
+ from mcp import types
17
+
18
+ from hud.agents import MCPAgent
19
+ from hud.datasets import Task
20
+ from hud.tools.executors.base import BaseExecutor
21
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
22
+
23
+
24
class MockMCPAgent(MCPAgent):
    """Minimal concrete ``MCPAgent`` used as a test double.

    Every hook returns canned data, so the agent can be built and exercised
    without a real MCP server or model backend.
    """

    # Optional metadata for MCP config
    metadata: ClassVar[dict[str, Any]] = {}

    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
        """Create the agent, substituting a stub client when none is given.

        Args:
            mcp_client: Client forwarded to ``MCPAgent``. ``None`` selects a
                ``MagicMock`` that reports no tools.
            **kwargs: Forwarded unchanged to ``MCPAgent.__init__``.
        """
        if mcp_client is None:
            # Mock constructor keywords are applied as attributes on the mock.
            mcp_client = MagicMock(
                get_available_tools=MagicMock(return_value=[]),
                initialize=AsyncMock(),
                list_tools=AsyncMock(return_value=[]),
                mcp_config={"test_server": {"url": "http://localhost"}},
            )
        super().__init__(mcp_client=mcp_client, **kwargs)
        self.executor = BaseExecutor()  # simulated executor, no real device
        self._messages = []

    async def run(self, task: Task) -> list[dict[str, Any]]:
        """Return the stored message list; ``task`` is ignored."""
        return self._messages

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return the user prompt, plus a fake screenshot message on request."""
        conversation = [{"role": "user", "content": prompt}]
        if not initial_screenshot:
            return conversation
        conversation.append({"role": "assistant", "content": "Screenshot: mock_screenshot"})
        return conversation

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return a fixed, terminal response with no tool calls."""
        return AgentResponse(content="Mock response", tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one tool message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text and image blocks keep their payload; any other block with a
        ``type`` attribute is reduced to that type; the rest are dropped.
        """
        rendered = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                entry = {"type": "text", "text": item.text}
            elif isinstance(item, types.ImageContent):
                entry = {"type": "image", "data": item.data}
            elif hasattr(item, "type"):
                entry = {"type": getattr(item, "type", "unknown")}
            else:
                continue
            rendered.append(entry)
        return rendered
86
+
87
+
88
class TestBaseMCPAgent:
    """Tests for BaseMCPAgent with simulated actions."""

    @staticmethod
    def _tool(name: str, description: str) -> types.Tool:
        """Build an MCP tool whose input schema is a bare JSON object."""
        return types.Tool(name=name, description=description, inputSchema={"type": "object"})

    def test_init_defaults(self):
        """Test initialization with default values."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None
        assert agent.allowed_tools is None
        assert agent.disallowed_tools == []
        assert agent.initial_screenshot is True
        assert agent.system_prompt is not None  # Default system prompt is set
        assert agent.lifecycle_tools == []

    def test_init_with_params(self):
        """Test initialization with custom parameters."""
        client = MagicMock()
        agent = MockMCPAgent(
            mcp_client=client,
            allowed_tools=["tool1", "tool2"],
            disallowed_tools=["bad_tool"],
            initial_screenshot=True,
            system_prompt="Custom prompt",
            lifecycle_tools=["custom_setup", "custom_eval"],
        )

        assert agent.mcp_client == client
        assert agent.allowed_tools == ["tool1", "tool2"]
        assert agent.disallowed_tools == ["bad_tool"]
        assert agent.initial_screenshot is True
        assert agent.system_prompt == "Custom prompt"
        assert agent.lifecycle_tools == ["custom_setup", "custom_eval"]

    @pytest.mark.asyncio
    async def test_init_no_client_no_task(self):
        """Test initialize fails without client and without task."""

        # Create a minimal concrete implementation to test the ValueError
        class TestAgent(MCPAgent):
            async def create_initial_messages(
                self, prompt: str, initial_screenshot: bool = False
            ) -> list[dict[str, Any]]:
                return []

            async def format_tool_results(
                self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
            ) -> list[dict[str, Any]]:
                return []

            async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
                return {"content": "test"}

            async def get_system_messages(self) -> list[Any]:
                return []

            async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
                return []

        # Agent can be created with None client
        agent = TestAgent(mcp_client=None)

        # But initialize should fail without client or task
        with pytest.raises(ValueError, match="No MCPClient"):
            await agent.initialize()

    @pytest.mark.asyncio
    async def test_initialize_with_sessions(self):
        """Test initialize populates available tools from the client's tool list."""
        agent = MockMCPAgent()
        assert agent.mcp_client is not None

        # The agent only talks to mcp_client, so stubbing list_tools is enough.
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                self._tool("tool1", "Tool 1"),
                self._tool("tool2", "Tool 2"),
                self._tool("setup", "Setup tool"),
            ]
        )

        await agent.initialize()

        # Default lifecycle_tools is empty, so "setup" is not filtered out.
        tools = agent.get_available_tools()
        assert len(tools) == 3

        # Ensure names exist in available tools
        names = {t.name for t in tools}
        assert {"tool1", "tool2", "setup"} <= names

    @pytest.mark.asyncio
    async def test_initialize_with_filtering(self):
        """Test initialize with tool filtering."""
        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
        assert agent.mcp_client is not None

        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                self._tool("tool1", "Tool 1"),
                self._tool("tool2", "Tool 2"),
                self._tool("tool3", "Tool 3"),
                self._tool("setup", "Setup"),
            ]
        )

        await agent.initialize()

        # Only the single allowed tool should survive filtering.
        tools = agent.get_available_tools()
        tool_names = [t.name for t in tools]
        assert len(tools) == 1
        assert "tool1" in tool_names
        assert "setup" not in tool_names  # Not in allowed list
        assert "tool2" not in tool_names  # Not in allowed list
        assert "tool3" not in tool_names  # In disallowed list

    @pytest.mark.asyncio
    async def test_call_tool_success(self):
        """Test successful tool call."""
        agent = MockMCPAgent()
        assert agent.mcp_client is not None

        mock_result = types.CallToolResult(
            content=[types.TextContent(type="text", text="Tool result")], isError=False
        )

        # Stub the client: one known tool, and a canned result for any call.
        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
        agent.mcp_client.list_tools = AsyncMock(return_value=[self._tool("test_tool", "Test")])

        await agent.initialize()

        # Call the tool
        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
        results = await agent.call_tools(tool_call)

        assert len(results) == 1
        assert results[0] == mock_result
        assert not results[0].isError

    @pytest.mark.asyncio
    async def test_call_tool_not_found(self):
        """Test calling non-existent tool."""
        agent = MockMCPAgent()
        assert agent.mcp_client is not None

        # The default stub client lists no tools.
        await agent.initialize()

        # Smoke test: passes as long as call_tools does not raise for an
        # unknown tool name.
        tool_call = MCPToolCall(name="unknown_tool", arguments={})
        await agent.call_tools(tool_call)

    @pytest.mark.asyncio
    async def test_call_tool_no_name(self):
        """Test calling tool without name."""
        # MCPToolCall accepts empty names
        agent = MockMCPAgent()
        tool_call = MCPToolCall(name="", arguments={})

        # Smoke test: call_tools does not validate empty names up front, so
        # the call must complete without raising.
        await agent.call_tools(tool_call)

    def test_get_tool_schemas(self):
        """Test getting tool schemas."""
        agent = MockMCPAgent()

        # Add setup to lifecycle tools to test filtering
        agent.lifecycle_tools = ["setup"]

        agent._available_tools = [
            self._tool("tool1", "Tool 1"),
            self._tool("setup", "Setup"),
        ]

        schemas = agent.get_tool_schemas()

        # Only the non-lifecycle tool should be reported.
        assert len(schemas) == 1
        assert schemas[0]["name"] == "tool1"

    def test_get_tools_by_server(self):
        """Test that get_available_tools returns every tool set on the agent."""
        agent = MockMCPAgent()

        agent._available_tools = [
            self._tool("tool1", "Tool 1"),
            self._tool("tool2", "Tool 2"),
        ]

        tools = agent.get_available_tools()
        assert {t.name for t in tools} == {"tool1", "tool2"}

    @pytest.mark.asyncio
    async def test_executor_integration(self):
        """Test integration with BaseExecutor for simulated actions."""
        agent = MockMCPAgent()

        # Test various executor actions
        click_result = await agent.executor.click(100, 200, take_screenshot=False)
        assert click_result.output is not None
        assert "[SIMULATED] Click at (100, 200)" in click_result.output

        type_result = await agent.executor.write("Test input", take_screenshot=False)
        assert type_result.output is not None
        assert "[SIMULATED] Type 'Test input'" in type_result.output

        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
        assert scroll_result.output is not None
        assert "[SIMULATED] Scroll" in scroll_result.output

        # Test screenshot
        screenshot = await agent.executor.screenshot()
        assert isinstance(screenshot, str)
        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
380
+
381
+
382
class MockAgentExtended(MCPAgent):
    """Test agent that replays a scripted list of responses.

    Each ``get_response`` call consumes the next entry of ``responses``;
    once the script is exhausted a terminal "Done" response is returned.
    """

    # Optional metadata for MCP config
    metadata: ClassVar[dict[str, Any]] = {}

    def __init__(self, responses=None, **kwargs):
        """Store the response script and forward ``kwargs`` to ``MCPAgent``."""
        super().__init__(**kwargs)
        self.responses = responses if responses else []
        self.call_count = 0

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return the user prompt, plus a fake screenshot message on request."""
        conversation = [{"role": "user", "content": prompt}]
        if not initial_screenshot:
            return conversation
        # No real capture is performed; a placeholder stands in for image data.
        placeholder = "mock_screenshot_data"
        conversation.append({"role": "assistant", "content": f"Screenshot: {placeholder}"})
        return conversation

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return the next scripted response, or a terminal one when exhausted."""
        if self.call_count >= len(self.responses):
            return AgentResponse(content="Done", tool_calls=[], done=True)

        scripted = self.responses[self.call_count]
        self.call_count += 1

        requested_calls = scripted.get("tool_calls", [])
        # Unless the script says otherwise, a response without tool calls is final.
        finished = scripted.get("done", not requested_calls)
        return AgentResponse(
            content=scripted.get("content", ""),
            tool_calls=requested_calls,
            done=finished,
        )

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one tool message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text and image blocks keep their payload; any other block with a
        ``type`` attribute is reduced to that type; the rest are dropped.
        """
        rendered = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                entry = {"type": "text", "text": item.text}
            elif isinstance(item, types.ImageContent):
                entry = {"type": "image", "data": item.data}
            elif hasattr(item, "type"):
                entry = {"type": getattr(item, "type", "unknown")}
            else:
                continue
            rendered.append(entry)
        return rendered
444
+
445
+
446
class TestMCPAgentExtended:
    """Extended tests for MCPAgent."""

    @pytest.fixture
    def mock_client(self):
        """Create a mock MCP client."""
        client = MagicMock()
        client.get_all_active_sessions = MagicMock(return_value={})
        client.initialize = AsyncMock()
        client.list_tools = AsyncMock(return_value=[])
        client.call_tool = AsyncMock(
            return_value=types.CallToolResult(
                content=[types.TextContent(type="text", text="Success")],
                isError=False,
            )
        )
        return client

    @pytest.fixture
    def agent_with_tools(self, mock_client):
        """Create agent with mock tools."""
        mock_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
                types.Tool(name="click", description="Click at coordinates", inputSchema={}),
                types.Tool(name="type", description="Type text", inputSchema={}),
                types.Tool(name="bad_tool", description="A tool that fails", inputSchema={}),
            ]
        )
        return MockAgentExtended(mcp_client=mock_client)

    @pytest.mark.asyncio
    async def test_run_with_task_object(self, agent_with_tools):
        """Test running agent with Task object."""
        task = Task(
            id="test_task",
            prompt="Click the button",
            mcp_config={"test_server": {"url": "http://localhost:8080"}},
            setup_tool={"name": "navigate", "arguments": {"url": "https://example.com"}},  # type: ignore[arg-type]
            evaluate_tool={"name": "check_result", "arguments": {}},  # type: ignore[arg-type]
        )

        # Set up responses
        agent_with_tools.responses = [
            {
                "role": "assistant",
                "content": "I'll click the button",
                "tool_calls": [MCPToolCall(name="click", arguments={"x": 100, "y": 200})],
            }
        ]

        # One canned result per expected call, in call order.
        agent_with_tools.mcp_client.call_tool = AsyncMock(
            side_effect=[
                # Setup tool
                MCPToolResult(
                    content=[types.TextContent(type="text", text="Navigated")],
                    isError=False,
                ),
                # Click tool
                MCPToolResult(
                    content=[types.TextContent(type="text", text="Clicked")],
                    isError=False,
                ),
                # Evaluate tool with reward
                MCPToolResult(
                    content=[types.TextContent(type="text", text="Success")],
                    isError=False,
                    structuredContent={"reward": 1.0},
                ),
            ]
        )

        result = await agent_with_tools.run(task)

        assert isinstance(result, Trace)
        assert result.reward == 1.0
        assert not result.isError
        assert result.done

    @pytest.mark.asyncio
    async def test_run_with_setup_error(self, agent_with_tools):
        """Test task execution with setup phase error."""
        task = Task(
            id="test_task",
            prompt="Do something",
            mcp_config={"test_server": {"url": "http://localhost:8080"}},
            setup_tool={"name": "bad_setup", "arguments": {}},  # type: ignore[arg-type]
        )

        # Mock setup tool to fail
        agent_with_tools.mcp_client.call_tool = AsyncMock(
            return_value=MCPToolResult(
                content=[types.TextContent(type="text", text="Setup failed")],
                isError=True,
            )
        )

        result = await agent_with_tools.run(task)

        assert isinstance(result, Trace)
        assert result.isError
        # Error content is the string representation of the MCPToolResult list
        assert result.content is not None
        assert "Setup failed" in result.content
        assert "MCPToolResult" in result.content

    @pytest.mark.asyncio
    async def test_run_with_multiple_setup_tools(self, agent_with_tools):
        """Test task with multiple setup tools."""
        task = Task(
            id="test_task",
            prompt="Test multiple setup",
            mcp_config={"test_server": {"url": "http://localhost:8080"}},
            setup_tool=[
                MCPToolCall(name="setup1", arguments={}),
                MCPToolCall(name="setup2", arguments={}),
            ],
        )

        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

        setup_calls = []

        def record_setup(tool_call):
            # Remember every call, then report success for it.
            setup_calls.append(tool_call)
            return MCPToolResult(
                content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
                isError=False,
            )

        agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=record_setup)

        result = await agent_with_tools.run(task)

        # Check that the tool names match
        setup_names = [call.name for call in setup_calls]
        assert "setup1" in setup_names
        assert "setup2" in setup_names
        assert not result.isError

    @pytest.mark.asyncio
    async def test_allowed_tools_filtering(self, mock_client):
        """Test that allowed_tools filters available tools."""
        mock_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
            ]
        )

        agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
        await agent.initialize("test")

        available_names = [tool.name for tool in agent._available_tools]
        assert "tool1" in available_names
        assert "tool3" in available_names
        assert "tool2" not in available_names

    @pytest.mark.asyncio
    async def test_disallowed_tools_filtering(self, mock_client):
        """Test that disallowed_tools filters available tools."""
        mock_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
            ]
        )

        agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
        await agent.initialize("test")

        available_names = [tool.name for tool in agent._available_tools]
        assert "tool1" in available_names
        assert "tool3" in available_names
        assert "tool2" not in available_names

    @pytest.mark.asyncio
    async def test_lifecycle_tools(self, mock_client):
        """Test that run succeeds when a lifecycle tool is configured."""
        # Lifecycle tools are specified by name, not as objects
        agent = MockAgentExtended(
            mcp_client=mock_client,
            lifecycle_tools=["screenshot"],  # Use tool name
            responses=[{"role": "assistant", "content": "Done", "tool_calls": []}],
        )

        # Add screenshot tool to available tools
        mock_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="screenshot", description="Take screenshot", inputSchema={})
            ]
        )

        # Initialize to make tools available
        await agent.initialize()

        result = await agent.run("Test lifecycle", max_steps=1)
        assert not result.isError

    # This test is commented out as screenshot history management may have changed
    # @pytest.mark.asyncio
    # async def test_screenshot_history_management(self, agent_with_tools):
    #     """Test screenshot history is maintained."""
    #     agent_with_tools.initial_screenshot = True

    #     # Set up responses with tool calls
    #     agent_with_tools.responses = [
    #         {
    #             "role": "assistant",
    #             "content": "Action 1",
    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
    #         },
    #         {
    #             "role": "assistant",
    #             "content": "Action 2",
    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
    #         },
    #         {
    #             "role": "assistant",
    #             "content": "Action 3",
    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
    #         },
    #     ]

    #     await agent_with_tools.run("Test screenshots", max_steps=3)

    #     # Should have screenshots in history
    #     assert len(agent_with_tools.screenshot_history) > 0

    @pytest.mark.asyncio
    async def test_run_with_invalid_prompt_type(self, agent_with_tools):
        """Test run with invalid prompt type raises TypeError."""
        with pytest.raises(TypeError, match="prompt_or_task must be str or Task"):
            await agent_with_tools.run(123)  # Invalid type

    @pytest.mark.asyncio
    async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
        """Test evaluation phase with multiple evaluation tools."""
        task = Task(
            id="test_task",
            prompt="Test evaluation",
            mcp_config={"test_server": {"url": "http://localhost:8080"}},
            evaluate_tool=[
                MCPToolCall(name="eval1", arguments={}),
                MCPToolCall(name="eval2", arguments={"reward": True}),
            ],
        )

        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

        eval_calls = []

        def record_eval(tool_call):
            # Remember every call, then answer with a per-tool reward.
            eval_calls.append(tool_call)
            reward = 0.5 if tool_call.name == "eval1" else 1.0
            return MCPToolResult(
                content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
                isError=False,
                structuredContent={"reward": reward},
            )

        agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=record_eval)

        result = await agent_with_tools.run(task)

        # Check that the tool names match
        eval_names = [call.name for call in eval_calls]
        assert "eval1" in eval_names
        assert "eval2" in eval_names
        assert result.reward == 0.5  # From eval1 (first evaluation tool)

    @pytest.mark.asyncio
    async def test_trace_population_on_error(self, agent_with_tools):
        """Test that trace is populated on task execution error."""
        task = Task(
            id="test_task",
            prompt="Test error",
            mcp_config={"test_server": {"url": "http://localhost:8080"}},
            setup_tool={"name": "failing_setup", "arguments": {}},  # type: ignore[arg-type]
        )

        # Make setup fail with exception
        agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=Exception("Setup explosion"))

        result = await agent_with_tools.run(task)

        assert result.isError
        # Guard first: a missing content should fail as an assertion, not as a
        # TypeError raised by the ``in`` checks below.
        assert result.content is not None
        # Error content is the string representation of the MCPToolResult list
        assert "Setup explosion" in result.content
        assert "MCPToolResult" in result.content
        assert result.done