hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +22 -22
- hud/agents/__init__.py +13 -15
- hud/agents/base.py +599 -599
- hud/agents/claude.py +373 -373
- hud/agents/langchain.py +261 -250
- hud/agents/misc/__init__.py +7 -7
- hud/agents/misc/response_agent.py +82 -80
- hud/agents/openai.py +352 -352
- hud/agents/openai_chat_generic.py +154 -154
- hud/agents/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -742
- hud/agents/tests/test_claude.py +324 -324
- hud/agents/tests/test_client.py +363 -363
- hud/agents/tests/test_openai.py +237 -237
- hud/cli/__init__.py +617 -617
- hud/cli/__main__.py +8 -8
- hud/cli/analyze.py +371 -371
- hud/cli/analyze_metadata.py +230 -230
- hud/cli/build.py +498 -427
- hud/cli/clone.py +185 -185
- hud/cli/cursor.py +92 -92
- hud/cli/debug.py +392 -392
- hud/cli/docker_utils.py +83 -83
- hud/cli/init.py +280 -281
- hud/cli/interactive.py +353 -353
- hud/cli/mcp_server.py +764 -756
- hud/cli/pull.py +330 -336
- hud/cli/push.py +404 -370
- hud/cli/remote_runner.py +311 -311
- hud/cli/runner.py +160 -160
- hud/cli/tests/__init__.py +3 -3
- hud/cli/tests/test_analyze.py +284 -284
- hud/cli/tests/test_cli_init.py +265 -265
- hud/cli/tests/test_cli_main.py +27 -27
- hud/cli/tests/test_clone.py +142 -142
- hud/cli/tests/test_cursor.py +253 -253
- hud/cli/tests/test_debug.py +453 -453
- hud/cli/tests/test_mcp_server.py +139 -139
- hud/cli/tests/test_utils.py +388 -388
- hud/cli/utils.py +263 -263
- hud/clients/README.md +143 -143
- hud/clients/__init__.py +16 -16
- hud/clients/base.py +378 -379
- hud/clients/fastmcp.py +222 -222
- hud/clients/mcp_use.py +298 -278
- hud/clients/tests/__init__.py +1 -1
- hud/clients/tests/test_client_integration.py +111 -111
- hud/clients/tests/test_fastmcp.py +342 -342
- hud/clients/tests/test_protocol.py +188 -188
- hud/clients/utils/__init__.py +1 -1
- hud/clients/utils/retry_transport.py +160 -160
- hud/datasets.py +327 -322
- hud/misc/__init__.py +1 -1
- hud/misc/claude_plays_pokemon.py +292 -292
- hud/otel/__init__.py +35 -35
- hud/otel/collector.py +142 -142
- hud/otel/config.py +164 -164
- hud/otel/context.py +536 -536
- hud/otel/exporters.py +366 -366
- hud/otel/instrumentation.py +97 -97
- hud/otel/processors.py +118 -118
- hud/otel/tests/__init__.py +1 -1
- hud/otel/tests/test_processors.py +197 -197
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -114
- hud/server/helper/__init__.py +5 -5
- hud/server/low_level.py +132 -132
- hud/server/server.py +170 -166
- hud/server/tests/__init__.py +3 -3
- hud/settings.py +73 -73
- hud/shared/__init__.py +5 -5
- hud/shared/exceptions.py +180 -180
- hud/shared/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -157
- hud/shared/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -25
- hud/telemetry/instrument.py +379 -379
- hud/telemetry/job.py +309 -309
- hud/telemetry/replay.py +74 -74
- hud/telemetry/trace.py +83 -83
- hud/tools/__init__.py +33 -33
- hud/tools/base.py +365 -365
- hud/tools/bash.py +161 -161
- hud/tools/computer/__init__.py +15 -15
- hud/tools/computer/anthropic.py +437 -437
- hud/tools/computer/hud.py +376 -376
- hud/tools/computer/openai.py +295 -295
- hud/tools/computer/settings.py +82 -82
- hud/tools/edit.py +314 -314
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -539
- hud/tools/executors/pyautogui.py +621 -621
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -511
- hud/tools/playwright.py +412 -412
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -282
- hud/tools/tests/test_bash.py +158 -158
- hud/tools/tests/test_bash_extended.py +197 -197
- hud/tools/tests/test_computer.py +425 -425
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -259
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -145
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -72
- hud/tools/utils.py +50 -50
- hud/types.py +136 -136
- hud/utils/__init__.py +10 -10
- hud/utils/async_utils.py +65 -65
- hud/utils/design.py +236 -168
- hud/utils/mcp.py +55 -55
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -173
- hud/utils/tests/test_init.py +17 -17
- hud/utils/tests/test_progress.py +261 -261
- hud/utils/tests/test_telemetry.py +82 -82
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
- hud_python-0.4.3.dist-info/RECORD +131 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
- hud/agents/art.py +0 -101
- hud_python-0.4.1.dist-info/RECORD +0 -132
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/agents/tests/test_base.py
CHANGED
|
@@ -1,742 +1,742 @@
|
|
|
1
|
-
"""Tests for BaseMCPAgent using simulated actions."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from typing import Any, ClassVar
|
|
6
|
-
from unittest.mock import MagicMock
|
|
7
|
-
|
|
8
|
-
# Import AsyncMock from unittest.mock if available (Python 3.8+)
|
|
9
|
-
try:
|
|
10
|
-
from unittest.mock import AsyncMock
|
|
11
|
-
except ImportError:
|
|
12
|
-
# Fallback for older Python versions
|
|
13
|
-
from unittest.mock import MagicMock as AsyncMock
|
|
14
|
-
|
|
15
|
-
import pytest
|
|
16
|
-
from mcp import types
|
|
17
|
-
|
|
18
|
-
from hud.agents import MCPAgent
|
|
19
|
-
from hud.datasets import Task
|
|
20
|
-
from hud.tools.executors.base import BaseExecutor
|
|
21
|
-
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class MockMCPAgent(MCPAgent):
    """Concrete ``MCPAgent`` subclass whose hooks all return canned values.

    Used by the tests in this module to instantiate the base agent without a
    real model or MCP server behind it.
    """

    # Optional metadata for MCP config
    metadata: ClassVar[dict[str, Any]] = {}

    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
        """Create the agent, fabricating a stub client when none is passed.

        Args:
            mcp_client: Client object forwarded to ``MCPAgent.__init__``. When
                ``None``, a ``MagicMock`` with stubbed ``get_available_tools``,
                ``initialize``, ``list_tools`` and ``mcp_config`` is used.
            **kwargs: Forwarded unchanged to ``MCPAgent.__init__``.
        """
        if mcp_client is None:
            # No client supplied: build a stand-in exposing the attributes below.
            stub = MagicMock()
            stub.get_available_tools = MagicMock(return_value=[])
            stub.initialize = AsyncMock()
            stub.list_tools = AsyncMock(return_value=[])
            stub.mcp_config = {"test_server": {"url": "http://localhost"}}
            mcp_client = stub
        super().__init__(mcp_client=mcp_client, **kwargs)
        # Simulated executor
        self.executor = BaseExecutor()
        self._messages = []

    async def run(self, task: Task) -> list[dict[str, Any]]:
        """Return the stored message list; ``task`` is ignored."""
        return self._messages

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return a user message for ``prompt``, plus a fake screenshot message if requested."""
        history = [{"role": "user", "content": prompt}]
        if not initial_screenshot:
            return history
        history.append({"role": "assistant", "content": "Screenshot: mock_screenshot"})
        return history

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return a fixed, finished response with no tool calls."""
        return AgentResponse(content="Mock response", tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one ``tool`` message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text blocks keep their text, image blocks keep their data, any other
        block that has a ``type`` attribute is reduced to that type, and blocks
        without one are dropped.
        """
        rendered: list[Any] = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                rendered.append({"type": "text", "text": item.text})
                continue
            if isinstance(item, types.ImageContent):
                rendered.append({"type": "image", "data": item.data})
                continue
            if hasattr(item, "type"):
                rendered.append({"type": getattr(item, "type", "unknown")})
        return rendered
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
class TestBaseMCPAgent:
    """Tests for the base MCP agent, driven through ``MockMCPAgent`` and a simulated executor.

    Tool discovery is controlled by replacing ``agent.mcp_client.list_tools``
    (and ``call_tool`` where needed) with ``AsyncMock`` objects; nothing else
    is wired into the agent.
    """

    def test_init_defaults(self):
        """Test initialization with default values."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None
        assert agent.allowed_tools is None
        assert agent.disallowed_tools == []
        assert agent.initial_screenshot is True
        assert agent.system_prompt is not None  # Default system prompt is set
        assert agent.lifecycle_tools == []

    def test_init_with_params(self):
        """Test initialization with custom parameters."""
        client = MagicMock()
        agent = MockMCPAgent(
            mcp_client=client,
            allowed_tools=["tool1", "tool2"],
            disallowed_tools=["bad_tool"],
            initial_screenshot=True,
            system_prompt="Custom prompt",
            lifecycle_tools=["custom_setup", "custom_eval"],
        )

        assert agent.mcp_client == client
        assert agent.allowed_tools == ["tool1", "tool2"]
        assert agent.disallowed_tools == ["bad_tool"]
        assert agent.initial_screenshot is True
        assert agent.system_prompt == "Custom prompt"
        assert agent.lifecycle_tools == ["custom_setup", "custom_eval"]

    @pytest.mark.asyncio
    async def test_init_no_client_no_task(self):
        """Test initialize fails without client and without task."""

        # Create a minimal concrete implementation to test the ValueError
        class TestAgent(MCPAgent):
            async def create_initial_messages(
                self, prompt: str, initial_screenshot: bool = False
            ) -> list[dict[str, Any]]:
                return []

            async def format_tool_results(
                self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
            ) -> list[dict[str, Any]]:
                return []

            async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
                return {"content": "test"}

            async def get_system_messages(self) -> list[Any]:
                return []

            async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
                return []

        # Agent can be created with None client
        agent = TestAgent(mcp_client=None)

        # But initialize should fail without client or task
        with pytest.raises(ValueError, match="No MCPClient"):
            await agent.initialize()

    @pytest.mark.asyncio
    async def test_initialize_with_sessions(self):
        """Test initialize populates available tools from the client's tool list."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None

        # Mock the list_tools method on mcp_client to return the tools
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
                types.Tool(name="setup", description="Setup tool", inputSchema={"type": "object"}),
            ]
        )

        await agent.initialize()

        # Check available tools were populated (excludes lifecycle tools)
        tools = agent.get_available_tools()
        assert len(tools) == 3  # All tools (setup is not in default lifecycle tools)

        # Ensure names exist in available tools
        names = {t.name for t in tools}
        assert {"tool1", "tool2", "setup"} <= names

    @pytest.mark.asyncio
    async def test_initialize_with_filtering(self):
        """Test initialize with tool filtering."""
        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])

        assert agent.mcp_client is not None

        # Mock the list_tools method on mcp_client to return the tools
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
                types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
                types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
            ]
        )

        await agent.initialize()

        # Check filtering worked
        tools = agent.get_available_tools()
        tool_names = [t.name for t in tools]
        assert len(tools) == 1  # Only tool1 survives the allow/deny filters
        assert "tool1" in tool_names
        # No lifecycle tools are configured for this agent, so "setup" is
        # dropped simply because it is not in the allowed list.
        assert "setup" not in tool_names
        assert "tool2" not in tool_names  # Not in allowed list
        assert "tool3" not in tool_names  # In disallowed list

    @pytest.mark.asyncio
    async def test_call_tool_success(self):
        """Test successful tool call."""
        agent = MockMCPAgent()

        mock_result = types.CallToolResult(
            content=[types.TextContent(type="text", text="Tool result")], isError=False
        )

        assert agent.mcp_client is not None

        # Mock the client's call_tool method directly
        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)

        # Mock the list_tools method to return the test tool
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
            ]
        )

        await agent.initialize()

        # Call the tool
        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
        results = await agent.call_tools(tool_call)

        assert len(results) == 1
        assert results[0] == mock_result
        assert not results[0].isError

    @pytest.mark.asyncio
    async def test_call_tool_not_found(self):
        """Test calling non-existent tool."""
        agent = MockMCPAgent()

        # The default stub client lists no tools
        assert agent.mcp_client is not None

        await agent.initialize()

        # Try to call unknown tool - call_tools doesn't raise for unknown tools
        tool_call = MCPToolCall(name="unknown_tool", arguments={})
        await agent.call_tools(tool_call)

    @pytest.mark.asyncio
    async def test_call_tool_no_name(self):
        """Test calling tool without name."""
        # MCPToolCall accepts empty names
        agent = MockMCPAgent()
        tool_call = MCPToolCall(name="", arguments={})

        # call_tools doesn't validate empty names, it will return error
        await agent.call_tools(tool_call)

    def test_get_tool_schemas(self):
        """Test getting tool schemas."""
        agent = MockMCPAgent()

        # Add setup to lifecycle tools to test filtering
        agent.lifecycle_tools = ["setup"]

        agent._available_tools = [
            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
        ]

        schemas = agent.get_tool_schemas()

        # Should include non-lifecycle tools
        assert len(schemas) == 1
        assert schemas[0]["name"] == "tool1"

    def test_get_tools_by_server(self):
        """Test that get_available_tools returns the tools assigned to the agent."""
        agent = MockMCPAgent()

        tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
        tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})

        agent._available_tools = [tool1, tool2]
        tools = agent.get_available_tools()
        assert {t.name for t in tools} == {"tool1", "tool2"}

    @pytest.mark.asyncio
    async def test_executor_integration(self):
        """Test integration with BaseExecutor for simulated actions."""
        agent = MockMCPAgent()

        # Test various executor actions
        click_result = await agent.executor.click(100, 200, take_screenshot=False)
        assert click_result.output is not None
        assert "[SIMULATED] Click at (100, 200)" in click_result.output

        type_result = await agent.executor.write("Test input", take_screenshot=False)
        assert type_result.output is not None
        assert "[SIMULATED] Type 'Test input'" in type_result.output

        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
        assert scroll_result.output is not None
        assert "[SIMULATED] Scroll" in scroll_result.output

        # Test screenshot
        screenshot = await agent.executor.screenshot()
        assert isinstance(screenshot, str)
        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
class MockAgentExtended(MCPAgent):
    """``MCPAgent`` test double that replays a scripted list of responses.

    Each call to ``get_response`` consumes the next entry of ``responses``;
    once the script is exhausted a fixed "Done" response is returned.
    """

    # Optional metadata for MCP config
    metadata: ClassVar[dict[str, Any]] = {}

    def __init__(self, responses=None, **kwargs):
        """Store the scripted responses and reset the replay counter.

        Args:
            responses: Optional list of response dicts to replay; a falsy
                value is replaced by an empty list.
            **kwargs: Forwarded unchanged to ``MCPAgent.__init__``.
        """
        super().__init__(**kwargs)
        self.responses = responses or []
        self.call_count = 0

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return a user message for ``prompt``, plus a fake screenshot message if requested."""
        history = [{"role": "user", "content": prompt}]
        if not initial_screenshot:
            return history
        # No real capture is performed; a placeholder string stands in for the image.
        screenshot = "mock_screenshot_data"
        history.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
        return history

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return the next scripted response, or a final "Done" once the script runs out."""
        if self.call_count >= len(self.responses):
            return AgentResponse(content="Done", tool_calls=[], done=True)

        scripted = self.responses[self.call_count]
        self.call_count += 1

        # Unless the script says otherwise, a response without tool calls is final.
        finished = scripted.get("done", not bool(scripted.get("tool_calls")))
        return AgentResponse(
            content=scripted.get("content", ""),
            tool_calls=scripted.get("tool_calls", []),
            done=finished,
        )

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one ``tool`` message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text blocks keep their text, image blocks keep their data, any other
        block that has a ``type`` attribute is reduced to that type, and blocks
        without one are dropped.
        """
        rendered: list[Any] = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                rendered.append({"type": "text", "text": item.text})
                continue
            if isinstance(item, types.ImageContent):
                rendered.append({"type": "image", "data": item.data})
                continue
            if hasattr(item, "type"):
                rendered.append({"type": getattr(item, "type", "unknown")})
        return rendered
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
class TestMCPAgentExtended:
|
|
447
|
-
"""Extended tests for MCPAgent."""
|
|
448
|
-
|
|
449
|
-
@pytest.fixture
|
|
450
|
-
def mock_client(self):
|
|
451
|
-
"""Create a mock MCP client."""
|
|
452
|
-
client = MagicMock()
|
|
453
|
-
client.get_all_active_sessions = MagicMock(return_value={})
|
|
454
|
-
client.initialize = AsyncMock()
|
|
455
|
-
client.list_tools = AsyncMock(return_value=[])
|
|
456
|
-
client.call_tool = AsyncMock(
|
|
457
|
-
return_value=types.CallToolResult(
|
|
458
|
-
content=[types.TextContent(type="text", text="Success")],
|
|
459
|
-
isError=False,
|
|
460
|
-
)
|
|
461
|
-
)
|
|
462
|
-
return client
|
|
463
|
-
|
|
464
|
-
@pytest.fixture
|
|
465
|
-
def agent_with_tools(self, mock_client):
|
|
466
|
-
"""Create agent with mock tools."""
|
|
467
|
-
mock_client.list_tools = AsyncMock(
|
|
468
|
-
return_value=[
|
|
469
|
-
types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
|
|
470
|
-
types.Tool(name="click", description="Click at coordinates", inputSchema={}),
|
|
471
|
-
types.Tool(name="type", description="Type text", inputSchema={}),
|
|
472
|
-
types.Tool(name="bad_tool", description="A tool that fails", inputSchema={}),
|
|
473
|
-
]
|
|
474
|
-
)
|
|
475
|
-
return MockAgentExtended(mcp_client=mock_client)
|
|
476
|
-
|
|
477
|
-
@pytest.mark.asyncio
|
|
478
|
-
async def test_run_with_task_object(self, agent_with_tools):
|
|
479
|
-
"""Test running agent with Task object."""
|
|
480
|
-
from hud.types import MCPToolResult
|
|
481
|
-
|
|
482
|
-
task = Task(
|
|
483
|
-
id="test_task",
|
|
484
|
-
prompt="Click the button",
|
|
485
|
-
mcp_config={"test_server": {"url": "http://localhost:8080"}},
|
|
486
|
-
setup_tool={"name": "navigate", "arguments": {"url": "https://example.com"}}, # type: ignore[arg-type]
|
|
487
|
-
evaluate_tool={"name": "check_result", "arguments": {}}, # type: ignore[arg-type]
|
|
488
|
-
)
|
|
489
|
-
|
|
490
|
-
# Set up responses
|
|
491
|
-
agent_with_tools.responses = [
|
|
492
|
-
{
|
|
493
|
-
"role": "assistant",
|
|
494
|
-
"content": "I'll click the button",
|
|
495
|
-
"tool_calls": [MCPToolCall(name="click", arguments={"x": 100, "y": 200})],
|
|
496
|
-
}
|
|
497
|
-
]
|
|
498
|
-
|
|
499
|
-
# Mock the evaluation to return a reward
|
|
500
|
-
agent_with_tools.mcp_client.call_tool = AsyncMock(
|
|
501
|
-
side_effect=[
|
|
502
|
-
# Setup tool
|
|
503
|
-
MCPToolResult(
|
|
504
|
-
content=[types.TextContent(type="text", text="Navigated")],
|
|
505
|
-
isError=False,
|
|
506
|
-
),
|
|
507
|
-
# Click tool
|
|
508
|
-
MCPToolResult(
|
|
509
|
-
content=[types.TextContent(type="text", text="Clicked")],
|
|
510
|
-
isError=False,
|
|
511
|
-
),
|
|
512
|
-
# Evaluate tool with reward
|
|
513
|
-
MCPToolResult(
|
|
514
|
-
content=[types.TextContent(type="text", text="Success")],
|
|
515
|
-
isError=False,
|
|
516
|
-
structuredContent={"reward": 1.0},
|
|
517
|
-
),
|
|
518
|
-
]
|
|
519
|
-
)
|
|
520
|
-
|
|
521
|
-
result = await agent_with_tools.run(task)
|
|
522
|
-
|
|
523
|
-
assert isinstance(result, Trace)
|
|
524
|
-
assert result.reward == 1.0
|
|
525
|
-
assert not result.isError
|
|
526
|
-
assert result.done
|
|
527
|
-
|
|
528
|
-
@pytest.mark.asyncio
|
|
529
|
-
async def test_run_with_setup_error(self, agent_with_tools):
|
|
530
|
-
"""Test task execution with setup phase error."""
|
|
531
|
-
from hud.types import MCPToolResult
|
|
532
|
-
|
|
533
|
-
task = Task(
|
|
534
|
-
id="test_task",
|
|
535
|
-
prompt="Do something",
|
|
536
|
-
mcp_config={"test_server": {"url": "http://localhost:8080"}},
|
|
537
|
-
setup_tool={"name": "bad_setup", "arguments": {}}, # type: ignore[arg-type]
|
|
538
|
-
)
|
|
539
|
-
|
|
540
|
-
# Mock setup tool to fail
|
|
541
|
-
agent_with_tools.mcp_client.call_tool = AsyncMock(
|
|
542
|
-
return_value=MCPToolResult(
|
|
543
|
-
content=[types.TextContent(type="text", text="Setup failed")],
|
|
544
|
-
isError=True,
|
|
545
|
-
)
|
|
546
|
-
)
|
|
547
|
-
|
|
548
|
-
result = await agent_with_tools.run(task)
|
|
549
|
-
|
|
550
|
-
assert isinstance(result, Trace)
|
|
551
|
-
assert result.isError
|
|
552
|
-
# Error content is the string representation of the MCPToolResult list
|
|
553
|
-
assert result.content is not None
|
|
554
|
-
assert "Setup failed" in result.content
|
|
555
|
-
assert "MCPToolResult" in result.content
|
|
556
|
-
|
|
557
|
-
@pytest.mark.asyncio
|
|
558
|
-
async def test_run_with_multiple_setup_tools(self, agent_with_tools):
|
|
559
|
-
"""Test task with multiple setup tools."""
|
|
560
|
-
|
|
561
|
-
task = Task(
|
|
562
|
-
id="test_task",
|
|
563
|
-
prompt="Test multiple setup",
|
|
564
|
-
mcp_config={"test_server": {"url": "http://localhost:8080"}},
|
|
565
|
-
setup_tool=[
|
|
566
|
-
MCPToolCall(name="setup1", arguments={}),
|
|
567
|
-
MCPToolCall(name="setup2", arguments={}),
|
|
568
|
-
],
|
|
569
|
-
)
|
|
570
|
-
|
|
571
|
-
agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
|
|
572
|
-
|
|
573
|
-
setup_calls = []
|
|
574
|
-
agent_with_tools.mcp_client.call_tool = AsyncMock(
|
|
575
|
-
side_effect=lambda tool_call: setup_calls.append(tool_call)
|
|
576
|
-
or MCPToolResult(
|
|
577
|
-
content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
|
|
578
|
-
isError=False,
|
|
579
|
-
)
|
|
580
|
-
)
|
|
581
|
-
|
|
582
|
-
result = await agent_with_tools.run(task)
|
|
583
|
-
|
|
584
|
-
# Check that the tool names match
|
|
585
|
-
setup_names = [call.name for call in setup_calls]
|
|
586
|
-
assert "setup1" in setup_names
|
|
587
|
-
assert "setup2" in setup_names
|
|
588
|
-
assert not result.isError
|
|
589
|
-
|
|
590
|
-
@pytest.mark.asyncio
|
|
591
|
-
async def test_allowed_tools_filtering(self, mock_client):
|
|
592
|
-
"""Test that allowed_tools filters available tools."""
|
|
593
|
-
mock_client.list_tools = AsyncMock(
|
|
594
|
-
return_value=[
|
|
595
|
-
types.Tool(name="tool1", description="Tool 1", inputSchema={}),
|
|
596
|
-
types.Tool(name="tool2", description="Tool 2", inputSchema={}),
|
|
597
|
-
types.Tool(name="tool3", description="Tool 3", inputSchema={}),
|
|
598
|
-
]
|
|
599
|
-
)
|
|
600
|
-
|
|
601
|
-
agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
|
|
602
|
-
await agent.initialize("test")
|
|
603
|
-
|
|
604
|
-
available_names = [tool.name for tool in agent._available_tools]
|
|
605
|
-
assert "tool1" in available_names
|
|
606
|
-
assert "tool3" in available_names
|
|
607
|
-
assert "tool2" not in available_names
|
|
608
|
-
|
|
609
|
-
@pytest.mark.asyncio
|
|
610
|
-
async def test_disallowed_tools_filtering(self, mock_client):
    """Tools named in ``disallowed_tools`` are absent after ``initialize``."""
    # The client advertises three tools; the agent is told to reject one of them.
    advertised = (("tool1", "Tool 1"), ("tool2", "Tool 2"), ("tool3", "Tool 3"))
    mock_client.list_tools = AsyncMock(
        return_value=[
            types.Tool(name=name, description=description, inputSchema={})
            for name, description in advertised
        ]
    )

    agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
    await agent.initialize("test")

    exposed = {tool.name for tool in agent._available_tools}
    assert "tool1" in exposed
    assert "tool3" in exposed
    assert "tool2" not in exposed
|
|
627
|
-
|
|
628
|
-
@pytest.mark.asyncio
|
|
629
|
-
async def test_lifecycle_tools(self, mock_client):
    """Run a prompt with ``screenshot`` registered as a lifecycle tool; expect no error."""
    # Lifecycle tools are given to the agent by name rather than as Tool objects.
    scripted_reply = {"role": "assistant", "content": "Done", "tool_calls": []}
    agent = MockAgentExtended(
        mcp_client=mock_client,
        lifecycle_tools=["screenshot"],
        responses=[scripted_reply],
    )

    # Advertise the screenshot tool through the mocked client.
    screenshot_tool = types.Tool(
        name="screenshot", description="Take screenshot", inputSchema={}
    )
    mock_client.list_tools = AsyncMock(return_value=[screenshot_tool])

    # Tools only become available once the agent has been initialized.
    await agent.initialize()

    outcome = await agent.run("Test lifecycle", max_steps=1)
    assert not outcome.isError
|
|
650
|
-
|
|
651
|
-
# This test is commented out as screenshot history management may have changed
|
|
652
|
-
# @pytest.mark.asyncio
|
|
653
|
-
# async def test_screenshot_history_management(self, agent_with_tools):
|
|
654
|
-
# """Test screenshot history is maintained."""
|
|
655
|
-
# agent_with_tools.initial_screenshot = True
|
|
656
|
-
|
|
657
|
-
# # Set up responses with tool calls
|
|
658
|
-
# agent_with_tools.responses = [
|
|
659
|
-
# {
|
|
660
|
-
# "role": "assistant",
|
|
661
|
-
# "content": "Action 1",
|
|
662
|
-
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
|
|
663
|
-
# },
|
|
664
|
-
# {
|
|
665
|
-
# "role": "assistant",
|
|
666
|
-
# "content": "Action 2",
|
|
667
|
-
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
|
|
668
|
-
# },
|
|
669
|
-
# {
|
|
670
|
-
# "role": "assistant",
|
|
671
|
-
# "content": "Action 3",
|
|
672
|
-
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
|
|
673
|
-
# },
|
|
674
|
-
# ]
|
|
675
|
-
|
|
676
|
-
# await agent_with_tools.run("Test screenshots", max_steps=3)
|
|
677
|
-
|
|
678
|
-
# # Should have screenshots in history
|
|
679
|
-
# assert len(agent_with_tools.screenshot_history) > 0
|
|
680
|
-
|
|
681
|
-
@pytest.mark.asyncio
|
|
682
|
-
async def test_run_with_invalid_prompt_type(self, agent_with_tools):
    """``run`` rejects an argument that is neither ``str`` nor ``Task`` with TypeError."""
    not_a_prompt = 123  # deliberately the wrong type
    expected_message = "prompt_or_task must be str or Task"
    with pytest.raises(TypeError, match=expected_message):
        await agent_with_tools.run(not_a_prompt)
|
|
686
|
-
|
|
687
|
-
@pytest.mark.asyncio
|
|
688
|
-
async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
    """Run a task with two evaluate tools and check both are called.

    The mocked ``call_tool`` records every call and answers with a reward of
    0.5 for ``eval1`` and 1.0 for anything else. The resulting trace is
    expected to carry the reward of the first evaluation tool.
    """
    from hud.types import MCPToolResult

    task = Task(
        id="test_task",
        prompt="Test evaluation",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        evaluate_tool=[
            MCPToolCall(name="eval1", arguments={}),
            MCPToolCall(name="eval2", arguments={"reward": True}),
        ],
    )

    agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

    eval_calls = []

    def record_and_reply(tool_call):
        # A named function replaces the former ``append(...) or Result`` lambda,
        # which only worked because ``list.append`` returns None.
        eval_calls.append(tool_call)
        reward = 0.5 if tool_call.name == "eval1" else 1.0
        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
            isError=False,
            structuredContent={"reward": reward},
        )

    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=record_and_reply)

    result = await agent_with_tools.run(task)

    # Check that the tool names match
    eval_names = [call.name for call in eval_calls]
    assert "eval1" in eval_names
    assert "eval2" in eval_names
    assert result.reward == 0.5  # From eval1 (first evaluation tool)
|
|
721
|
-
|
|
722
|
-
@pytest.mark.asyncio
|
|
723
|
-
async def test_trace_population_on_error(self, agent_with_tools):
    """An exception raised by the setup tool call yields an error trace that is done."""
    failing_setup = {"name": "failing_setup", "arguments": {}}
    task = Task(
        id="test_task",
        prompt="Test error",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        setup_tool=failing_setup,  # type: ignore[arg-type]
    )

    # Every tool call, including setup, raises.
    agent_with_tools.mcp_client.call_tool = AsyncMock(
        side_effect=Exception("Setup explosion")
    )

    trace = await agent_with_tools.run(task)

    assert trace.isError
    # The error content is expected to contain both the exception text and
    # the name of the result type.
    for fragment in ("Setup explosion", "MCPToolResult"):
        assert fragment in trace.content
    assert trace.done
|
|
1
|
+
"""Tests for BaseMCPAgent using simulated actions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, ClassVar
|
|
6
|
+
from unittest.mock import MagicMock
|
|
7
|
+
|
|
8
|
+
# Import AsyncMock from unittest.mock if available (Python 3.8+)
|
|
9
|
+
try:
|
|
10
|
+
from unittest.mock import AsyncMock
|
|
11
|
+
except ImportError:
|
|
12
|
+
# Fallback for older Python versions
|
|
13
|
+
from unittest.mock import MagicMock as AsyncMock
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
from mcp import types
|
|
17
|
+
|
|
18
|
+
from hud.agents import MCPAgent
|
|
19
|
+
from hud.datasets import Task
|
|
20
|
+
from hud.tools.executors.base import BaseExecutor
|
|
21
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MockMCPAgent(MCPAgent):
    """MCPAgent subclass with canned hook implementations, used as a test double."""

    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config

    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
        """Build the agent, substituting a stub client when none is supplied.

        Args:
            mcp_client: Client to hand to ``MCPAgent``; a ``MagicMock`` with
                async ``initialize``/``list_tools`` is created when ``None``.
            **kwargs: Forwarded unchanged to ``MCPAgent.__init__``.
        """
        if mcp_client is None:
            # No client given: fabricate one that reports no tools.
            mcp_client = MagicMock(
                get_available_tools=MagicMock(return_value=[]),
                initialize=AsyncMock(),
                list_tools=AsyncMock(return_value=[]),
                mcp_config={"test_server": {"url": "http://localhost"}},
            )
        super().__init__(mcp_client=mcp_client, **kwargs)
        self.executor = BaseExecutor()  # Use simulated executor
        self._messages = []

    async def run(self, task: Task) -> list[dict[str, Any]]:
        """Return the stored message list without doing any work."""
        return self._messages

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return the user prompt message, plus a fake screenshot message if requested."""
        opening = {"role": "user", "content": prompt}
        if not initial_screenshot:
            return [opening]
        return [opening, {"role": "assistant", "content": "Screenshot: mock_screenshot"}]

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return a fixed, finished response with no tool calls."""
        return AgentResponse(content="Mock response", tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one ``tool`` message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text and image blocks keep their payload; any other block that has a
        ``type`` attribute is reduced to that type; blocks without one are dropped.
        """
        rendered = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                entry = {"type": "text", "text": item.text}
            elif isinstance(item, types.ImageContent):
                entry = {"type": "image", "data": item.data}
            elif hasattr(item, "type"):
                entry = {"type": getattr(item, "type", "unknown")}
            else:
                continue
            rendered.append(entry)
        return rendered
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestBaseMCPAgent:
    """Tests for the MCPAgent base behaviour, exercised through MockMCPAgent."""

    @staticmethod
    def _make_tool(name: str, description: str) -> types.Tool:
        """Build a ``types.Tool`` with a minimal object input schema."""
        return types.Tool(name=name, description=description, inputSchema={"type": "object"})

    def test_init_defaults(self):
        """Test initialization with default values."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None
        assert agent.allowed_tools is None
        assert agent.disallowed_tools == []
        assert agent.initial_screenshot is True
        assert agent.system_prompt is not None  # Default system prompt is set
        assert agent.lifecycle_tools == []

    def test_init_with_params(self):
        """Test initialization with custom parameters."""
        client = MagicMock()
        agent = MockMCPAgent(
            mcp_client=client,
            allowed_tools=["tool1", "tool2"],
            disallowed_tools=["bad_tool"],
            initial_screenshot=True,
            system_prompt="Custom prompt",
            lifecycle_tools=["custom_setup", "custom_eval"],
        )

        assert agent.mcp_client == client
        assert agent.allowed_tools == ["tool1", "tool2"]
        assert agent.disallowed_tools == ["bad_tool"]
        assert agent.initial_screenshot is True
        assert agent.system_prompt == "Custom prompt"
        assert agent.lifecycle_tools == ["custom_setup", "custom_eval"]

    @pytest.mark.asyncio
    async def test_init_no_client_no_task(self):
        """Test initialize fails without client and without task."""

        # Create a minimal concrete implementation to test the ValueError
        class TestAgent(MCPAgent):
            async def create_initial_messages(
                self, prompt: str, initial_screenshot: bool = False
            ) -> list[dict[str, Any]]:
                return []

            async def format_tool_results(
                self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
            ) -> list[dict[str, Any]]:
                return []

            async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
                return {"content": "test"}

            async def get_system_messages(self) -> list[Any]:
                return []

            async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
                return []

        # Agent can be created with None client
        agent = TestAgent(mcp_client=None)

        # But initialize should fail without client or task
        with pytest.raises(ValueError, match="No MCPClient"):
            await agent.initialize()

    @pytest.mark.asyncio
    async def test_initialize_with_sessions(self):
        """Initialize populates the available tools from ``mcp_client.list_tools``."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None

        # Mock the list_tools method on mcp_client to return the tools
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                self._make_tool("tool1", "Tool 1"),
                self._make_tool("tool2", "Tool 2"),
                self._make_tool("setup", "Setup tool"),
            ]
        )

        await agent.initialize()

        # No filters are configured, so every advertised tool is expected back.
        tools = agent.get_available_tools()
        assert len(tools) == 3  # All tools (setup is not in default lifecycle tools)

        # Ensure names exist in available tools
        names = {t.name for t in tools}
        assert {"tool1", "tool2", "setup"} <= names

    @pytest.mark.asyncio
    async def test_initialize_with_filtering(self):
        """Test initialize with tool filtering."""
        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])

        assert agent.mcp_client is not None

        # Mock the list_tools method on mcp_client to return the tools
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[
                self._make_tool("tool1", "Tool 1"),
                self._make_tool("tool2", "Tool 2"),
                self._make_tool("tool3", "Tool 3"),
                self._make_tool("setup", "Setup"),
            ]
        )

        await agent.initialize()

        # Only the single allow-listed tool should survive filtering.
        tools = agent.get_available_tools()
        tool_names = [t.name for t in tools]
        assert len(tools) == 1  # Only tool1
        assert "tool1" in tool_names
        # NOTE(review): lifecycle_tools is left at its default here, so "setup"
        # is presumably dropped by the allow-list rather than as a lifecycle tool.
        assert "setup" not in tool_names
        assert "tool2" not in tool_names  # Not in allowed list
        assert "tool3" not in tool_names  # In disallowed list

    @pytest.mark.asyncio
    async def test_call_tool_success(self):
        """Test successful tool call."""
        agent = MockMCPAgent()

        mock_result = types.CallToolResult(
            content=[types.TextContent(type="text", text="Tool result")], isError=False
        )

        assert agent.mcp_client is not None

        # Mock the client's call_tool method directly
        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)

        # Mock the list_tools method to return the test tool
        agent.mcp_client.list_tools = AsyncMock(
            return_value=[self._make_tool("test_tool", "Test")]
        )

        await agent.initialize()

        # Call the tool
        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
        results = await agent.call_tools(tool_call)

        assert len(results) == 1
        assert results[0] == mock_result
        assert not results[0].isError

    @pytest.mark.asyncio
    async def test_call_tool_not_found(self):
        """Calling a tool the client never advertised does not raise."""
        agent = MockMCPAgent()

        assert agent.mcp_client is not None

        # The default stub client advertises no tools.
        await agent.initialize()

        # Try to call unknown tool - call_tools doesn't raise for unknown tools.
        # The test passes as long as this await completes; nothing is asserted
        # about the returned results.
        tool_call = MCPToolCall(name="unknown_tool", arguments={})
        await agent.call_tools(tool_call)

    @pytest.mark.asyncio
    async def test_call_tool_no_name(self):
        """Calling a tool with an empty name does not raise."""
        # MCPToolCall accepts empty names
        agent = MockMCPAgent()
        tool_call = MCPToolCall(name="", arguments={})

        # call_tools doesn't validate empty names; the test only checks that
        # the await completes without an exception.
        await agent.call_tools(tool_call)

    def test_get_tool_schemas(self):
        """Test getting tool schemas."""
        agent = MockMCPAgent()

        # Add setup to lifecycle tools to test filtering
        agent.lifecycle_tools = ["setup"]

        agent._available_tools = [
            self._make_tool("tool1", "Tool 1"),
            self._make_tool("setup", "Setup"),
        ]

        schemas = agent.get_tool_schemas()

        # Only the non-lifecycle tool is expected in the schemas
        assert len(schemas) == 1
        assert schemas[0]["name"] == "tool1"

    def test_get_tools_by_server(self):
        """``get_available_tools`` returns the tools assigned to ``_available_tools``."""
        agent = MockMCPAgent()

        tool1 = self._make_tool("tool1", "Tool 1")
        tool2 = self._make_tool("tool2", "Tool 2")

        agent._available_tools = [tool1, tool2]
        tools = agent.get_available_tools()
        assert {t.name for t in tools} == {"tool1", "tool2"}

    @pytest.mark.asyncio
    async def test_executor_integration(self):
        """Test integration with BaseExecutor for simulated actions."""
        agent = MockMCPAgent()

        # Test various executor actions
        click_result = await agent.executor.click(100, 200, take_screenshot=False)
        assert click_result.output is not None
        assert "[SIMULATED] Click at (100, 200)" in click_result.output

        type_result = await agent.executor.write("Test input", take_screenshot=False)
        assert type_result.output is not None
        assert "[SIMULATED] Type 'Test input'" in type_result.output

        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
        assert scroll_result.output is not None
        assert "[SIMULATED] Scroll" in scroll_result.output

        # Test screenshot
        screenshot = await agent.executor.screenshot()
        assert isinstance(screenshot, str)
        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class MockAgentExtended(MCPAgent):
    """MCPAgent test double that replays a scripted list of responses."""

    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config

    def __init__(self, responses=None, **kwargs):
        """Store the scripted responses; all other arguments go to ``MCPAgent``."""
        super().__init__(**kwargs)
        self.responses = responses or []
        self.call_count = 0

    async def create_initial_messages(
        self, prompt: str, initial_screenshot: bool = False
    ) -> list[dict[str, Any]]:
        """Return the user prompt message, plus a fake screenshot message if requested."""
        conversation = [{"role": "user", "content": prompt}]
        if initial_screenshot:
            # There is no real capture step; a fixed placeholder stands in.
            placeholder = "mock_screenshot_data"
            conversation.append({"role": "assistant", "content": f"Screenshot: {placeholder}"})
        return conversation

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Return the next scripted response, or a finished "Done" once exhausted.

        A scripted dict without an explicit ``done`` key counts as done exactly
        when it carries no tool calls.
        """
        if self.call_count >= len(self.responses):
            return AgentResponse(content="Done", tool_calls=[], done=True)

        scripted = self.responses[self.call_count]
        self.call_count += 1
        finished_by_default = not bool(scripted.get("tool_calls"))
        # Convert dict to AgentResponse
        return AgentResponse(
            content=scripted.get("content", ""),
            tool_calls=scripted.get("tool_calls", []),
            done=scripted.get("done", finished_by_default),
        )

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Pair each call with its result and render one ``tool`` message per pair."""
        return [
            {"role": "tool", "name": call.name, "content": str(outcome)}
            for call, outcome in zip(tool_calls, tool_results)
        ]

    async def create_user_message(self, text: str) -> Any:
        """Wrap ``text`` in a user-role message dict."""
        return {"role": "user", "content": text}

    async def get_system_messages(self) -> list[Any]:
        """Return no system messages."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        """Convert content blocks to plain dicts.

        Text and image blocks keep their payload; any other block that has a
        ``type`` attribute is reduced to that type; blocks without one are dropped.
        """
        rendered = []
        for item in blocks:
            if isinstance(item, types.TextContent):
                entry = {"type": "text", "text": item.text}
            elif isinstance(item, types.ImageContent):
                entry = {"type": "image", "data": item.data}
            elif hasattr(item, "type"):
                entry = {"type": getattr(item, "type", "unknown")}
            else:
                continue
            rendered.append(entry)
        return rendered
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
class TestMCPAgentExtended:
|
|
447
|
+
"""Extended tests for MCPAgent."""
|
|
448
|
+
|
|
449
|
+
@pytest.fixture
|
|
450
|
+
def mock_client(self):
    """Provide a stub MCP client whose async methods succeed by default."""
    default_result = types.CallToolResult(
        content=[types.TextContent(type="text", text="Success")],
        isError=False,
    )

    stub = MagicMock()
    stub.initialize = AsyncMock()
    stub.list_tools = AsyncMock(return_value=[])
    stub.call_tool = AsyncMock(return_value=default_result)
    stub.get_all_active_sessions = MagicMock(return_value={})
    return stub
|
|
463
|
+
|
|
464
|
+
@pytest.fixture
|
|
465
|
+
def agent_with_tools(self, mock_client):
    """Provide a MockAgentExtended whose client advertises four tools."""
    catalogue = (
        ("screenshot", "Take screenshot"),
        ("click", "Click at coordinates"),
        ("type", "Type text"),
        ("bad_tool", "A tool that fails"),
    )
    mock_client.list_tools = AsyncMock(
        return_value=[
            types.Tool(name=name, description=description, inputSchema={})
            for name, description in catalogue
        ]
    )
    return MockAgentExtended(mcp_client=mock_client)
|
|
476
|
+
|
|
477
|
+
@pytest.mark.asyncio
|
|
478
|
+
async def test_run_with_task_object(self, agent_with_tools):
    """Run a Task with setup, one click step and evaluation; check the resulting Trace."""
    from hud.types import MCPToolResult

    def text_result(message, **extra):
        # Successful tool result carrying a single text block.
        return MCPToolResult(
            content=[types.TextContent(type="text", text=message)],
            isError=False,
            **extra,
        )

    navigate_call = {"name": "navigate", "arguments": {"url": "https://example.com"}}
    check_call = {"name": "check_result", "arguments": {}}
    task = Task(
        id="test_task",
        prompt="Click the button",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        setup_tool=navigate_call,  # type: ignore[arg-type]
        evaluate_tool=check_call,  # type: ignore[arg-type]
    )

    # The agent answers once, asking for a click.
    click_call = MCPToolCall(name="click", arguments={"x": 100, "y": 200})
    agent_with_tools.responses = [
        {
            "role": "assistant",
            "content": "I'll click the button",
            "tool_calls": [click_call],
        }
    ]

    # Successive call_tool results: setup, click, then evaluation with a reward.
    agent_with_tools.mcp_client.call_tool = AsyncMock(
        side_effect=[
            text_result("Navigated"),
            text_result("Clicked"),
            text_result("Success", structuredContent={"reward": 1.0}),
        ]
    )

    trace = await agent_with_tools.run(task)

    assert isinstance(trace, Trace)
    assert trace.reward == 1.0
    assert not trace.isError
    assert trace.done
|
|
527
|
+
|
|
528
|
+
@pytest.mark.asyncio
|
|
529
|
+
async def test_run_with_setup_error(self, agent_with_tools):
    """A setup tool that returns an error result produces an error Trace."""
    from hud.types import MCPToolResult

    task = Task(
        id="test_task",
        prompt="Do something",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        setup_tool={"name": "bad_setup", "arguments": {}},  # type: ignore[arg-type]
    )

    # Every tool call, including setup, reports failure.
    failed_setup = MCPToolResult(
        content=[types.TextContent(type="text", text="Setup failed")],
        isError=True,
    )
    agent_with_tools.mcp_client.call_tool = AsyncMock(return_value=failed_setup)

    trace = await agent_with_tools.run(task)

    assert isinstance(trace, Trace)
    assert trace.isError
    # The error content is expected to contain both the failure text and
    # the name of the result type.
    assert trace.content is not None
    for fragment in ("Setup failed", "MCPToolResult"):
        assert fragment in trace.content
|
|
556
|
+
|
|
557
|
+
@pytest.mark.asyncio
|
|
558
|
+
async def test_run_with_multiple_setup_tools(self, agent_with_tools):
    """Run a task with two setup tools and check both are called without error.

    The mocked ``call_tool`` records every call it receives and answers with
    a successful text result naming the tool.
    """
    task = Task(
        id="test_task",
        prompt="Test multiple setup",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        setup_tool=[
            MCPToolCall(name="setup1", arguments={}),
            MCPToolCall(name="setup2", arguments={}),
        ],
    )

    agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

    setup_calls = []

    def record_and_reply(tool_call):
        # A named function replaces the former ``append(...) or Result`` lambda,
        # which only worked because ``list.append`` returns None.
        setup_calls.append(tool_call)
        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
            isError=False,
        )

    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=record_and_reply)

    result = await agent_with_tools.run(task)

    # Check that the tool names match
    setup_names = [call.name for call in setup_calls]
    assert "setup1" in setup_names
    assert "setup2" in setup_names
    assert not result.isError
|
|
589
|
+
|
|
590
|
+
@pytest.mark.asyncio
|
|
591
|
+
async def test_allowed_tools_filtering(self, mock_client):
    """Only tools named in ``allowed_tools`` remain after ``initialize``."""
    # The client advertises three tools; the agent is told to allow two of them.
    advertised = (("tool1", "Tool 1"), ("tool2", "Tool 2"), ("tool3", "Tool 3"))
    mock_client.list_tools = AsyncMock(
        return_value=[
            types.Tool(name=name, description=description, inputSchema={})
            for name, description in advertised
        ]
    )

    agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
    await agent.initialize("test")

    exposed = {tool.name for tool in agent._available_tools}
    assert "tool1" in exposed
    assert "tool3" in exposed
    assert "tool2" not in exposed
|
|
608
|
+
|
|
609
|
+
@pytest.mark.asyncio
async def test_disallowed_tools_filtering(self, mock_client):
    """Check that tools named in ``disallowed_tools`` are dropped after initialize.

    The mocked client advertises ``tool1``, ``tool2`` and ``tool3``; the agent
    is built with ``disallowed_tools=["tool2"]`` and the other two tools must
    still be present in ``agent._available_tools``.
    """
    advertised_tools = [
        types.Tool(name="tool1", description="Tool 1", inputSchema={}),
        types.Tool(name="tool2", description="Tool 2", inputSchema={}),
        types.Tool(name="tool3", description="Tool 3", inputSchema={}),
    ]
    mock_client.list_tools = AsyncMock(return_value=advertised_tools)

    filtered_agent = MockAgentExtended(
        mcp_client=mock_client,
        disallowed_tools=["tool2"],
    )
    await filtered_agent.initialize("test")

    # Only the names matter for the membership checks below.
    names = [entry.name for entry in filtered_agent._available_tools]
    for kept in ("tool1", "tool3"):
        assert kept in names
    assert "tool2" not in names
|
|
627
|
+
|
|
628
|
+
@pytest.mark.asyncio
async def test_lifecycle_tools(self, mock_client):
    """Run an agent configured with a lifecycle tool and expect no error.

    The lifecycle tool is passed by name (``"screenshot"``), not as a tool
    object. The test only asserts that the resulting run is not an error.
    """
    scripted_reply = {"role": "assistant", "content": "Done", "tool_calls": []}
    lifecycle_agent = MockAgentExtended(
        mcp_client=mock_client,
        lifecycle_tools=["screenshot"],  # referenced by tool name
        responses=[scripted_reply],
    )

    # Advertise the screenshot tool on the client (done after the agent is built).
    screenshot_tool = types.Tool(
        name="screenshot", description="Take screenshot", inputSchema={}
    )
    mock_client.list_tools = AsyncMock(return_value=[screenshot_tool])

    # Initialize explicitly before running so the advertised tools are loaded.
    await lifecycle_agent.initialize()

    outcome = await lifecycle_agent.run("Test lifecycle", max_steps=1)
    assert not outcome.isError
|
|
650
|
+
|
|
651
|
+
# This test is commented out as screenshot history management may have changed
|
|
652
|
+
# @pytest.mark.asyncio
|
|
653
|
+
# async def test_screenshot_history_management(self, agent_with_tools):
|
|
654
|
+
# """Test screenshot history is maintained."""
|
|
655
|
+
# agent_with_tools.initial_screenshot = True
|
|
656
|
+
|
|
657
|
+
# # Set up responses with tool calls
|
|
658
|
+
# agent_with_tools.responses = [
|
|
659
|
+
# {
|
|
660
|
+
# "role": "assistant",
|
|
661
|
+
# "content": "Action 1",
|
|
662
|
+
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
|
|
663
|
+
# },
|
|
664
|
+
# {
|
|
665
|
+
# "role": "assistant",
|
|
666
|
+
# "content": "Action 2",
|
|
667
|
+
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
|
|
668
|
+
# },
|
|
669
|
+
# {
|
|
670
|
+
# "role": "assistant",
|
|
671
|
+
# "content": "Action 3",
|
|
672
|
+
# "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
|
|
673
|
+
# },
|
|
674
|
+
# ]
|
|
675
|
+
|
|
676
|
+
# await agent_with_tools.run("Test screenshots", max_steps=3)
|
|
677
|
+
|
|
678
|
+
# # Should have screenshots in history
|
|
679
|
+
# assert len(agent_with_tools.screenshot_history) > 0
|
|
680
|
+
|
|
681
|
+
@pytest.mark.asyncio
async def test_run_with_invalid_prompt_type(self, agent_with_tools):
    """Check that ``run`` raises ``TypeError`` for a non-str, non-Task argument."""
    expected_message = "prompt_or_task must be str or Task"
    not_a_prompt = 123  # an int is neither a str nor a Task

    with pytest.raises(TypeError, match=expected_message):
        await agent_with_tools.run(not_a_prompt)
|
|
686
|
+
|
|
687
|
+
@pytest.mark.asyncio
async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
    """Run a task with two evaluation tools and check the calls and the reward.

    ``call_tool`` is replaced by a mock that records every tool call and
    replies with a structured reward: 0.5 for ``eval1`` and 1.0 for any other
    tool. Both evaluation tools must be called and the reward must be 0.5.
    """
    from hud.types import MCPToolResult

    evaluators = [
        MCPToolCall(name="eval1", arguments={}),
        MCPToolCall(name="eval2", arguments={"reward": True}),
    ]
    task = Task(
        id="test_task",
        prompt="Test evaluation",
        mcp_config={"test_server": {"url": "http://localhost:8080"}},
        evaluate_tool=evaluators,
    )

    agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]

    recorded_calls = []

    def _record_and_reply(tool_call):
        # Synchronous on purpose: AsyncMock returns this value from the awaited call.
        recorded_calls.append(tool_call)
        if tool_call.name == "eval1":
            payload = {"reward": 0.5}
        else:
            payload = {"reward": 1.0}
        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
            isError=False,
            structuredContent=payload,
        )

    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=_record_and_reply)

    result = await agent_with_tools.run(task)

    # Both evaluation tools must have reached the client.
    called_names = [entry.name for entry in recorded_calls]
    for expected in ("eval1", "eval2"):
        assert expected in called_names
    # 0.5 is the reward the mock returns for eval1, the first evaluation tool listed.
    assert result.reward == 0.5
|
|
721
|
+
|
|
722
|
+
@pytest.mark.asyncio
async def test_trace_population_on_error(self, agent_with_tools):
    """Check the result of ``run`` when the setup tool call raises.

    ``call_tool`` is mocked to raise ``Exception("Setup explosion")``. The
    result must be flagged as an error, be marked done, and its content must
    mention both the exception message and ``MCPToolResult``.
    """
    server_config = {"test_server": {"url": "http://localhost:8080"}}
    task = Task(
        id="test_task",
        prompt="Test error",
        mcp_config=server_config,
        setup_tool={"name": "failing_setup", "arguments": {}},  # type: ignore[arg-type]
    )

    # Every tool call, including the setup call, now raises.
    setup_failure = Exception("Setup explosion")
    agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=setup_failure)

    outcome = await agent_with_tools.run(task)

    assert outcome.isError
    # The content is expected to carry the exception text and the result type name.
    for fragment in ("Setup explosion", "MCPToolResult"):
        assert fragment in outcome.content
    assert outcome.done
|