hud-python 0.2.9__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python has been flagged by the registry.
- hud/__init__.py +14 -5
- hud/env/docker_client.py +1 -1
- hud/env/environment.py +13 -8
- hud/env/local_docker_client.py +1 -1
- hud/env/remote_client.py +1 -1
- hud/env/remote_docker_client.py +2 -2
- hud/exceptions.py +2 -1
- hud/mcp_agent/__init__.py +15 -0
- hud/mcp_agent/base.py +723 -0
- hud/mcp_agent/claude.py +316 -0
- hud/mcp_agent/langchain.py +231 -0
- hud/mcp_agent/openai.py +318 -0
- hud/mcp_agent/tests/__init__.py +1 -0
- hud/mcp_agent/tests/test_base.py +437 -0
- hud/settings.py +14 -2
- hud/task.py +4 -0
- hud/telemetry/__init__.py +11 -7
- hud/telemetry/_trace.py +82 -71
- hud/telemetry/context.py +9 -27
- hud/telemetry/exporter.py +6 -5
- hud/telemetry/instrumentation/mcp.py +174 -410
- hud/telemetry/mcp_models.py +13 -74
- hud/telemetry/tests/test_context.py +9 -6
- hud/telemetry/tests/test_trace.py +92 -61
- hud/tools/__init__.py +21 -0
- hud/tools/base.py +65 -0
- hud/tools/bash.py +137 -0
- hud/tools/computer/__init__.py +13 -0
- hud/tools/computer/anthropic.py +411 -0
- hud/tools/computer/hud.py +315 -0
- hud/tools/computer/openai.py +283 -0
- hud/tools/edit.py +290 -0
- hud/tools/executors/__init__.py +13 -0
- hud/tools/executors/base.py +331 -0
- hud/tools/executors/pyautogui.py +585 -0
- hud/tools/executors/tests/__init__.py +1 -0
- hud/tools/executors/tests/test_base_executor.py +338 -0
- hud/tools/executors/tests/test_pyautogui_executor.py +162 -0
- hud/tools/executors/xdo.py +503 -0
- hud/tools/helper/README.md +56 -0
- hud/tools/helper/__init__.py +9 -0
- hud/tools/helper/mcp_server.py +78 -0
- hud/tools/helper/server_initialization.py +115 -0
- hud/tools/helper/utils.py +58 -0
- hud/tools/playwright_tool.py +373 -0
- hud/tools/tests/__init__.py +3 -0
- hud/tools/tests/test_bash.py +152 -0
- hud/tools/tests/test_computer.py +52 -0
- hud/tools/tests/test_computer_actions.py +34 -0
- hud/tools/tests/test_edit.py +233 -0
- hud/tools/tests/test_init.py +27 -0
- hud/tools/tests/test_playwright_tool.py +183 -0
- hud/tools/tests/test_tools.py +154 -0
- hud/tools/tests/test_utils.py +156 -0
- hud/tools/utils.py +50 -0
- hud/types.py +10 -1
- hud/utils/tests/test_init.py +21 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/METADATA +9 -6
- hud_python-0.3.0.dist-info/RECORD +124 -0
- hud_python-0.2.9.dist-info/RECORD +0 -85
- {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/WHEEL +0 -0
- {hud_python-0.2.9.dist-info → hud_python-0.3.0.dist-info}/licenses/LICENSE +0 -0
hud/mcp_agent/tests/test_base.py
ADDED
@@ -0,0 +1,437 @@
+"""Tests for BaseMCPAgent using simulated actions."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from unittest.mock import MagicMock
+
+import pytest
+from mcp import types
+
+from hud.mcp_agent.base import BaseMCPAgent
+from hud.tools.executors.base import BaseExecutor
+
+if TYPE_CHECKING:
+    from hud.task import Task
+
+
+class MockMCPAgent(BaseMCPAgent):
+    """Concrete implementation of BaseMCPAgent for testing."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.executor = BaseExecutor()  # Use simulated executor
+        self._messages = []
+
+    async def run(self, task: Task) -> list[dict[str, Any]]:
+        """Mock run method."""
+        return self._messages
+
+    def create_initial_messages(
+        self, prompt: str, screenshot: str | None = None
+    ) -> list[dict[str, Any]]:
+        """Mock create initial messages."""
+        messages = [{"role": "user", "content": prompt}]
+        if screenshot:
+            messages.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
+        return messages
+
+    def get_model_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Mock get model response."""
+        return {"role": "assistant", "content": "Mock response"}
+
+    def format_tool_results(
+        self,
+        results: list[tuple[str, Any]],
+        screenshot: str | None = None,
+        assistant_msg: dict[str, Any] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Mock format tool results."""
+        formatted = []
+        for tool_name, result in results:
+            formatted.append({"role": "tool", "name": tool_name, "content": str(result)})
+        if screenshot:
+            formatted.append({"role": "screenshot", "content": screenshot})
+        return formatted
+
+    async def create_user_message(self, text: str) -> Any:
+        """Mock create user message."""
+        return {"role": "user", "content": text}
+
+
+class TestBaseMCPAgent:
+    """Tests for BaseMCPAgent with simulated actions."""
+
+    def test_init_defaults(self):
+        """Test initialization with default values."""
+        agent = MockMCPAgent()
+
+        assert agent.client is not None
+        assert agent.allowed_tools is None
+        assert agent.disallowed_tools == []
+        assert agent.initial_screenshot is False
+        assert agent.max_screenshot_history == 3
+        assert agent.append_tool_system_prompt is True
+        assert agent.custom_system_prompt is None
+        assert agent.lifecycle_tools == {"setup": "setup", "evaluate": "evaluate"}
+
+    def test_init_with_params(self):
+        """Test initialization with custom parameters."""
+        client = MagicMock()
+        agent = MockMCPAgent(
+            client=client,
+            allowed_tools=["tool1", "tool2"],
+            disallowed_tools=["bad_tool"],
+            initial_screenshot=True,
+            max_screenshot_history=5,
+            append_tool_system_prompt=False,
+            custom_system_prompt="Custom prompt",
+            lifecycle_tools={"setup": "custom_setup", "evaluate": "custom_eval"},
+        )
+
+        assert agent.client == client
+        assert agent.allowed_tools == ["tool1", "tool2"]
+        assert agent.disallowed_tools == ["bad_tool"]
+        assert agent.initial_screenshot is True
+        assert agent.max_screenshot_history == 5
+        assert agent.append_tool_system_prompt is False
+        assert agent.custom_system_prompt == "Custom prompt"
+        assert agent.lifecycle_tools == {"setup": "custom_setup", "evaluate": "custom_eval"}
+
+    @pytest.mark.asyncio
+    async def test_initialize_no_client(self):
+        """Test initialize fails without client."""
+        agent = MockMCPAgent()
+        agent.client = None
+
+        with pytest.raises(ValueError, match="Client is not initialized"):
+            await agent.initialize()
+
+    @pytest.mark.asyncio
+    async def test_initialize_with_sessions(self):
+        """Test initialize with existing sessions."""
+        agent = MockMCPAgent()
+
+        # Create proper async mock for session
+        mock_session = MagicMock()
+
+        # Set up the connector and client_session structure
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        # Mock list_tools on the client_session
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+                    types.Tool(
+                        name="setup", description="Setup tool", inputSchema={"type": "object"}
+                    ),
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Check available tools were populated (excludes lifecycle tools)
+        tools = agent.get_available_tools()
+        assert len(tools) == 2  # tool1 and tool2 (setup is excluded as lifecycle tool)
+
+        # Check tool map was populated (includes all tools)
+        tool_map = agent.get_tool_map()
+        assert len(tool_map) == 3
+        assert "tool1" in tool_map
+        assert "tool2" in tool_map
+        assert "setup" in tool_map
+
+    @pytest.mark.asyncio
+    async def test_initialize_with_filtering(self):
+        """Test initialize with tool filtering."""
+        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
+
+        # Create proper async mock for session
+        mock_session = MagicMock()
+
+        # Set up the connector and client_session structure
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+                    types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
+                    types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Check filtering worked - get_available_tools excludes lifecycle tools
+        tools = agent.get_available_tools()
+        tool_names = [t.name for t in tools]
+        assert len(tools) == 1  # Only tool1 (setup is excluded as lifecycle tool)
+        assert "tool1" in tool_names
+        assert "setup" not in tool_names  # Lifecycle tool excluded from available tools
+        assert "tool2" not in tool_names  # Not in allowed list
+        assert "tool3" not in tool_names  # In disallowed list
+
+    @pytest.mark.asyncio
+    async def test_call_tool_success(self):
+        """Test successful tool call."""
+        agent = MockMCPAgent()
+
+        # Initialize with a tool
+        mock_session = MagicMock()
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        # Mock the call_tool method on the client session
+        mock_result = types.CallToolResult(
+            content=[types.TextContent(type="text", text="Tool result")], isError=False
+        )
+
+        async def mock_call_tool(name, args):
+            return mock_result
+
+        mock_session.connector.client_session.call_tool = mock_call_tool
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        agent.client.get_session = MagicMock(return_value=mock_session)
+
+        await agent.initialize()
+
+        # Call the tool
+        result = await agent.call_tool({"name": "test_tool", "arguments": {"param": "value"}})
+
+        assert result == mock_result
+        assert not result.isError
+
+    @pytest.mark.asyncio
+    async def test_call_tool_not_found(self):
+        """Test calling non-existent tool."""
+        agent = MockMCPAgent()
+
+        # Initialize without tools
+        mock_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(tools=[])
+
+        mock_session.list_tools = mock_list_tools
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        await agent.initialize()
+
+        # Try to call unknown tool
+        with pytest.raises(ValueError, match="Tool 'unknown_tool' not found"):
+            await agent.call_tool({"name": "unknown_tool", "arguments": {}})
+
+    @pytest.mark.asyncio
+    async def test_call_tool_no_name(self):
+        """Test calling tool without name."""
+        agent = MockMCPAgent()
+
+        with pytest.raises(ValueError, match="Tool call must have a 'name' field"):
+            await agent.call_tool({"arguments": {}})
+
+    def test_get_system_prompt_default(self):
+        """Test get_system_prompt with default settings."""
+        agent = MockMCPAgent()
+
+        # Add some tools
+        agent._available_tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+        ]
+
+        prompt = agent.get_system_prompt()
+
+        # Should include ALL tool descriptions (including lifecycle tools)
+        assert "tool1" in prompt
+        assert "Tool 1" in prompt
+        assert "setup" in prompt
+        assert "Setup" in prompt
+
+    def test_get_system_prompt_custom(self):
+        """Test get_system_prompt with custom prompt."""
+        agent = MockMCPAgent(
+            custom_system_prompt="My custom prompt", append_tool_system_prompt=False
+        )
+
+        prompt = agent.get_system_prompt()
+        assert prompt == "My custom prompt"
+
+    def test_has_computer_tools(self):
+        """Test checking for computer tools."""
+        agent = MockMCPAgent()
+
+        # No tools
+        assert not agent.has_computer_tools()
+
+        # With computer tool
+        agent._available_tools = [
+            types.Tool(name="computer", description="Computer", inputSchema={"type": "object"})
+        ]
+        assert agent.has_computer_tools()
+
+        # With screenshot tool
+        agent._available_tools = [
+            types.Tool(name="screenshot", description="Screenshot", inputSchema={"type": "object"})
+        ]
+        assert agent.has_computer_tools()
+
+    def test_get_tool_schemas(self):
+        """Test getting tool schemas."""
+        agent = MockMCPAgent()
+
+        agent._available_tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+        ]
+
+        schemas = agent.get_tool_schemas()
+
+        # Should include non-lifecycle tools
+        assert len(schemas) == 1
+        assert schemas[0]["name"] == "tool1"
+
+    @pytest.mark.asyncio
+    async def test_capture_screenshot_no_tool(self):
+        """Test screenshot capture without screenshot tool."""
+        agent = MockMCPAgent()
+
+        screenshot = await agent.capture_screenshot()
+        assert screenshot is None
+
+    @pytest.mark.asyncio
+    async def test_capture_screenshot_with_tool(self):
+        """Test screenshot capture with screenshot tool."""
+        agent = MockMCPAgent()
+
+        # Set up screenshot tool
+        mock_session = MagicMock()
+        mock_session.connector = MagicMock()
+        mock_session.connector.client_session = MagicMock()
+
+        async def mock_list_tools():
+            return types.ListToolsResult(
+                tools=[
+                    types.Tool(
+                        name="screenshot", description="Screenshot", inputSchema={"type": "object"}
+                    )
+                ]
+            )
+
+        mock_session.connector.client_session.list_tools = mock_list_tools
+
+        # Mock screenshot result
+        mock_result = types.CallToolResult(
+            content=[
+                types.ImageContent(type="image", data="base64imagedata", mimeType="image/png")
+            ],
+            isError=False,
+        )
+
+        async def mock_call_tool(name, args):
+            return mock_result
+
+        mock_session.connector.client_session.call_tool = mock_call_tool
+
+        assert agent.client is not None
+        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        agent.client.get_session = MagicMock(return_value=mock_session)
+
+        await agent.initialize()
+
+        screenshot = await agent.capture_screenshot()
+        assert screenshot == "base64imagedata"
+
+    def test_process_tool_results_extracts_text(self):
+        """Test processing tool results extracts text content."""
+        agent = MockMCPAgent()
+
+        # Create a proper CallToolResult object
+        result = types.CallToolResult(
+            content=[
+                types.TextContent(type="text", text="Result text"),
+                types.ImageContent(type="image", data="imagedata", mimeType="image/png"),
+            ],
+            isError=False,
+        )
+
+        tool_results = [{"tool_name": "test_tool", "result": result}]
+
+        processed = agent.process_tool_results(tool_results)
+
+        assert "text" in processed
+        assert "Result text" in processed["text"]
+        assert "results" in processed
+        assert len(processed["results"]) == 1
+
+    def test_get_tools_by_server(self):
+        """Test getting tools grouped by server."""
+        agent = MockMCPAgent()
+
+        # Set up tools from different servers
+        tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
+        tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})
+
+        agent._available_tools = [tool1, tool2]
+        agent._tool_map = {
+            "tool1": ("server1", tool1),
+            "tool2": ("server2", tool2),
+        }
+
+        tools_by_server = agent.get_tools_by_server()
+
+        assert len(tools_by_server) == 2
+        assert "server1" in tools_by_server
+        assert "server2" in tools_by_server
+        assert tools_by_server["server1"] == [tool1]
+        assert tools_by_server["server2"] == [tool2]
+
+    @pytest.mark.asyncio
+    async def test_executor_integration(self):
+        """Test integration with BaseExecutor for simulated actions."""
+        agent = MockMCPAgent()
+
+        # Test various executor actions
+        click_result = await agent.executor.click(100, 200, take_screenshot=False)
+        assert click_result.output is not None
+        assert "[SIMULATED] Click at (100, 200)" in click_result.output
+
+        type_result = await agent.executor.type("Test input", take_screenshot=False)
+        assert type_result.output is not None
+        assert "[SIMULATED] Type 'Test input'" in type_result.output
+
+        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
+        assert scroll_result.output is not None
+        assert "[SIMULATED] Scroll" in scroll_result.output
+
+        # Test screenshot
+        screenshot = await agent.executor.screenshot()
+        assert isinstance(screenshot, str)
+        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
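Taken together, these tests document the intended BaseMCPAgent lifecycle: construct an agent (optionally with allowed_tools/disallowed_tools filters), initialize() it against the client's active MCP sessions, then dispatch tool calls by name. Below is a condensed sketch of that flow using only names exercised in the tests above; the concrete agent instance is assumed to come from one of the shipped subclasses (hud/mcp_agent/claude.py, openai.py, or langchain.py), since BaseMCPAgent itself is abstract.

from hud.mcp_agent.base import BaseMCPAgent

async def drive(agent: BaseMCPAgent) -> None:
    # Discover tools from every active MCP session. Lifecycle tools
    # ("setup"/"evaluate") land in the tool map but are hidden from
    # get_available_tools() and get_tool_schemas().
    await agent.initialize()
    print([tool.name for tool in agent.get_available_tools()])

    # Calls are routed to whichever server registered the tool;
    # "test_tool" here is a placeholder name taken from the tests.
    result = await agent.call_tool({"name": "test_tool", "arguments": {"param": "value"}})
    assert not result.isError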
hud/settings.py
CHANGED
@@ -41,13 +41,25 @@ class Settings(BaseSettings):
     telemetry_enabled: bool = Field(
         default=True,
         description="Enable telemetry for the HUD SDK",
-        validation_alias="
+        validation_alias="HUD_TELEMETRY_ENABLED",
     )
 
     fancy_logging: bool = Field(
         default=True,
         description="Enable fancy logging for the HUD SDK",
-        validation_alias="
+        validation_alias="HUD_FANCY_LOGGING",
+    )
+
+    log_stream: str = Field(
+        default="stdout",
+        description="Stream to use for logging output: 'stdout' or 'stderr'",
+        validation_alias="HUD_LOG_STREAM",
+    )
+
+    display: str = Field(
+        default=":0",
+        description="Display to use for the HUD SDK",
+        validation_alias="HUD_DISPLAY",
     )
 
 
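The validation_alias values added above are the environment variable names that pydantic-settings reads, so the new options should be controllable without code changes. A minimal sketch, assuming Settings can be instantiated directly and has no other required fields (the diff shows only this part of the class):

import os

# These env var names come straight from the validation_alias values above.
os.environ["HUD_TELEMETRY_ENABLED"] = "false"
os.environ["HUD_LOG_STREAM"] = "stderr"  # route SDK logging to stderr
os.environ["HUD_DISPLAY"] = ":1"  # presumably an X display, given the ":0" default

from hud.settings import Settings

settings = Settings()
assert settings.telemetry_enabled is False
assert settings.log_stream == "stderr"
assert settings.display == ":1"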
hud/task.py
CHANGED
@@ -74,6 +74,8 @@ class Task(BaseModel):
     # Description of the task, for extra information about its purpose and context
     description: str | None = None
 
+    gold_file_url: str | None = None
+
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> Task:
         return cls(**data)
@@ -110,6 +112,7 @@ class Task(BaseModel):
             description=data.get("description"),
             sensitive_data=data.get("sensitive_data", {}),
             metadata=data.get("metadata", {}),
+            gold_file_url=data.get("gold_file_url"),
         )
 
     @classmethod
@@ -221,4 +224,5 @@ class Task(BaseModel):
             "gym": parsed_gym,
             "sensitive_data": self.sensitive_data,
             "metadata": self.metadata,
+            "gold_file_url": self.gold_file_url,
         }
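The new gold_file_url field is threaded through the model, the dict-based constructor, and the dict serializer, so it should round-trip. A small sketch; the URL is a placeholder, and treating prompt as the only other required field is an assumption this excerpt does not confirm:

from hud.task import Task

task = Task.from_dict({
    "prompt": "Transcribe the attached file.",  # hypothetical task prompt
    "gold_file_url": "https://example.com/gold/expected.txt",  # placeholder URL
})
assert task.gold_file_url == "https://example.com/gold/expected.txt"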
hud/telemetry/__init__.py
CHANGED
@@ -1,21 +1,25 @@
 """
-HUD
+HUD Telemetry module.
 
-
-to the HUD platform for analysis.
+Provides context managers and utilities for capturing MCP telemetry data.
 """
 
 from __future__ import annotations
 
-
-from hud.telemetry.
+# Main trace functions
+from hud.telemetry._trace import init_telemetry, trace, trace_open
+from hud.telemetry.context import flush_buffer, get_current_task_run_id
 from hud.telemetry.exporter import flush
 
 __all__ = [
+    # Management
     "flush",
+    "flush_buffer",
+    # Context management
     "get_current_task_run_id",
+    # Management
     "init_telemetry",
-
-    "set_current_task_run_id",
+    # Trace functions
     "trace",
+    "trace_open",
 ]
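Net effect on the public surface: trace_open and flush_buffer are newly exported, set_current_task_run_id is no longer exported, and the docstring now frames the module around context managers for MCP telemetry. A hedged usage sketch; the signatures of trace and init_telemetry are not visible in this diff, so the no-argument forms below are assumptions:

import hud.telemetry as telemetry

telemetry.init_telemetry()  # assumed no-arg form

with telemetry.trace():  # documented as a context manager; arguments unknown
    # MCP calls made in this block are captured and attributed to a task run
    run_id = telemetry.get_current_task_run_id()
    print(run_id)

telemetry.flush()  # export anything still buffered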