hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic.
- hud/agents/base.py +55 -142
- hud/agents/claude.py +5 -6
- hud/agents/grounded_openai.py +1 -1
- hud/agents/misc/integration_test_agent.py +2 -0
- hud/agents/tests/test_base.py +2 -5
- hud/cli/__init__.py +80 -215
- hud/cli/build.py +105 -45
- hud/cli/dev.py +614 -743
- hud/cli/eval.py +14 -9
- hud/cli/flows/tasks.py +100 -21
- hud/cli/init.py +18 -14
- hud/cli/push.py +27 -9
- hud/cli/rl/local_runner.py +28 -16
- hud/cli/rl/vllm.py +2 -0
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_eval.py +574 -0
- hud/cli/tests/test_mcp_server.py +6 -95
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/source_hash.py +1 -1
- hud/datasets/parallel.py +0 -12
- hud/datasets/runner.py +1 -4
- hud/rl/actor.py +4 -2
- hud/rl/distributed.py +1 -1
- hud/rl/learner.py +2 -1
- hud/rl/train.py +1 -1
- hud/server/__init__.py +2 -1
- hud/server/router.py +160 -0
- hud/server/server.py +246 -79
- hud/telemetry/trace.py +1 -1
- hud/tools/base.py +20 -10
- hud/tools/computer/__init__.py +2 -0
- hud/tools/computer/qwen.py +431 -0
- hud/tools/computer/settings.py +16 -0
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/playwright.py +1 -1
- hud/types.py +2 -3
- hud/utils/hud_console.py +43 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/METADATA +1 -1
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/RECORD +45 -42
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/WHEEL +0 -0
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/licenses/LICENSE +0 -0
hud/cli/tests/test_eval.py
ADDED
@@ -0,0 +1,574 @@
+"""Tests for hud.cli.eval module."""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+from mcp import types
+
+from hud.cli.eval import (
+    build_agent,
+    run_single_task,
+)
+from hud.types import Task, Trace
+
+
+class TestBuildAgent:
+    """Test the build_agent function."""
+
+    def test_builds_integration_test_agent(self) -> None:
+        """
+        Test building an integration test agent.
+        """
+        with patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with verbose=False
+            result = build_agent("integration_test", verbose=False)
+
+            mock_runner.assert_called_once_with(verbose=False)
+            assert result == mock_instance
+
+    def test_builds_claude_agent(self) -> None:
+        """
+        Test building a Claude agent with default model.
+        """
+        with patch("hud.agents.ClaudeAgent") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with verbose=False
+            result = build_agent("claude", verbose=False)
+
+            mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
+            assert result == mock_instance
+
+    def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
+        """
+        Test building a Claude agent with custom model name and allowed tools.
+        """
+        with patch("hud.agents.ClaudeAgent") as mock_runner:
+            mock_instance = Mock()
+            mock_runner.return_value = mock_instance
+
+            # Test with verbose=False
+            result = build_agent(
+                "claude",
+                model="claude-sonnet-4-20250514",
+                allowed_tools=["act"],
+                verbose=True,
+            )
+
+            mock_runner.assert_called_once_with(
+                model="claude-sonnet-4-20250514",
+                allowed_tools=["act"],
+                verbose=True,
+            )
+            assert result == mock_instance
+
+
+class TestRunSingleTask:
+    """Test the run_single_task function."""
+
+    @pytest.mark.asyncio
+    async def test_applies_agent_config_from_task(self) -> None:
+        """Test that task.agent_config is applied during agent initialization."""
+        mock_task = Task(
+            prompt="Test",
+            mcp_config={"local": {"url": "http://localhost:8765/mcp"}},
+            agent_config={
+                "system_prompt": "Custom instructions",
+                "allowed_tools": ["tool1", "tool2"],
+                "append_setup_output": False,
+            },
+        )
+        mock_agent = AsyncMock(
+            initialize=AsyncMock(), run=AsyncMock(return_value=Trace(reward=1.0, done=True))
+        )
+
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch(
+                "hud.agents.misc.integration_test_agent.IntegrationTestRunner",
+                return_value=mock_agent,
+            ),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
+            await run_single_task("test.json", agent_type="integration_test", max_steps=10)
+
+        # Verify agent.run was called with the task containing agent_config
+        mock_agent.run.assert_called_once()
+        called_task = mock_agent.run.call_args[0][0]
+        assert called_task.agent_config == mock_task.agent_config
+
+    @pytest.mark.asyncio
+    async def test_runs_with_group_size_greater_than_one(self) -> None:
+        """Test that group_size > 1 triggers run_tasks_grouped instead of agent.run."""
+        mock_task = Task(prompt="Test", mcp_config={"local": {"url": "http://localhost:8765/mcp"}})
+
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped,
+            patch("hud.cli.eval.display_group_statistics"),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
+            mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
+
+            await run_single_task(
+                "test.json", agent_type="integration_test", group_size=3, max_steps=10
+            )
+
+        # Verify run_tasks_grouped was called with correct group_size
+        mock_grouped.assert_called_once()
+        assert mock_grouped.call_args.kwargs["group_size"] == 3
+        assert mock_grouped.call_args.kwargs["max_steps"] == 10
+
+
+class TestToolFiltering:
+    """Test wildcard tool filtering via agent_config in tasks."""
+
+    @pytest.fixture
+    def mock_mcp_client(self):
+        """Fixture for mock MCP client."""
+        client = MagicMock()
+        client.initialize = AsyncMock()
+        client.mcp_config = {"local": {"url": "http://localhost"}}
+        return client
+
+    @pytest.fixture
+    def mock_model_client(self):
+        """Fixture for mock Anthropic client."""
+        return MagicMock()
+
+    async def _run_agent_with_tools(
+        self,
+        mock_mcp_client: MagicMock,
+        mock_model_client: MagicMock,
+        tools: list[types.Tool],
+        agent_config: dict | None = None,
+    ) -> list[types.Tool]:
+        """Helper to create agent, initialize with tools and config, return filtered tools."""
+        from hud.agents import ClaudeAgent
+
+        mock_mcp_client.list_tools = AsyncMock(return_value=tools)
+
+        task = Task(
+            prompt="Test",
+            mcp_config={"local": {"url": "http://localhost"}},
+            agent_config=agent_config or {},
+        )
+
+        agent = ClaudeAgent(
+            mcp_client=mock_mcp_client,
+            model_client=mock_model_client,
+            model="test",
+            validate_api_key=False,
+        )
+        await agent.initialize(task)
+        return agent.get_available_tools()
+
+    @pytest.mark.asyncio
+    async def test_no_filters_returns_all_tools(self, mock_mcp_client, mock_model_client) -> None:
+        """Test that no filters in agent_config returns all tools."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+            types.Tool(name="debug_tool", description="Debug", inputSchema={}),
+        ]
+
+        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools)
+
+        assert len(result) == 3
+
+    @pytest.mark.asyncio
+    async def test_allowed_tools_filters_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
+        """Test that allowed_tools in agent_config filters to matching patterns."""
+        tools = [
+            types.Tool(name="screenshot_take", description="Tool 1", inputSchema={}),
+            types.Tool(name="screenshot_full", description="Tool 2", inputSchema={}),
+            types.Tool(name="click", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {"allowed_tools": ["screenshot_*"]}
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
+        assert len(result) == 2
+        assert all("screenshot" in t.name for t in result)
+
+    @pytest.mark.asyncio
+    async def test_disallowed_tools_excludes_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
+        """Test that disallowed_tools in agent_config excludes matching patterns."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="debug_tool", description="Tool 2", inputSchema={}),
+            types.Tool(name="internal_secret", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {"disallowed_tools": ["debug_*", "internal_*"]}
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
+        assert len(result) == 1
+        assert result[0].name == "tool1"
+
+    @pytest.mark.asyncio
+    async def test_both_filters_applies_allowed_then_disallowed(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
+        """Test that both filters in agent_config work together (disallowed takes precedence)."""
+        tools = [
+            types.Tool(name="browser_click", description="Tool 1", inputSchema={}),
+            types.Tool(name="browser_debug", description="Tool 2", inputSchema={}),
+            types.Tool(name="system_click", description="Tool 3", inputSchema={}),
+        ]
+        agent_config = {"allowed_tools": ["browser_*"], "disallowed_tools": ["*_debug"]}
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
+        assert len(result) == 1
+        assert result[0].name == "browser_click"
+
+
+class TestRunDatasetToolFiltering:
+    """Test tool filtering via run_dataset with agent_config in both init and task."""
+
+    @pytest.fixture
+    def all_tools(self):
+        """Fixture for a standard set of tools."""
+        return [
+            types.Tool(name="browser_click", description="Click", inputSchema={}),
+            types.Tool(name="browser_type", description="Type", inputSchema={}),
+            types.Tool(name="browser_debug", description="Debug", inputSchema={}),
+            types.Tool(name="system_screenshot", description="Screenshot", inputSchema={}),
+            types.Tool(name="system_execute", description="Execute", inputSchema={}),
+        ]
+
+    @pytest.fixture
+    def captured_agent_fixture(self):
+        """Fixture that returns a dictionary to capture the agent instance."""
+        return {"agent": None}
+
+    @pytest.fixture
+    def mock_run_context(self, captured_agent_fixture):
+        """Fixture for mocking _run_context."""
+
+        async def _mock(self, context, max_steps=10):
+            captured_agent_fixture["agent"] = self
+            return Trace(reward=1.0, done=True, content="Done")
+
+        return _mock
+
+    @pytest.fixture
+    def mock_call_tools(self):
+        """Fixture for mocking call_tools."""
+
+        async def _mock(self, tool_call=None):
+            return []
+
+        return _mock
+
+    @pytest.fixture
+    def mock_client_instance(self, all_tools):
+        """Fixture for mock MCP client instance."""
+        mock_client = MagicMock()
+        mock_client.initialize = AsyncMock()
+        mock_client.list_tools = AsyncMock(return_value=all_tools)
+        mock_client.shutdown = AsyncMock()
+        mock_client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
+        return mock_client
+
+    @pytest.mark.asyncio
+    async def test_agent_config_intersection_union_via_run_dataset(
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
+    ) -> None:
+        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""  # noqa: E501
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        # Create a task with its own agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "allowed_tools": [
+                    "browser_*",
+                    "system_screenshot",
+                ],  # Task wants browser_* and system_screenshot
+                "disallowed_tools": [
+                    "*_debug",
+                    "*_execute",
+                ],  # Task disallows *_debug and *_execute
+            },
+        }
+
+        # Agent config passed to __init__ via run_dataset
+        agent_init_config = {
+            "allowed_tools": ["browser_*", "system_*"],  # Agent init wants browser_* and system_*
+            "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
+            "validate_api_key": False,
+        }
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Get the filtered tools
+        filtered_tools = captured_agent.get_available_tools()
+        filtered_names = {tool.name for tool in filtered_tools}
+
+        # Expected behavior:
+        # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]  # noqa: E501
+        #    Exact string intersection: only "browser_*" is in both lists
+        #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug  # noqa: E501
+        # 2. disallowed_tools union: ["browser_debug"] U ["*_debug", "*_execute"]
+        #    Result: ["browser_debug", "*_debug", "*_execute"] (all patterns included)
+        # 3. Final: {browser_click, browser_type, browser_debug} - {browser_debug}
+        #    Result: browser_click, browser_type
+
+        expected_tools = {"browser_click", "browser_type"}
+        assert filtered_names == expected_tools, (
+            f"Expected {expected_tools}, got {filtered_names}"
+        )
+
+    @pytest.mark.asyncio
+    async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
+    ) -> None:
+        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""  # noqa: E501
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        # Create a task with its own agent_config (no allowed_tools)
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                # No allowed_tools set - should allow all tools
+                "disallowed_tools": ["*_execute"],  # Task disallows *_execute
+            },
+        }
+
+        # Agent config passed to __init__ via run_dataset (no allowed_tools)
+        agent_init_config = {
+            # No allowed_tools set - should allow all tools
+            "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
+            "validate_api_key": False,
+        }
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Get the filtered tools
+        filtered_tools = captured_agent.get_available_tools()
+        filtered_names = {tool.name for tool in filtered_tools}
+
+        # Expected behavior:
+        # 1. allowed_tools: None (no allowed_tools set in either init or task)
+        #    Result: All tools are initially allowed
+        # 2. disallowed_tools union: ["browser_debug"] U ["*_execute"]
+        #    Result: ["browser_debug", "*_execute"] (all patterns included)
+        # 3. Final: {all tools} - {browser_debug, system_execute}
+        #    Result: browser_click, browser_type, system_screenshot
+
+        expected_tools = {"browser_click", "browser_type", "system_screenshot"}
+        assert filtered_names == expected_tools, (
+            f"Expected {expected_tools}, got {filtered_names}"
+        )
+
+
+class TestSystemPromptHandling:
+    """Test system prompt handling through run_dataset flow."""
+
+    @pytest.fixture
+    def mock_mcp_client(self):
+        """Fixture for mock MCP client."""
+        client = MagicMock()
+        client.initialize = AsyncMock()
+        client.list_tools = AsyncMock(return_value=[])
+        client.shutdown = AsyncMock()
+        client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
+        return client
+
+    @pytest.fixture
+    def captured_agent_fixture(self):
+        """Fixture that returns a dictionary to capture the agent instance."""
+        return {"agent": None}
+
+    @pytest.fixture
+    def mock_run_context(self, captured_agent_fixture):
+        """Fixture for mocking _run_context to capture agent."""

+        async def _mock(self, context, max_steps=10):
+            captured_agent_fixture["agent"] = self
+            return Trace(reward=1.0, done=True, content="Done")
+
+        return _mock
+
+    @pytest.fixture
+    def mock_call_tools(self):
+        """Fixture for mocking call_tools."""
+
+        async def _mock(self, tool_call=None):
+            return []
+
+        return _mock
+
+    @pytest.mark.asyncio
+    async def test_task_system_prompt_only(
+        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
+    ) -> None:
+        """Test that task system_prompt is appended when agent has default system prompt."""
+        from hud.agents import ClaudeAgent
+        from hud.agents.base import GLOBAL_SYSTEM_PROMPT
+        from hud.datasets.runner import run_dataset
+
+        task_system_prompt = "Task prompt"
+
+        # Create a task with its own system_prompt in agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "system_prompt": task_system_prompt,
+            },
+        }
+
+        # Agent config with no custom system_prompt (will use default)
+        agent_init_config = {
+            "validate_api_key": False,
+        }
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Verify the task system prompt was appended
+        assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
+        # Verify it starts with the base global system prompt
+        assert captured_agent.system_prompt.startswith(GLOBAL_SYSTEM_PROMPT)
+
+    @pytest.mark.asyncio
+    async def test_both_agent_and_task_system_prompts(
+        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
+    ) -> None:
+        """Test that both agent init and task system prompts are present when both are set."""
+        from hud.agents import ClaudeAgent
+        from hud.datasets.runner import run_dataset
+
+        agent_custom_prompt = "Agent init prompt"
+        task_system_prompt = "Task prompt"
+
+        # Create a task with its own system_prompt in agent_config
+        task_dict = {
+            "prompt": "Test task",
+            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
+            "agent_config": {
+                "system_prompt": task_system_prompt,
+            },
+        }
+
+        # Agent config WITH custom system_prompt
+        agent_init_config = {
+            "system_prompt": agent_custom_prompt,
+            "validate_api_key": False,
+        }
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
+            # Run the dataset
+            await run_dataset(
+                name="test_job",
+                dataset=[task_dict],
+                agent_class=ClaudeAgent,
+                agent_config=agent_init_config,
+                max_steps=10,
+            )
+
+        # Verify agent was created and ran
+        captured_agent = captured_agent_fixture["agent"]
+        assert captured_agent is not None
+
+        # Verify the task system prompt was appended at the end
+        assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
+        # Verify it starts with the agent custom prompt
+        assert captured_agent.system_prompt.startswith(agent_custom_prompt)
+        # Verify both prompts are present
+        assert agent_custom_prompt in captured_agent.system_prompt
+        assert task_system_prompt in captured_agent.system_prompt
hud/cli/tests/test_mcp_server.py
CHANGED
@@ -2,101 +2,15 @@
 
 from __future__ import annotations
 
-from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 
 import pytest
 
 from hud.cli.dev import (
-    create_proxy_server,
-    get_docker_cmd,
-    get_image_name,
     run_mcp_dev_server,
-    update_pyproject_toml,
 )
 
 
-class TestCreateMCPServer:
-    """Test MCP server creation."""
-
-    def test_create_mcp_server(self) -> None:
-        """Test that MCP server is created with correct configuration."""
-        mcp = create_proxy_server(".", "test-image:latest")
-        assert mcp._mcp_server.name == "HUD Dev Proxy - test-image:latest"
-        # Proxy server doesn't define its own tools, it forwards to Docker containers
-
-
-class TestDockerUtils:
-    """Test Docker utility functions."""
-
-    def test_get_docker_cmd(self) -> None:
-        """Test extracting CMD from Docker image."""
-        with patch("subprocess.run") as mock_run:
-            mock_result = MagicMock()
-            mock_result.returncode = 0
-            mock_result.stdout = '["python", "-m", "server"]'
-            mock_run.return_value = mock_result
-
-            cmd = get_docker_cmd("test-image:latest")
-            assert cmd is None
-
-    def test_get_docker_cmd_failure(self) -> None:
-        """Test handling when Docker inspect fails."""
-        import subprocess
-
-        with patch("subprocess.run") as mock_run:
-            # check=True causes CalledProcessError on non-zero return
-            mock_run.side_effect = subprocess.CalledProcessError(1, "docker inspect")
-
-            cmd = get_docker_cmd("test-image:latest")
-            assert cmd is None
-
-
-class TestImageResolution:
-    """Test image name resolution."""
-
-    def test_get_image_name_override(self) -> None:
-        """Test image name with override."""
-        name, source = get_image_name(".", "custom-image:v1")
-        assert name == "custom-image:v1"
-        assert source == "override"
-
-    def test_get_image_name_from_pyproject(self, tmp_path: Path) -> None:
-        """Test image name from pyproject.toml."""
-        pyproject = tmp_path / "pyproject.toml"
-        pyproject.write_text("""
-[tool.hud]
-image = "my-project:latest"
-""")
-
-        name, source = get_image_name(str(tmp_path))
-        assert name == "my-project:latest"
-        assert source == "cache"
-
-    def test_get_image_name_auto_generate(self, tmp_path: Path) -> None:
-        """Test auto-generated image name."""
-        test_dir = tmp_path / "my_test_project"
-        test_dir.mkdir()
-
-        name, source = get_image_name(str(test_dir))
-        assert name == "my-test-project:dev"
-        assert source == "auto"
-
-    def test_update_pyproject_toml(self, tmp_path: Path) -> None:
-        """Test updating pyproject.toml with image name."""
-        pyproject = tmp_path / "pyproject.toml"
-        pyproject.write_text("""
-[project]
-name = "test"
-""")
-
-        update_pyproject_toml(str(tmp_path), "new-image:v1", silent=True)
-
-        content = pyproject.read_text()
-        assert "[tool.hud]" in content
-        assert 'image = "new-image:v1"' in content
-
-
 class TestRunMCPDevServer:
     """Test the main server runner."""
 
@@ -110,16 +24,13 @@ class TestRunMCPDevServer:
             pytest.raises(click.Abort),
         ):
             run_mcp_dev_server(
-
-
-                build=False,
-                no_cache=False,
-                transport="http",
+                module=".",
+                stdio=False,
                 port=8765,
-                no_reload=False,
                 verbose=False,
                 inspector=False,
-                no_logs=False,
-                docker_args=[],
                 interactive=False,
+                watch=[],
+                docker=False,
+                docker_args=[],
             )