hud-python 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +40 -34
- hud/agents/grounded_openai.py +1 -1
- hud/cli/__init__.py +78 -213
- hud/cli/build.py +105 -45
- hud/cli/dev.py +614 -743
- hud/cli/flows/tasks.py +98 -17
- hud/cli/init.py +18 -14
- hud/cli/push.py +27 -9
- hud/cli/rl/local_runner.py +3 -3
- hud/cli/tests/test_eval.py +168 -119
- hud/cli/tests/test_mcp_server.py +6 -95
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/source_hash.py +1 -1
- hud/server/__init__.py +2 -1
- hud/server/router.py +160 -0
- hud/server/server.py +246 -79
- hud/tools/base.py +9 -1
- hud/tools/bash.py +2 -2
- hud/tools/edit.py +3 -7
- hud/utils/hud_console.py +43 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/METADATA +1 -1
- {hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/RECORD +27 -26
- {hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/WHEEL +0 -0
- {hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.48.dist-info → hud_python-0.4.50.dist-info}/licenses/LICENSE +0 -0
hud/cli/tests/test_eval.py
CHANGED
|
@@ -7,9 +7,13 @@ from unittest.mock import AsyncMock, MagicMock, Mock, patch
|
|
|
7
7
|
import pytest
|
|
8
8
|
from mcp import types
|
|
9
9
|
|
|
10
|
-
from hud.cli.eval import
|
|
10
|
+
from hud.cli.eval import (
|
|
11
|
+
build_agent,
|
|
12
|
+
run_single_task,
|
|
13
|
+
)
|
|
11
14
|
from hud.types import Task, Trace
|
|
12
15
|
|
|
16
|
+
|
|
13
17
|
class TestBuildAgent:
|
|
14
18
|
"""Test the build_agent function."""
|
|
15
19
|
|
|
@@ -20,10 +24,10 @@ class TestBuildAgent:
|
|
|
20
24
|
with patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner") as mock_runner:
|
|
21
25
|
mock_instance = Mock()
|
|
22
26
|
mock_runner.return_value = mock_instance
|
|
23
|
-
|
|
27
|
+
|
|
24
28
|
# Test with verbose=False
|
|
25
29
|
result = build_agent("integration_test", verbose=False)
|
|
26
|
-
|
|
30
|
+
|
|
27
31
|
mock_runner.assert_called_once_with(verbose=False)
|
|
28
32
|
assert result == mock_instance
|
|
29
33
|
|
|
@@ -34,14 +38,11 @@ class TestBuildAgent:
|
|
|
34
38
|
with patch("hud.agents.ClaudeAgent") as mock_runner:
|
|
35
39
|
mock_instance = Mock()
|
|
36
40
|
mock_runner.return_value = mock_instance
|
|
37
|
-
|
|
41
|
+
|
|
38
42
|
# Test with verbose=False
|
|
39
43
|
result = build_agent("claude", verbose=False)
|
|
40
|
-
|
|
41
|
-
mock_runner.assert_called_once_with(
|
|
42
|
-
model="claude-sonnet-4-20250514",
|
|
43
|
-
verbose=False
|
|
44
|
-
)
|
|
44
|
+
|
|
45
|
+
mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
|
|
45
46
|
assert result == mock_instance
|
|
46
47
|
|
|
47
48
|
def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
|
|
@@ -51,7 +52,7 @@ class TestBuildAgent:
|
|
|
51
52
|
with patch("hud.agents.ClaudeAgent") as mock_runner:
|
|
52
53
|
mock_instance = Mock()
|
|
53
54
|
mock_runner.return_value = mock_instance
|
|
54
|
-
|
|
55
|
+
|
|
55
56
|
# Test with verbose=False
|
|
56
57
|
result = build_agent(
|
|
57
58
|
"claude",
|
|
@@ -59,7 +60,7 @@ class TestBuildAgent:
|
|
|
59
60
|
allowed_tools=["act"],
|
|
60
61
|
verbose=True,
|
|
61
62
|
)
|
|
62
|
-
|
|
63
|
+
|
|
63
64
|
mock_runner.assert_called_once_with(
|
|
64
65
|
model="claude-sonnet-4-20250514",
|
|
65
66
|
allowed_tools=["act"],
|
|
@@ -81,19 +82,23 @@ class TestRunSingleTask:
|
|
|
81
82
|
"system_prompt": "Custom instructions",
|
|
82
83
|
"allowed_tools": ["tool1", "tool2"],
|
|
83
84
|
"append_setup_output": False,
|
|
84
|
-
}
|
|
85
|
+
},
|
|
85
86
|
)
|
|
86
87
|
mock_agent = AsyncMock(
|
|
87
|
-
initialize=AsyncMock(),
|
|
88
|
-
run=AsyncMock(return_value=Trace(reward=1.0, done=True))
|
|
88
|
+
initialize=AsyncMock(), run=AsyncMock(return_value=Trace(reward=1.0, done=True))
|
|
89
89
|
)
|
|
90
|
-
|
|
91
|
-
with
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
90
|
+
|
|
91
|
+
with (
|
|
92
|
+
patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
|
|
93
|
+
patch(
|
|
94
|
+
"hud.agents.misc.integration_test_agent.IntegrationTestRunner",
|
|
95
|
+
return_value=mock_agent,
|
|
96
|
+
),
|
|
97
|
+
patch("hud.cli.eval.find_environment_dir", return_value=None),
|
|
98
|
+
patch("hud.cli.eval.hud.trace"),
|
|
99
|
+
):
|
|
95
100
|
await run_single_task("test.json", agent_type="integration_test", max_steps=10)
|
|
96
|
-
|
|
101
|
+
|
|
97
102
|
# Verify agent.run was called with the task containing agent_config
|
|
98
103
|
mock_agent.run.assert_called_once()
|
|
99
104
|
called_task = mock_agent.run.call_args[0][0]
|
|
@@ -103,17 +108,20 @@ class TestRunSingleTask:
|
|
|
103
108
|
async def test_runs_with_group_size_greater_than_one(self) -> None:
|
|
104
109
|
"""Test that group_size > 1 triggers run_tasks_grouped instead of agent.run."""
|
|
105
110
|
mock_task = Task(prompt="Test", mcp_config={"local": {"url": "http://localhost:8765/mcp"}})
|
|
106
|
-
|
|
107
|
-
with
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
111
|
+
|
|
112
|
+
with (
|
|
113
|
+
patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
|
|
114
|
+
patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped,
|
|
115
|
+
patch("hud.cli.eval.display_group_statistics"),
|
|
116
|
+
patch("hud.cli.eval.find_environment_dir", return_value=None),
|
|
117
|
+
patch("hud.cli.eval.hud.trace"),
|
|
118
|
+
):
|
|
113
119
|
mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
|
|
114
|
-
|
|
115
|
-
await run_single_task(
|
|
116
|
-
|
|
120
|
+
|
|
121
|
+
await run_single_task(
|
|
122
|
+
"test.json", agent_type="integration_test", group_size=3, max_steps=10
|
|
123
|
+
)
|
|
124
|
+
|
|
117
125
|
# Verify run_tasks_grouped was called with correct group_size
|
|
118
126
|
mock_grouped.assert_called_once()
|
|
119
127
|
assert mock_grouped.call_args.kwargs["group_size"] == 3
|
|
@@ -145,20 +153,20 @@ class TestToolFiltering:
|
|
|
145
153
|
) -> list[types.Tool]:
|
|
146
154
|
"""Helper to create agent, initialize with tools and config, return filtered tools."""
|
|
147
155
|
from hud.agents import ClaudeAgent
|
|
148
|
-
|
|
156
|
+
|
|
149
157
|
mock_mcp_client.list_tools = AsyncMock(return_value=tools)
|
|
150
|
-
|
|
158
|
+
|
|
151
159
|
task = Task(
|
|
152
160
|
prompt="Test",
|
|
153
161
|
mcp_config={"local": {"url": "http://localhost"}},
|
|
154
|
-
agent_config=agent_config or {}
|
|
162
|
+
agent_config=agent_config or {},
|
|
155
163
|
)
|
|
156
|
-
|
|
164
|
+
|
|
157
165
|
agent = ClaudeAgent(
|
|
158
166
|
mcp_client=mock_mcp_client,
|
|
159
167
|
model_client=mock_model_client,
|
|
160
168
|
model="test",
|
|
161
|
-
validate_api_key=False
|
|
169
|
+
validate_api_key=False,
|
|
162
170
|
)
|
|
163
171
|
await agent.initialize(task)
|
|
164
172
|
return agent.get_available_tools()
|
|
@@ -171,13 +179,15 @@ class TestToolFiltering:
|
|
|
171
179
|
types.Tool(name="tool2", description="Tool 2", inputSchema={}),
|
|
172
180
|
types.Tool(name="debug_tool", description="Debug", inputSchema={}),
|
|
173
181
|
]
|
|
174
|
-
|
|
182
|
+
|
|
175
183
|
result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools)
|
|
176
|
-
|
|
184
|
+
|
|
177
185
|
assert len(result) == 3
|
|
178
186
|
|
|
179
187
|
@pytest.mark.asyncio
|
|
180
|
-
async def test_allowed_tools_filters_correctly(
|
|
188
|
+
async def test_allowed_tools_filters_correctly(
|
|
189
|
+
self, mock_mcp_client, mock_model_client
|
|
190
|
+
) -> None:
|
|
181
191
|
"""Test that allowed_tools in agent_config filters to matching patterns."""
|
|
182
192
|
tools = [
|
|
183
193
|
types.Tool(name="screenshot_take", description="Tool 1", inputSchema={}),
|
|
@@ -185,14 +195,18 @@ class TestToolFiltering:
|
|
|
185
195
|
types.Tool(name="click", description="Tool 3", inputSchema={}),
|
|
186
196
|
]
|
|
187
197
|
agent_config = {"allowed_tools": ["screenshot_*"]}
|
|
188
|
-
|
|
189
|
-
result = await self._run_agent_with_tools(
|
|
190
|
-
|
|
198
|
+
|
|
199
|
+
result = await self._run_agent_with_tools(
|
|
200
|
+
mock_mcp_client, mock_model_client, tools, agent_config
|
|
201
|
+
)
|
|
202
|
+
|
|
191
203
|
assert len(result) == 2
|
|
192
204
|
assert all("screenshot" in t.name for t in result)
|
|
193
205
|
|
|
194
206
|
@pytest.mark.asyncio
|
|
195
|
-
async def test_disallowed_tools_excludes_correctly(
|
|
207
|
+
async def test_disallowed_tools_excludes_correctly(
|
|
208
|
+
self, mock_mcp_client, mock_model_client
|
|
209
|
+
) -> None:
|
|
196
210
|
"""Test that disallowed_tools in agent_config excludes matching patterns."""
|
|
197
211
|
tools = [
|
|
198
212
|
types.Tool(name="tool1", description="Tool 1", inputSchema={}),
|
|
@@ -200,27 +214,30 @@ class TestToolFiltering:
|
|
|
200
214
|
types.Tool(name="internal_secret", description="Tool 3", inputSchema={}),
|
|
201
215
|
]
|
|
202
216
|
agent_config = {"disallowed_tools": ["debug_*", "internal_*"]}
|
|
203
|
-
|
|
204
|
-
result = await self._run_agent_with_tools(
|
|
205
|
-
|
|
217
|
+
|
|
218
|
+
result = await self._run_agent_with_tools(
|
|
219
|
+
mock_mcp_client, mock_model_client, tools, agent_config
|
|
220
|
+
)
|
|
221
|
+
|
|
206
222
|
assert len(result) == 1
|
|
207
223
|
assert result[0].name == "tool1"
|
|
208
224
|
|
|
209
225
|
@pytest.mark.asyncio
|
|
210
|
-
async def test_both_filters_applies_allowed_then_disallowed(
|
|
226
|
+
async def test_both_filters_applies_allowed_then_disallowed(
|
|
227
|
+
self, mock_mcp_client, mock_model_client
|
|
228
|
+
) -> None:
|
|
211
229
|
"""Test that both filters in agent_config work together (disallowed takes precedence)."""
|
|
212
230
|
tools = [
|
|
213
231
|
types.Tool(name="browser_click", description="Tool 1", inputSchema={}),
|
|
214
232
|
types.Tool(name="browser_debug", description="Tool 2", inputSchema={}),
|
|
215
233
|
types.Tool(name="system_click", description="Tool 3", inputSchema={}),
|
|
216
234
|
]
|
|
217
|
-
agent_config = {
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
235
|
+
agent_config = {"allowed_tools": ["browser_*"], "disallowed_tools": ["*_debug"]}
|
|
236
|
+
|
|
237
|
+
result = await self._run_agent_with_tools(
|
|
238
|
+
mock_mcp_client, mock_model_client, tools, agent_config
|
|
239
|
+
)
|
|
240
|
+
|
|
224
241
|
assert len(result) == 1
|
|
225
242
|
assert result[0].name == "browser_click"
|
|
226
243
|
|
|
@@ -247,16 +264,20 @@ class TestRunDatasetToolFiltering:
|
|
|
247
264
|
@pytest.fixture
|
|
248
265
|
def mock_run_context(self, captured_agent_fixture):
|
|
249
266
|
"""Fixture for mocking _run_context."""
|
|
267
|
+
|
|
250
268
|
async def _mock(self, context, max_steps=10):
|
|
251
269
|
captured_agent_fixture["agent"] = self
|
|
252
270
|
return Trace(reward=1.0, done=True, content="Done")
|
|
271
|
+
|
|
253
272
|
return _mock
|
|
254
273
|
|
|
255
274
|
@pytest.fixture
|
|
256
275
|
def mock_call_tools(self):
|
|
257
276
|
"""Fixture for mocking call_tools."""
|
|
277
|
+
|
|
258
278
|
async def _mock(self, tool_call=None):
|
|
259
279
|
return []
|
|
280
|
+
|
|
260
281
|
return _mock
|
|
261
282
|
|
|
262
283
|
@pytest.fixture
|
|
@@ -271,35 +292,47 @@ class TestRunDatasetToolFiltering:
|
|
|
271
292
|
|
|
272
293
|
@pytest.mark.asyncio
|
|
273
294
|
async def test_agent_config_intersection_union_via_run_dataset(
|
|
274
|
-
self,
|
|
295
|
+
self,
|
|
296
|
+
all_tools,
|
|
297
|
+
captured_agent_fixture,
|
|
298
|
+
mock_run_context,
|
|
299
|
+
mock_call_tools,
|
|
300
|
+
mock_client_instance,
|
|
275
301
|
) -> None:
|
|
276
|
-
"""Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""
|
|
302
|
+
"""Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config.""" # noqa: E501
|
|
277
303
|
from hud.agents import ClaudeAgent
|
|
278
304
|
from hud.datasets.runner import run_dataset
|
|
279
|
-
|
|
305
|
+
|
|
280
306
|
# Create a task with its own agent_config
|
|
281
307
|
task_dict = {
|
|
282
308
|
"prompt": "Test task",
|
|
283
309
|
"mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
|
|
284
310
|
"agent_config": {
|
|
285
|
-
"allowed_tools": [
|
|
286
|
-
|
|
287
|
-
|
|
311
|
+
"allowed_tools": [
|
|
312
|
+
"browser_*",
|
|
313
|
+
"system_screenshot",
|
|
314
|
+
], # Task wants browser_* and system_screenshot
|
|
315
|
+
"disallowed_tools": [
|
|
316
|
+
"*_debug",
|
|
317
|
+
"*_execute",
|
|
318
|
+
], # Task disallows *_debug and *_execute
|
|
319
|
+
},
|
|
288
320
|
}
|
|
289
|
-
|
|
321
|
+
|
|
290
322
|
# Agent config passed to __init__ via run_dataset
|
|
291
323
|
agent_init_config = {
|
|
292
324
|
"allowed_tools": ["browser_*", "system_*"], # Agent init wants browser_* and system_*
|
|
293
325
|
"disallowed_tools": ["browser_debug"], # Agent init disallows browser_debug
|
|
294
326
|
"validate_api_key": False,
|
|
295
327
|
}
|
|
296
|
-
|
|
297
|
-
with
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
328
|
+
|
|
329
|
+
with (
|
|
330
|
+
patch("hud.job"),
|
|
331
|
+
patch("hud.trace"),
|
|
332
|
+
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
333
|
+
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
334
|
+
patch("hud.clients.MCPClient", return_value=mock_client_instance),
|
|
335
|
+
):
|
|
303
336
|
# Run the dataset
|
|
304
337
|
await run_dataset(
|
|
305
338
|
name="test_job",
|
|
@@ -308,35 +341,42 @@ class TestRunDatasetToolFiltering:
|
|
|
308
341
|
agent_config=agent_init_config,
|
|
309
342
|
max_steps=10,
|
|
310
343
|
)
|
|
311
|
-
|
|
344
|
+
|
|
312
345
|
# Verify agent was created and ran
|
|
313
346
|
captured_agent = captured_agent_fixture["agent"]
|
|
314
347
|
assert captured_agent is not None
|
|
315
|
-
|
|
348
|
+
|
|
316
349
|
# Get the filtered tools
|
|
317
350
|
filtered_tools = captured_agent.get_available_tools()
|
|
318
351
|
filtered_names = {tool.name for tool in filtered_tools}
|
|
319
|
-
|
|
352
|
+
|
|
320
353
|
# Expected behavior:
|
|
321
|
-
# 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]
|
|
354
|
+
# 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"] # noqa: E501
|
|
322
355
|
# Exact string intersection: only "browser_*" is in both lists
|
|
323
|
-
# So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug
|
|
324
|
-
# 2. disallowed_tools union: ["browser_debug"]
|
|
356
|
+
# So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug # noqa: E501
|
|
357
|
+
# 2. disallowed_tools union: ["browser_debug"] U ["*_debug", "*_execute"]
|
|
325
358
|
# Result: ["browser_debug", "*_debug", "*_execute"] (all patterns included)
|
|
326
359
|
# 3. Final: {browser_click, browser_type, browser_debug} - {browser_debug}
|
|
327
360
|
# Result: browser_click, browser_type
|
|
328
|
-
|
|
361
|
+
|
|
329
362
|
expected_tools = {"browser_click", "browser_type"}
|
|
330
|
-
assert filtered_names == expected_tools,
|
|
363
|
+
assert filtered_names == expected_tools, (
|
|
364
|
+
f"Expected {expected_tools}, got {filtered_names}"
|
|
365
|
+
)
|
|
331
366
|
|
|
332
367
|
@pytest.mark.asyncio
|
|
333
368
|
async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
|
|
334
|
-
self,
|
|
369
|
+
self,
|
|
370
|
+
all_tools,
|
|
371
|
+
captured_agent_fixture,
|
|
372
|
+
mock_run_context,
|
|
373
|
+
mock_call_tools,
|
|
374
|
+
mock_client_instance,
|
|
335
375
|
) -> None:
|
|
336
|
-
"""Test that when allowed_tools is not set, all tools are available except disallowed ones."""
|
|
376
|
+
"""Test that when allowed_tools is not set, all tools are available except disallowed ones.""" # noqa: E501
|
|
337
377
|
from hud.agents import ClaudeAgent
|
|
338
378
|
from hud.datasets.runner import run_dataset
|
|
339
|
-
|
|
379
|
+
|
|
340
380
|
# Create a task with its own agent_config (no allowed_tools)
|
|
341
381
|
task_dict = {
|
|
342
382
|
"prompt": "Test task",
|
|
@@ -344,22 +384,23 @@ class TestRunDatasetToolFiltering:
|
|
|
344
384
|
"agent_config": {
|
|
345
385
|
# No allowed_tools set - should allow all tools
|
|
346
386
|
"disallowed_tools": ["*_execute"], # Task disallows *_execute
|
|
347
|
-
}
|
|
387
|
+
},
|
|
348
388
|
}
|
|
349
|
-
|
|
389
|
+
|
|
350
390
|
# Agent config passed to __init__ via run_dataset (no allowed_tools)
|
|
351
391
|
agent_init_config = {
|
|
352
392
|
# No allowed_tools set - should allow all tools
|
|
353
393
|
"disallowed_tools": ["browser_debug"], # Agent init disallows browser_debug
|
|
354
394
|
"validate_api_key": False,
|
|
355
395
|
}
|
|
356
|
-
|
|
357
|
-
with
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
396
|
+
|
|
397
|
+
with (
|
|
398
|
+
patch("hud.job"),
|
|
399
|
+
patch("hud.trace"),
|
|
400
|
+
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
401
|
+
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
402
|
+
patch("hud.clients.MCPClient", return_value=mock_client_instance),
|
|
403
|
+
):
|
|
363
404
|
# Run the dataset
|
|
364
405
|
await run_dataset(
|
|
365
406
|
name="test_job",
|
|
@@ -368,25 +409,27 @@ class TestRunDatasetToolFiltering:
|
|
|
368
409
|
agent_config=agent_init_config,
|
|
369
410
|
max_steps=10,
|
|
370
411
|
)
|
|
371
|
-
|
|
412
|
+
|
|
372
413
|
# Verify agent was created and ran
|
|
373
414
|
captured_agent = captured_agent_fixture["agent"]
|
|
374
415
|
assert captured_agent is not None
|
|
375
|
-
|
|
416
|
+
|
|
376
417
|
# Get the filtered tools
|
|
377
418
|
filtered_tools = captured_agent.get_available_tools()
|
|
378
419
|
filtered_names = {tool.name for tool in filtered_tools}
|
|
379
|
-
|
|
420
|
+
|
|
380
421
|
# Expected behavior:
|
|
381
422
|
# 1. allowed_tools: None (no allowed_tools set in either init or task)
|
|
382
423
|
# Result: All tools are initially allowed
|
|
383
|
-
# 2. disallowed_tools union: ["browser_debug"]
|
|
424
|
+
# 2. disallowed_tools union: ["browser_debug"] U ["*_execute"]
|
|
384
425
|
# Result: ["browser_debug", "*_execute"] (all patterns included)
|
|
385
426
|
# 3. Final: {all tools} - {browser_debug, system_execute}
|
|
386
427
|
# Result: browser_click, browser_type, system_screenshot
|
|
387
|
-
|
|
428
|
+
|
|
388
429
|
expected_tools = {"browser_click", "browser_type", "system_screenshot"}
|
|
389
|
-
assert filtered_names == expected_tools,
|
|
430
|
+
assert filtered_names == expected_tools, (
|
|
431
|
+
f"Expected {expected_tools}, got {filtered_names}"
|
|
432
|
+
)
|
|
390
433
|
|
|
391
434
|
|
|
392
435
|
class TestSystemPromptHandling:
|
|
@@ -410,16 +453,20 @@ class TestSystemPromptHandling:
|
|
|
410
453
|
@pytest.fixture
|
|
411
454
|
def mock_run_context(self, captured_agent_fixture):
|
|
412
455
|
"""Fixture for mocking _run_context to capture agent."""
|
|
456
|
+
|
|
413
457
|
async def _mock(self, context, max_steps=10):
|
|
414
458
|
captured_agent_fixture["agent"] = self
|
|
415
459
|
return Trace(reward=1.0, done=True, content="Done")
|
|
460
|
+
|
|
416
461
|
return _mock
|
|
417
462
|
|
|
418
463
|
@pytest.fixture
|
|
419
464
|
def mock_call_tools(self):
|
|
420
465
|
"""Fixture for mocking call_tools."""
|
|
466
|
+
|
|
421
467
|
async def _mock(self, tool_call=None):
|
|
422
468
|
return []
|
|
469
|
+
|
|
423
470
|
return _mock
|
|
424
471
|
|
|
425
472
|
@pytest.mark.asyncio
|
|
@@ -430,29 +477,30 @@ class TestSystemPromptHandling:
|
|
|
430
477
|
from hud.agents import ClaudeAgent
|
|
431
478
|
from hud.agents.base import GLOBAL_SYSTEM_PROMPT
|
|
432
479
|
from hud.datasets.runner import run_dataset
|
|
433
|
-
|
|
480
|
+
|
|
434
481
|
task_system_prompt = "Task prompt"
|
|
435
|
-
|
|
482
|
+
|
|
436
483
|
# Create a task with its own system_prompt in agent_config
|
|
437
484
|
task_dict = {
|
|
438
485
|
"prompt": "Test task",
|
|
439
486
|
"mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
|
|
440
487
|
"agent_config": {
|
|
441
488
|
"system_prompt": task_system_prompt,
|
|
442
|
-
}
|
|
489
|
+
},
|
|
443
490
|
}
|
|
444
|
-
|
|
491
|
+
|
|
445
492
|
# Agent config with no custom system_prompt (will use default)
|
|
446
493
|
agent_init_config = {
|
|
447
494
|
"validate_api_key": False,
|
|
448
495
|
}
|
|
449
|
-
|
|
450
|
-
with
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
496
|
+
|
|
497
|
+
with (
|
|
498
|
+
patch("hud.job"),
|
|
499
|
+
patch("hud.trace"),
|
|
500
|
+
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
501
|
+
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
502
|
+
patch("hud.clients.MCPClient", return_value=mock_mcp_client),
|
|
503
|
+
):
|
|
456
504
|
# Run the dataset
|
|
457
505
|
await run_dataset(
|
|
458
506
|
name="test_job",
|
|
@@ -461,11 +509,11 @@ class TestSystemPromptHandling:
|
|
|
461
509
|
agent_config=agent_init_config,
|
|
462
510
|
max_steps=10,
|
|
463
511
|
)
|
|
464
|
-
|
|
512
|
+
|
|
465
513
|
# Verify agent was created and ran
|
|
466
514
|
captured_agent = captured_agent_fixture["agent"]
|
|
467
515
|
assert captured_agent is not None
|
|
468
|
-
|
|
516
|
+
|
|
469
517
|
# Verify the task system prompt was appended
|
|
470
518
|
assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
|
|
471
519
|
# Verify it starts with the base global system prompt
|
|
@@ -478,31 +526,32 @@ class TestSystemPromptHandling:
|
|
|
478
526
|
"""Test that both agent init and task system prompts are present when both are set."""
|
|
479
527
|
from hud.agents import ClaudeAgent
|
|
480
528
|
from hud.datasets.runner import run_dataset
|
|
481
|
-
|
|
529
|
+
|
|
482
530
|
agent_custom_prompt = "Agent init prompt"
|
|
483
531
|
task_system_prompt = "Task prompt"
|
|
484
|
-
|
|
532
|
+
|
|
485
533
|
# Create a task with its own system_prompt in agent_config
|
|
486
534
|
task_dict = {
|
|
487
535
|
"prompt": "Test task",
|
|
488
536
|
"mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
|
|
489
537
|
"agent_config": {
|
|
490
538
|
"system_prompt": task_system_prompt,
|
|
491
|
-
}
|
|
539
|
+
},
|
|
492
540
|
}
|
|
493
|
-
|
|
541
|
+
|
|
494
542
|
# Agent config WITH custom system_prompt
|
|
495
543
|
agent_init_config = {
|
|
496
544
|
"system_prompt": agent_custom_prompt,
|
|
497
545
|
"validate_api_key": False,
|
|
498
546
|
}
|
|
499
|
-
|
|
500
|
-
with
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
547
|
+
|
|
548
|
+
with (
|
|
549
|
+
patch("hud.job"),
|
|
550
|
+
patch("hud.trace"),
|
|
551
|
+
patch.object(ClaudeAgent, "_run_context", mock_run_context),
|
|
552
|
+
patch.object(ClaudeAgent, "call_tools", mock_call_tools),
|
|
553
|
+
patch("hud.clients.MCPClient", return_value=mock_mcp_client),
|
|
554
|
+
):
|
|
506
555
|
# Run the dataset
|
|
507
556
|
await run_dataset(
|
|
508
557
|
name="test_job",
|
|
@@ -511,11 +560,11 @@ class TestSystemPromptHandling:
|
|
|
511
560
|
agent_config=agent_init_config,
|
|
512
561
|
max_steps=10,
|
|
513
562
|
)
|
|
514
|
-
|
|
563
|
+
|
|
515
564
|
# Verify agent was created and ran
|
|
516
565
|
captured_agent = captured_agent_fixture["agent"]
|
|
517
566
|
assert captured_agent is not None
|
|
518
|
-
|
|
567
|
+
|
|
519
568
|
# Verify the task system prompt was appended at the end
|
|
520
569
|
assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
|
|
521
570
|
# Verify it starts with the agent custom prompt
|