hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (45)
  1. hud/agents/base.py +55 -142
  2. hud/agents/claude.py +5 -6
  3. hud/agents/grounded_openai.py +1 -1
  4. hud/agents/misc/integration_test_agent.py +2 -0
  5. hud/agents/tests/test_base.py +2 -5
  6. hud/cli/__init__.py +80 -215
  7. hud/cli/build.py +105 -45
  8. hud/cli/dev.py +614 -743
  9. hud/cli/eval.py +14 -9
  10. hud/cli/flows/tasks.py +100 -21
  11. hud/cli/init.py +18 -14
  12. hud/cli/push.py +27 -9
  13. hud/cli/rl/local_runner.py +28 -16
  14. hud/cli/rl/vllm.py +2 -0
  15. hud/cli/tests/test_analyze_metadata.py +3 -2
  16. hud/cli/tests/test_eval.py +574 -0
  17. hud/cli/tests/test_mcp_server.py +6 -95
  18. hud/cli/tests/test_utils.py +1 -1
  19. hud/cli/utils/env_check.py +9 -9
  20. hud/cli/utils/source_hash.py +1 -1
  21. hud/datasets/parallel.py +0 -12
  22. hud/datasets/runner.py +1 -4
  23. hud/rl/actor.py +4 -2
  24. hud/rl/distributed.py +1 -1
  25. hud/rl/learner.py +2 -1
  26. hud/rl/train.py +1 -1
  27. hud/server/__init__.py +2 -1
  28. hud/server/router.py +160 -0
  29. hud/server/server.py +246 -79
  30. hud/telemetry/trace.py +1 -1
  31. hud/tools/base.py +20 -10
  32. hud/tools/computer/__init__.py +2 -0
  33. hud/tools/computer/qwen.py +431 -0
  34. hud/tools/computer/settings.py +16 -0
  35. hud/tools/executors/pyautogui.py +1 -1
  36. hud/tools/playwright.py +1 -1
  37. hud/types.py +2 -3
  38. hud/utils/hud_console.py +43 -0
  39. hud/utils/tests/test_version.py +1 -1
  40. hud/version.py +1 -1
  41. {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/METADATA +1 -1
  42. {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/RECORD +45 -42
  43. {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/WHEEL +0 -0
  44. {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/entry_points.txt +0 -0
  45. {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,574 @@
1
+ """Tests for hud.cli.eval module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unittest.mock import AsyncMock, MagicMock, Mock, patch
6
+
7
+ import pytest
8
+ from mcp import types
9
+
10
+ from hud.cli.eval import (
11
+ build_agent,
12
+ run_single_task,
13
+ )
14
+ from hud.types import Task, Trace
15
+
16
+
17
class TestBuildAgent:
    """Exercise ``build_agent`` for the integration-test and Claude agent types."""

    def test_builds_integration_test_agent(self) -> None:
        """``integration_test`` returns whatever IntegrationTestRunner constructs."""
        runner_path = "hud.agents.misc.integration_test_agent.IntegrationTestRunner"
        with patch(runner_path) as runner_cls:
            fake_runner = Mock()
            runner_cls.return_value = fake_runner

            built = build_agent("integration_test", verbose=False)

            # The runner class must be called exactly once, with only the verbosity flag.
            runner_cls.assert_called_once_with(verbose=False)
            assert built == fake_runner

    def test_builds_claude_agent(self) -> None:
        """``claude`` without a model argument constructs ClaudeAgent with the expected model."""
        with patch("hud.agents.ClaudeAgent") as agent_cls:
            fake_agent = Mock()
            agent_cls.return_value = fake_agent

            built = build_agent("claude", verbose=False)

            agent_cls.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
            assert built == fake_agent

    def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
        """An explicit model, ``allowed_tools`` and ``verbose=True`` reach ClaudeAgent."""
        # NOTE(review): the model passed here is the same string the default-model
        # test above expects, so this case does not distinguish "custom" from
        # "default" -- consider a different model id if that distinction matters.
        with patch("hud.agents.ClaudeAgent") as agent_cls:
            fake_agent = Mock()
            agent_cls.return_value = fake_agent

            built = build_agent(
                "claude",
                model="claude-sonnet-4-20250514",
                allowed_tools=["act"],
                verbose=True,
            )

            # All three keyword arguments must be forwarded unchanged.
            agent_cls.assert_called_once_with(
                model="claude-sonnet-4-20250514",
                allowed_tools=["act"],
                verbose=True,
            )
            assert built == fake_agent
70
+
71
+
72
class TestRunSingleTask:
    """Exercise ``run_single_task`` with task loading and tracing patched out."""

    @pytest.mark.asyncio
    async def test_applies_agent_config_from_task(self) -> None:
        """The task handed to ``agent.run`` carries the loaded task's ``agent_config``."""
        configured_task = Task(
            prompt="Test",
            mcp_config={"local": {"url": "http://localhost:8765/mcp"}},
            agent_config={
                "system_prompt": "Custom instructions",
                "allowed_tools": ["tool1", "tool2"],
                "append_setup_output": False,
            },
        )
        finished_trace = Trace(reward=1.0, done=True)
        fake_agent = AsyncMock(
            initialize=AsyncMock(),
            run=AsyncMock(return_value=finished_trace),
        )

        with (
            patch("hud.utils.tasks.load_tasks", return_value=[configured_task]),
            patch(
                "hud.agents.misc.integration_test_agent.IntegrationTestRunner",
                return_value=fake_agent,
            ),
            patch("hud.cli.eval.find_environment_dir", return_value=None),
            patch("hud.cli.eval.hud.trace"),
        ):
            await run_single_task("test.json", agent_type="integration_test", max_steps=10)

        # Exactly one run, and its first positional argument is the configured task.
        fake_agent.run.assert_called_once()
        task_given_to_run = fake_agent.run.call_args.args[0]
        assert task_given_to_run.agent_config == configured_task.agent_config

    @pytest.mark.asyncio
    async def test_runs_with_group_size_greater_than_one(self) -> None:
        """``group_size=3`` calls ``run_tasks_grouped`` once with that size and ``max_steps``."""
        plain_task = Task(
            prompt="Test",
            mcp_config={"local": {"url": "http://localhost:8765/mcp"}},
        )

        with (
            patch("hud.utils.tasks.load_tasks", return_value=[plain_task]),
            patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as grouped_runner,
            patch("hud.cli.eval.display_group_statistics"),
            patch("hud.cli.eval.find_environment_dir", return_value=None),
            patch("hud.cli.eval.hud.trace"),
        ):
            grouped_runner.return_value = [{"task": plain_task, "rewards": [1.0, 0.5]}]

            await run_single_task(
                "test.json",
                agent_type="integration_test",
                group_size=3,
                max_steps=10,
            )

        # Both limits must arrive at the grouped runner as keyword arguments.
        grouped_runner.assert_called_once()
        forwarded_kwargs = grouped_runner.call_args.kwargs
        assert forwarded_kwargs["group_size"] == 3
        assert forwarded_kwargs["max_steps"] == 10
129
+
130
+
131
class TestToolFiltering:
    """Check how allow/deny patterns in a task's ``agent_config`` shape the agent's tools."""

    @pytest.fixture
    def mock_mcp_client(self):
        """Provide a MagicMock MCP client with an awaitable ``initialize`` and a local config."""
        fake_client = MagicMock()
        fake_client.mcp_config = {"local": {"url": "http://localhost"}}
        fake_client.initialize = AsyncMock()
        return fake_client

    @pytest.fixture
    def mock_model_client(self):
        """Provide a bare MagicMock standing in for the model client."""
        return MagicMock()

    async def _run_agent_with_tools(
        self,
        mock_mcp_client: MagicMock,
        mock_model_client: MagicMock,
        tools: list[types.Tool],
        agent_config: dict | None = None,
    ) -> list[types.Tool]:
        """Initialize a ClaudeAgent on a task using ``agent_config`` and return its tools.

        ``tools`` is what the stubbed ``list_tools`` returns; a missing
        ``agent_config`` is replaced by an empty dict.
        """
        from hud.agents import ClaudeAgent

        # list_tools is stubbed so the client reports exactly the supplied tools.
        mock_mcp_client.list_tools = AsyncMock(return_value=tools)

        filtering_task = Task(
            prompt="Test",
            mcp_config={"local": {"url": "http://localhost"}},
            agent_config=agent_config or {},
        )
        claude = ClaudeAgent(
            mcp_client=mock_mcp_client,
            model_client=mock_model_client,
            model="test",
            validate_api_key=False,
        )

        await claude.initialize(filtering_task)
        return claude.get_available_tools()

    @pytest.mark.asyncio
    async def test_no_filters_returns_all_tools(self, mock_mcp_client, mock_model_client) -> None:
        """With no ``agent_config`` filters, all three tools stay available."""
        catalogue = [
            types.Tool(name=tool_name, description=summary, inputSchema={})
            for tool_name, summary in (
                ("tool1", "Tool 1"),
                ("tool2", "Tool 2"),
                ("debug_tool", "Debug"),
            )
        ]

        available = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, catalogue)

        assert len(available) == 3

    @pytest.mark.asyncio
    async def test_allowed_tools_filters_correctly(
        self, mock_mcp_client, mock_model_client
    ) -> None:
        """``allowed_tools=["screenshot_*"]`` keeps the two screenshot tools only."""
        catalogue = [
            types.Tool(name=tool_name, description=summary, inputSchema={})
            for tool_name, summary in (
                ("screenshot_take", "Tool 1"),
                ("screenshot_full", "Tool 2"),
                ("click", "Tool 3"),
            )
        ]
        only_screenshots = {"allowed_tools": ["screenshot_*"]}

        available = await self._run_agent_with_tools(
            mock_mcp_client, mock_model_client, catalogue, only_screenshots
        )

        assert len(available) == 2
        assert all("screenshot" in tool.name for tool in available)

    @pytest.mark.asyncio
    async def test_disallowed_tools_excludes_correctly(
        self, mock_mcp_client, mock_model_client
    ) -> None:
        """``disallowed_tools`` patterns remove the debug and internal tools."""
        catalogue = [
            types.Tool(name=tool_name, description=summary, inputSchema={})
            for tool_name, summary in (
                ("tool1", "Tool 1"),
                ("debug_tool", "Tool 2"),
                ("internal_secret", "Tool 3"),
            )
        ]
        deny_debug_and_internal = {"disallowed_tools": ["debug_*", "internal_*"]}

        available = await self._run_agent_with_tools(
            mock_mcp_client, mock_model_client, catalogue, deny_debug_and_internal
        )

        assert len(available) == 1
        assert available[0].name == "tool1"

    @pytest.mark.asyncio
    async def test_both_filters_applies_allowed_then_disallowed(
        self, mock_mcp_client, mock_model_client
    ) -> None:
        """With both filters set, a tool matching both patterns is excluded."""
        catalogue = [
            types.Tool(name=tool_name, description=summary, inputSchema={})
            for tool_name, summary in (
                ("browser_click", "Tool 1"),
                ("browser_debug", "Tool 2"),
                ("system_click", "Tool 3"),
            )
        ]
        # browser_debug matches both patterns; the expected result shows the deny side wins.
        browser_without_debug = {"allowed_tools": ["browser_*"], "disallowed_tools": ["*_debug"]}

        available = await self._run_agent_with_tools(
            mock_mcp_client, mock_model_client, catalogue, browser_without_debug
        )

        assert len(available) == 1
        assert available[0].name == "browser_click"
243
+
244
+
245
class TestRunDatasetToolFiltering:
    """Check tool filtering when ``run_dataset`` gets filters from both init config and task."""

    @pytest.fixture
    def all_tools(self):
        """Provide the five browser/system tools that the mock client lists."""
        return [
            types.Tool(name=tool_name, description=summary, inputSchema={})
            for tool_name, summary in (
                ("browser_click", "Click"),
                ("browser_type", "Type"),
                ("browser_debug", "Debug"),
                ("system_screenshot", "Screenshot"),
                ("system_execute", "Execute"),
            )
        ]

    @pytest.fixture
    def captured_agent_fixture(self):
        """Provide a one-slot dict in which the patched run method stores the agent."""
        return {"agent": None}

    @pytest.fixture
    def mock_run_context(self, captured_agent_fixture):
        """Provide a ``_run_context`` replacement that records ``self`` and returns a Trace."""

        async def _record_and_finish(self, context, max_steps=10):
            # ``self`` is the agent instance, because this is patched onto the class.
            captured_agent_fixture["agent"] = self
            return Trace(reward=1.0, done=True, content="Done")

        return _record_and_finish

    @pytest.fixture
    def mock_call_tools(self):
        """Provide a ``call_tools`` replacement that returns an empty list."""

        async def _no_results(self, tool_call=None):
            return []

        return _no_results

    @pytest.fixture
    def mock_client_instance(self, all_tools):
        """Provide a MagicMock MCP client whose ``list_tools`` returns ``all_tools``."""
        fake_client = MagicMock()
        fake_client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
        fake_client.initialize = AsyncMock()
        fake_client.list_tools = AsyncMock(return_value=all_tools)
        fake_client.shutdown = AsyncMock()
        return fake_client

    async def _run_and_collect_tool_names(
        self,
        *,
        task_dict,
        agent_init_config,
        captured,
        run_context,
        call_tools,
        client,
    ):
        """Run ``run_dataset`` on one task under patches and return the agent's tool names.

        Asserts that the patched ``_run_context`` captured an agent, then returns
        the set of names from that agent's ``get_available_tools()``.
        """
        from hud.agents import ClaudeAgent
        from hud.datasets.runner import run_dataset

        with (
            patch("hud.job"),
            patch("hud.trace"),
            patch.object(ClaudeAgent, "_run_context", run_context),
            patch.object(ClaudeAgent, "call_tools", call_tools),
            patch("hud.clients.MCPClient", return_value=client),
        ):
            await run_dataset(
                name="test_job",
                dataset=[task_dict],
                agent_class=ClaudeAgent,
                agent_config=agent_init_config,
                max_steps=10,
            )

        # The agent must have been created and run for the capture slot to be filled.
        captured_agent = captured["agent"]
        assert captured_agent is not None

        return {tool.name for tool in captured_agent.get_available_tools()}

    @pytest.mark.asyncio
    async def test_agent_config_intersection_union_via_run_dataset(
        self,
        all_tools,
        captured_agent_fixture,
        mock_run_context,
        mock_call_tools,
        mock_client_instance,
    ) -> None:
        """Allow-lists from init and task intersect; deny-lists from both are combined."""
        # The task allows browser_* plus system_screenshot and denies *_debug and *_execute.
        task_dict = {
            "prompt": "Test task",
            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
            "agent_config": {
                "allowed_tools": ["browser_*", "system_screenshot"],
                "disallowed_tools": ["*_debug", "*_execute"],
            },
        }

        # The init config allows browser_* and system_*, and denies browser_debug.
        agent_init_config = {
            "allowed_tools": ["browser_*", "system_*"],
            "disallowed_tools": ["browser_debug"],
            "validate_api_key": False,
        }

        filtered_names = await self._run_and_collect_tool_names(
            task_dict=task_dict,
            agent_init_config=agent_init_config,
            captured=captured_agent_fixture,
            run_context=mock_run_context,
            call_tools=mock_call_tools,
            client=mock_client_instance,
        )

        # Why only the two browser tools are expected (reasoning carried over from the
        # original notes; the merge logic itself lives in the agent code, not here):
        #   1. allowed_tools: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"].
        #      As an exact-string intersection only "browser_*" is shared, so the
        #      candidates are browser_click, browser_type and browser_debug.
        #   2. disallowed_tools: ["browser_debug"] U ["*_debug", "*_execute"]
        #      = ["browser_debug", "*_debug", "*_execute"].
        #   3. Candidates minus deny matches leaves browser_click and browser_type.
        expected_tools = {"browser_click", "browser_type"}
        assert filtered_names == expected_tools, (
            f"Expected {expected_tools}, got {filtered_names}"
        )

    @pytest.mark.asyncio
    async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
        self,
        all_tools,
        captured_agent_fixture,
        mock_run_context,
        mock_call_tools,
        mock_client_instance,
    ) -> None:
        """Without any allow-list, every tool is kept except those matching a deny pattern."""
        # The task sets no allowed_tools; it only denies *_execute.
        task_dict = {
            "prompt": "Test task",
            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
            "agent_config": {
                "disallowed_tools": ["*_execute"],
            },
        }

        # The init config also sets no allowed_tools; it only denies browser_debug.
        agent_init_config = {
            "disallowed_tools": ["browser_debug"],
            "validate_api_key": False,
        }

        filtered_names = await self._run_and_collect_tool_names(
            task_dict=task_dict,
            agent_init_config=agent_init_config,
            captured=captured_agent_fixture,
            run_context=mock_run_context,
            call_tools=mock_call_tools,
            client=mock_client_instance,
        )

        # Expected outcome:
        #   1. No allow-list on either side, so all five tools start out allowed.
        #   2. disallowed_tools: ["browser_debug"] U ["*_execute"]
        #      = ["browser_debug", "*_execute"].
        #   3. Removing browser_debug and system_execute leaves the three names below.
        expected_tools = {"browser_click", "browser_type", "system_screenshot"}
        assert filtered_names == expected_tools, (
            f"Expected {expected_tools}, got {filtered_names}"
        )
433
+
434
+
435
class TestSystemPromptHandling:
    """Check how init-level and task-level system prompts combine via ``run_dataset``."""

    @pytest.fixture
    def mock_mcp_client(self):
        """Provide a MagicMock MCP client that lists no tools."""
        fake_client = MagicMock()
        fake_client.mcp_config = {"local": {"url": "http://localhost:8765/mcp"}}
        fake_client.initialize = AsyncMock()
        fake_client.list_tools = AsyncMock(return_value=[])
        fake_client.shutdown = AsyncMock()
        return fake_client

    @pytest.fixture
    def captured_agent_fixture(self):
        """Provide a one-slot dict in which the patched run method stores the agent."""
        return {"agent": None}

    @pytest.fixture
    def mock_run_context(self, captured_agent_fixture):
        """Provide a ``_run_context`` replacement that records ``self`` and returns a Trace."""

        async def _record_and_finish(self, context, max_steps=10):
            # ``self`` is the agent instance, because this is patched onto the class.
            captured_agent_fixture["agent"] = self
            return Trace(reward=1.0, done=True, content="Done")

        return _record_and_finish

    @pytest.fixture
    def mock_call_tools(self):
        """Provide a ``call_tools`` replacement that returns an empty list."""

        async def _no_results(self, tool_call=None):
            return []

        return _no_results

    async def _run_and_capture_agent(
        self,
        *,
        task_dict,
        agent_init_config,
        captured,
        run_context,
        call_tools,
        client,
    ):
        """Run ``run_dataset`` on one task under patches and return the captured agent.

        Asserts that the patched ``_run_context`` stored an agent before returning it.
        """
        from hud.agents import ClaudeAgent
        from hud.datasets.runner import run_dataset

        with (
            patch("hud.job"),
            patch("hud.trace"),
            patch.object(ClaudeAgent, "_run_context", run_context),
            patch.object(ClaudeAgent, "call_tools", call_tools),
            patch("hud.clients.MCPClient", return_value=client),
        ):
            await run_dataset(
                name="test_job",
                dataset=[task_dict],
                agent_class=ClaudeAgent,
                agent_config=agent_init_config,
                max_steps=10,
            )

        # The agent must have been created and run for the capture slot to be filled.
        captured_agent = captured["agent"]
        assert captured_agent is not None
        return captured_agent

    @pytest.mark.asyncio
    async def test_task_system_prompt_only(
        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
    ) -> None:
        """With no init prompt, the task prompt is appended after GLOBAL_SYSTEM_PROMPT."""
        from hud.agents.base import GLOBAL_SYSTEM_PROMPT

        task_system_prompt = "Task prompt"

        # Only the task supplies a system prompt; the init config leaves it unset.
        task_dict = {
            "prompt": "Test task",
            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
            "agent_config": {"system_prompt": task_system_prompt},
        }
        agent_init_config = {"validate_api_key": False}

        captured_agent = await self._run_and_capture_agent(
            task_dict=task_dict,
            agent_init_config=agent_init_config,
            captured=captured_agent_fixture,
            run_context=mock_run_context,
            call_tools=mock_call_tools,
            client=mock_mcp_client,
        )

        final_prompt = captured_agent.system_prompt
        # The task prompt comes last, separated by a blank line ...
        assert final_prompt.endswith(f"\n\n{task_system_prompt}")
        # ... and the global default prompt comes first.
        assert final_prompt.startswith(GLOBAL_SYSTEM_PROMPT)

    @pytest.mark.asyncio
    async def test_both_agent_and_task_system_prompts(
        self, captured_agent_fixture, mock_run_context, mock_call_tools, mock_mcp_client
    ) -> None:
        """With both prompts set, the init prompt leads and the task prompt is appended."""
        agent_custom_prompt = "Agent init prompt"
        task_system_prompt = "Task prompt"

        # Both the task and the init config supply a system prompt.
        task_dict = {
            "prompt": "Test task",
            "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
            "agent_config": {"system_prompt": task_system_prompt},
        }
        agent_init_config = {
            "system_prompt": agent_custom_prompt,
            "validate_api_key": False,
        }

        captured_agent = await self._run_and_capture_agent(
            task_dict=task_dict,
            agent_init_config=agent_init_config,
            captured=captured_agent_fixture,
            run_context=mock_run_context,
            call_tools=mock_call_tools,
            client=mock_mcp_client,
        )

        # The task prompt comes last, separated by a blank line.
        assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
        # The init prompt comes first.
        assert captured_agent.system_prompt.startswith(agent_custom_prompt)
        # Both prompts appear somewhere in the final text.
        assert agent_custom_prompt in captured_agent.system_prompt
        assert task_system_prompt in captured_agent.system_prompt
@@ -2,101 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from pathlib import Path # noqa: TC003
6
- from unittest.mock import MagicMock, patch
5
+ from unittest.mock import patch
7
6
 
8
7
  import pytest
9
8
 
10
9
  from hud.cli.dev import (
11
- create_proxy_server,
12
- get_docker_cmd,
13
- get_image_name,
14
10
  run_mcp_dev_server,
15
- update_pyproject_toml,
16
11
  )
17
12
 
18
13
 
19
- class TestCreateMCPServer:
20
- """Test MCP server creation."""
21
-
22
- def test_create_mcp_server(self) -> None:
23
- """Test that MCP server is created with correct configuration."""
24
- mcp = create_proxy_server(".", "test-image:latest")
25
- assert mcp._mcp_server.name == "HUD Dev Proxy - test-image:latest"
26
- # Proxy server doesn't define its own tools, it forwards to Docker containers
27
-
28
-
29
- class TestDockerUtils:
30
- """Test Docker utility functions."""
31
-
32
- def test_get_docker_cmd(self) -> None:
33
- """Test extracting CMD from Docker image."""
34
- with patch("subprocess.run") as mock_run:
35
- mock_result = MagicMock()
36
- mock_result.returncode = 0
37
- mock_result.stdout = '["python", "-m", "server"]'
38
- mock_run.return_value = mock_result
39
-
40
- cmd = get_docker_cmd("test-image:latest")
41
- assert cmd is None
42
-
43
- def test_get_docker_cmd_failure(self) -> None:
44
- """Test handling when Docker inspect fails."""
45
- import subprocess
46
-
47
- with patch("subprocess.run") as mock_run:
48
- # check=True causes CalledProcessError on non-zero return
49
- mock_run.side_effect = subprocess.CalledProcessError(1, "docker inspect")
50
-
51
- cmd = get_docker_cmd("test-image:latest")
52
- assert cmd is None
53
-
54
-
55
- class TestImageResolution:
56
- """Test image name resolution."""
57
-
58
- def test_get_image_name_override(self) -> None:
59
- """Test image name with override."""
60
- name, source = get_image_name(".", "custom-image:v1")
61
- assert name == "custom-image:v1"
62
- assert source == "override"
63
-
64
- def test_get_image_name_from_pyproject(self, tmp_path: Path) -> None:
65
- """Test image name from pyproject.toml."""
66
- pyproject = tmp_path / "pyproject.toml"
67
- pyproject.write_text("""
68
- [tool.hud]
69
- image = "my-project:latest"
70
- """)
71
-
72
- name, source = get_image_name(str(tmp_path))
73
- assert name == "my-project:latest"
74
- assert source == "cache"
75
-
76
- def test_get_image_name_auto_generate(self, tmp_path: Path) -> None:
77
- """Test auto-generated image name."""
78
- test_dir = tmp_path / "my_test_project"
79
- test_dir.mkdir()
80
-
81
- name, source = get_image_name(str(test_dir))
82
- assert name == "my-test-project:dev"
83
- assert source == "auto"
84
-
85
- def test_update_pyproject_toml(self, tmp_path: Path) -> None:
86
- """Test updating pyproject.toml with image name."""
87
- pyproject = tmp_path / "pyproject.toml"
88
- pyproject.write_text("""
89
- [project]
90
- name = "test"
91
- """)
92
-
93
- update_pyproject_toml(str(tmp_path), "new-image:v1", silent=True)
94
-
95
- content = pyproject.read_text()
96
- assert "[tool.hud]" in content
97
- assert 'image = "new-image:v1"' in content
98
-
99
-
100
14
  class TestRunMCPDevServer:
101
15
  """Test the main server runner."""
102
16
 
@@ -110,16 +24,13 @@ class TestRunMCPDevServer:
110
24
  pytest.raises(click.Abort),
111
25
  ):
112
26
  run_mcp_dev_server(
113
- directory=".",
114
- image="missing:latest",
115
- build=False,
116
- no_cache=False,
117
- transport="http",
27
+ module=".",
28
+ stdio=False,
118
29
  port=8765,
119
- no_reload=False,
120
30
  verbose=False,
121
31
  inspector=False,
122
- no_logs=False,
123
- docker_args=[],
124
32
  interactive=False,
33
+ watch=[],
34
+ docker=False,
35
+ docker_args=[],
125
36
  )