hud-python 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -7,9 +7,13 @@ from unittest.mock import AsyncMock, MagicMock, Mock, patch
 import pytest
 from mcp import types
 
-from hud.cli.eval import build_agent, eval_command, get_available_models, run_full_dataset, run_single_task
+from hud.cli.eval import (
+    build_agent,
+    run_single_task,
+)
 from hud.types import Task, Trace
 
+
 class TestBuildAgent:
     """Test the build_agent function."""
 
@@ -20,10 +24,10 @@ class TestBuildAgent:
         with patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
-
+
             # Test with verbose=False
             result = build_agent("integration_test", verbose=False)
-
+
             mock_runner.assert_called_once_with(verbose=False)
             assert result == mock_instance
 
@@ -34,14 +38,11 @@ class TestBuildAgent:
         with patch("hud.agents.ClaudeAgent") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
-
+
             # Test with verbose=False
             result = build_agent("claude", verbose=False)
-
-            mock_runner.assert_called_once_with(
-                model="claude-sonnet-4-20250514",
-                verbose=False
-            )
+
+            mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
             assert result == mock_instance
 
     def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:
@@ -51,7 +52,7 @@ class TestBuildAgent:
         with patch("hud.agents.ClaudeAgent") as mock_runner:
             mock_instance = Mock()
             mock_runner.return_value = mock_instance
-
+
             # Test with verbose=False
             result = build_agent(
                 "claude",
@@ -59,7 +60,7 @@ class TestBuildAgent:
                 allowed_tools=["act"],
                 verbose=True,
             )
-
+
             mock_runner.assert_called_once_with(
                 model="claude-sonnet-4-20250514",
                 allowed_tools=["act"],
@@ -81,19 +82,23 @@ class TestRunSingleTask:
                 "system_prompt": "Custom instructions",
                 "allowed_tools": ["tool1", "tool2"],
                 "append_setup_output": False,
-            }
+            },
         )
         mock_agent = AsyncMock(
-            initialize=AsyncMock(),
-            run=AsyncMock(return_value=Trace(reward=1.0, done=True))
+            initialize=AsyncMock(), run=AsyncMock(return_value=Trace(reward=1.0, done=True))
         )
-
-        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
-             patch("hud.agents.misc.integration_test_agent.IntegrationTestRunner", return_value=mock_agent), \
-             patch("hud.cli.eval.find_environment_dir", return_value=None), \
-             patch("hud.cli.eval.hud.trace"):
+
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch(
+                "hud.agents.misc.integration_test_agent.IntegrationTestRunner",
+                return_value=mock_agent,
+            ),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
             await run_single_task("test.json", agent_type="integration_test", max_steps=10)
-
+
         # Verify agent.run was called with the task containing agent_config
         mock_agent.run.assert_called_once()
         called_task = mock_agent.run.call_args[0][0]
@@ -103,17 +108,20 @@ class TestRunSingleTask:
     async def test_runs_with_group_size_greater_than_one(self) -> None:
         """Test that group_size > 1 triggers run_tasks_grouped instead of agent.run."""
         mock_task = Task(prompt="Test", mcp_config={"local": {"url": "http://localhost:8765/mcp"}})
-
-        with patch("hud.utils.tasks.load_tasks", return_value=[mock_task]), \
-             patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped, \
-             patch("hud.cli.eval.display_group_statistics"), \
-             patch("hud.cli.eval.find_environment_dir", return_value=None), \
-             patch("hud.cli.eval.hud.trace"):
-
+
+        with (
+            patch("hud.utils.tasks.load_tasks", return_value=[mock_task]),
+            patch("hud.cli.eval.run_tasks_grouped", new_callable=AsyncMock) as mock_grouped,
+            patch("hud.cli.eval.display_group_statistics"),
+            patch("hud.cli.eval.find_environment_dir", return_value=None),
+            patch("hud.cli.eval.hud.trace"),
+        ):
            mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
-
-            await run_single_task("test.json", agent_type="integration_test", group_size=3, max_steps=10)
-
+
+            await run_single_task(
+                "test.json", agent_type="integration_test", group_size=3, max_steps=10
+            )
+
         # Verify run_tasks_grouped was called with correct group_size
         mock_grouped.assert_called_once()
         assert mock_grouped.call_args.kwargs["group_size"] == 3
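
Note on the recurring pattern in these hunks: the release swaps backslash-continued `with` statements for parenthesized context-manager groups, which together with the added trailing commas and `# noqa: E501` markers looks like an auto-formatter pass (ruff-style). The parenthesized form is official syntax only from Python 3.10. A minimal sketch of the two spellings, using stdlib patch targets rather than hud APIs:

    from unittest.mock import patch

    # Old style: backslash continuations
    with patch("os.getcwd") as cwd, \
         patch("os.listdir") as ls:
        ...

    # New style: parenthesized group, one manager per line (Python 3.10+)
    with (
        patch("os.getcwd") as cwd,
        patch("os.listdir") as ls,
    ):
        ...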
@@ -145,20 +153,20 @@ class TestToolFiltering:
     ) -> list[types.Tool]:
         """Helper to create agent, initialize with tools and config, return filtered tools."""
         from hud.agents import ClaudeAgent
-
+
         mock_mcp_client.list_tools = AsyncMock(return_value=tools)
-
+
         task = Task(
             prompt="Test",
             mcp_config={"local": {"url": "http://localhost"}},
-            agent_config=agent_config or {}
+            agent_config=agent_config or {},
         )
-
+
         agent = ClaudeAgent(
             mcp_client=mock_mcp_client,
             model_client=mock_model_client,
             model="test",
-            validate_api_key=False
+            validate_api_key=False,
         )
         await agent.initialize(task)
         return agent.get_available_tools()
@@ -171,13 +179,15 @@ class TestToolFiltering:
             types.Tool(name="tool2", description="Tool 2", inputSchema={}),
             types.Tool(name="debug_tool", description="Debug", inputSchema={}),
         ]
-
+
         result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools)
-
+
         assert len(result) == 3
 
     @pytest.mark.asyncio
-    async def test_allowed_tools_filters_correctly(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_allowed_tools_filters_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that allowed_tools in agent_config filters to matching patterns."""
         tools = [
             types.Tool(name="screenshot_take", description="Tool 1", inputSchema={}),
@@ -185,14 +195,18 @@ class TestToolFiltering:
             types.Tool(name="click", description="Tool 3", inputSchema={}),
         ]
         agent_config = {"allowed_tools": ["screenshot_*"]}
-
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
-
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
         assert len(result) == 2
         assert all("screenshot" in t.name for t in result)
 
     @pytest.mark.asyncio
-    async def test_disallowed_tools_excludes_correctly(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_disallowed_tools_excludes_correctly(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that disallowed_tools in agent_config excludes matching patterns."""
         tools = [
             types.Tool(name="tool1", description="Tool 1", inputSchema={}),
@@ -200,27 +214,30 @@ class TestToolFiltering:
             types.Tool(name="internal_secret", description="Tool 3", inputSchema={}),
         ]
         agent_config = {"disallowed_tools": ["debug_*", "internal_*"]}
-
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
-
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
         assert len(result) == 1
         assert result[0].name == "tool1"
 
     @pytest.mark.asyncio
-    async def test_both_filters_applies_allowed_then_disallowed(self, mock_mcp_client, mock_model_client) -> None:
+    async def test_both_filters_applies_allowed_then_disallowed(
+        self, mock_mcp_client, mock_model_client
+    ) -> None:
         """Test that both filters in agent_config work together (disallowed takes precedence)."""
         tools = [
             types.Tool(name="browser_click", description="Tool 1", inputSchema={}),
             types.Tool(name="browser_debug", description="Tool 2", inputSchema={}),
             types.Tool(name="system_click", description="Tool 3", inputSchema={}),
         ]
-        agent_config = {
-            "allowed_tools": ["browser_*"],
-            "disallowed_tools": ["*_debug"]
-        }
-
-        result = await self._run_agent_with_tools(mock_mcp_client, mock_model_client, tools, agent_config)
-
+        agent_config = {"allowed_tools": ["browser_*"], "disallowed_tools": ["*_debug"]}
+
+        result = await self._run_agent_with_tools(
+            mock_mcp_client, mock_model_client, tools, agent_config
+        )
+
         assert len(result) == 1
         assert result[0].name == "browser_click"
 
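The tests above pin down glob-style tool filtering: `allowed_tools` patterns select tools by name, `disallowed_tools` patterns exclude them, and exclusion wins when both match. The filtering implementation itself is not part of this diff, so the following is only a hypothetical sketch (the name `filter_tools` is made up) that satisfies the same assertions using stdlib `fnmatch`:

    from fnmatch import fnmatch

    def filter_tools(
        names: list[str],
        allowed: list[str] | None = None,
        disallowed: list[str] | None = None,
    ) -> list[str]:
        kept = names
        if allowed:  # no allowed_tools -> everything passes this stage
            kept = [n for n in kept if any(fnmatch(n, p) for p in allowed)]
        if disallowed:  # disallowed always takes precedence
            kept = [n for n in kept if not any(fnmatch(n, p) for p in disallowed)]
        return kept

    # Mirrors test_both_filters_applies_allowed_then_disallowed:
    assert filter_tools(
        ["browser_click", "browser_debug", "system_click"],
        allowed=["browser_*"],
        disallowed=["*_debug"],
    ) == ["browser_click"]
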
@@ -247,16 +264,20 @@ class TestRunDatasetToolFiltering:
     @pytest.fixture
     def mock_run_context(self, captured_agent_fixture):
         """Fixture for mocking _run_context."""
+
         async def _mock(self, context, max_steps=10):
             captured_agent_fixture["agent"] = self
             return Trace(reward=1.0, done=True, content="Done")
+
         return _mock
 
     @pytest.fixture
     def mock_call_tools(self):
         """Fixture for mocking call_tools."""
+
         async def _mock(self, tool_call=None):
             return []
+
         return _mock
 
     @pytest.fixture
@@ -271,35 +292,47 @@ class TestRunDatasetToolFiltering:
 
     @pytest.mark.asyncio
     async def test_agent_config_intersection_union_via_run_dataset(
-        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
     ) -> None:
-        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""
+        """Test that allowed_tools intersect and disallowed_tools union when set in both __init__ and task.agent_config."""  # noqa: E501
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
-
+
         # Create a task with its own agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
-                "allowed_tools": ["browser_*", "system_screenshot"],  # Task wants browser_* and system_screenshot
-                "disallowed_tools": ["*_debug", "*_execute"],  # Task disallows *_debug and *_execute
-            }
+                "allowed_tools": [
+                    "browser_*",
+                    "system_screenshot",
+                ],  # Task wants browser_* and system_screenshot
+                "disallowed_tools": [
+                    "*_debug",
+                    "*_execute",
+                ],  # Task disallows *_debug and *_execute
+            },
         }
-
+
         # Agent config passed to __init__ via run_dataset
         agent_init_config = {
             "allowed_tools": ["browser_*", "system_*"],  # Agent init wants browser_* and system_*
             "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
             "validate_api_key": False,
         }
-
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_client_instance):
-
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -308,35 +341,42 @@ class TestRunDatasetToolFiltering:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
-
+
         # Verify agent was created and ran
         captured_agent = captured_agent_fixture["agent"]
         assert captured_agent is not None
-
+
         # Get the filtered tools
         filtered_tools = captured_agent.get_available_tools()
         filtered_names = {tool.name for tool in filtered_tools}
-
+
         # Expected behavior:
-        # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]
+        # 1. allowed_tools intersection: ["browser_*", "system_*"] ∩ ["browser_*", "system_screenshot"]  # noqa: E501
         #    Exact string intersection: only "browser_*" is in both lists
-        #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug
-        # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_debug", "*_execute"]
+        #    So only tools matching browser_* are allowed: browser_click, browser_type, browser_debug  # noqa: E501
+        # 2. disallowed_tools union: ["browser_debug"] U ["*_debug", "*_execute"]
         #    Result: ["browser_debug", "*_debug", "*_execute"] (all patterns included)
         # 3. Final: {browser_click, browser_type, browser_debug} - {browser_debug}
         #    Result: browser_click, browser_type
-
+
         expected_tools = {"browser_click", "browser_type"}
-        assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+        assert filtered_names == expected_tools, (
+            f"Expected {expected_tools}, got {filtered_names}"
+        )
 
     @pytest.mark.asyncio
     async def test_no_allowed_tools_keeps_all_tools_except_disallowed(
-        self, all_tools, captured_agent_fixture, mock_run_context, mock_call_tools, mock_client_instance
+        self,
+        all_tools,
+        captured_agent_fixture,
+        mock_run_context,
+        mock_call_tools,
+        mock_client_instance,
     ) -> None:
-        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""
+        """Test that when allowed_tools is not set, all tools are available except disallowed ones."""  # noqa: E501
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
-
+
         # Create a task with its own agent_config (no allowed_tools)
         task_dict = {
             "prompt": "Test task",
@@ -344,22 +384,23 @@ class TestRunDatasetToolFiltering:
             "agent_config": {
                 # No allowed_tools set - should allow all tools
                 "disallowed_tools": ["*_execute"],  # Task disallows *_execute
-            }
+            },
         }
-
+
         # Agent config passed to __init__ via run_dataset (no allowed_tools)
         agent_init_config = {
             # No allowed_tools set - should allow all tools
             "disallowed_tools": ["browser_debug"],  # Agent init disallows browser_debug
             "validate_api_key": False,
         }
-
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_client_instance):
-
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_client_instance),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -368,25 +409,27 @@ class TestRunDatasetToolFiltering:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
-
+
         # Verify agent was created and ran
         captured_agent = captured_agent_fixture["agent"]
         assert captured_agent is not None
-
+
         # Get the filtered tools
         filtered_tools = captured_agent.get_available_tools()
         filtered_names = {tool.name for tool in filtered_tools}
-
+
         # Expected behavior:
         # 1. allowed_tools: None (no allowed_tools set in either init or task)
         #    Result: All tools are initially allowed
-        # 2. disallowed_tools union: ["browser_debug"] ∪ ["*_execute"]
+        # 2. disallowed_tools union: ["browser_debug"] U ["*_execute"]
         #    Result: ["browser_debug", "*_execute"] (all patterns included)
         # 3. Final: {all tools} - {browser_debug, system_execute}
         #    Result: browser_click, browser_type, system_screenshot
-
+
         expected_tools = {"browser_click", "browser_type", "system_screenshot"}
-        assert filtered_names == expected_tools, f"Expected {expected_tools}, got {filtered_names}"
+        assert filtered_names == expected_tools, (
+            f"Expected {expected_tools}, got {filtered_names}"
+        )
 
 
 class TestSystemPromptHandling:
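
The expected-behavior comments in the two tests above encode the merge rule when filters are set both at agent construction and in `task.agent_config`: allowed patterns are intersected as exact strings, disallowed patterns are unioned. The merge code lives outside this diff, so this is a hypothetical sketch (`merge_filters` is an invented name) that reproduces the commented arithmetic:

    def merge_filters(
        init_cfg: dict, task_cfg: dict
    ) -> tuple[list[str] | None, list[str]]:
        init_allowed = init_cfg.get("allowed_tools")
        task_allowed = task_cfg.get("allowed_tools")
        if init_allowed and task_allowed:
            # Exact string intersection of the patterns themselves,
            # not of the tool names they match
            allowed = [p for p in init_allowed if p in task_allowed]
        else:
            allowed = init_allowed or task_allowed  # None if neither side sets it
        # Union: either side can veto a tool
        disallowed = list(init_cfg.get("disallowed_tools", [])) + list(
            task_cfg.get("disallowed_tools", [])
        )
        return allowed, disallowed

    # Matches test_agent_config_intersection_union_via_run_dataset:
    allowed, disallowed = merge_filters(
        {"allowed_tools": ["browser_*", "system_*"], "disallowed_tools": ["browser_debug"]},
        {
            "allowed_tools": ["browser_*", "system_screenshot"],
            "disallowed_tools": ["*_debug", "*_execute"],
        },
    )
    assert allowed == ["browser_*"]
    assert set(disallowed) == {"browser_debug", "*_debug", "*_execute"}
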
@@ -410,16 +453,20 @@ class TestSystemPromptHandling:
     @pytest.fixture
     def mock_run_context(self, captured_agent_fixture):
         """Fixture for mocking _run_context to capture agent."""
+
         async def _mock(self, context, max_steps=10):
             captured_agent_fixture["agent"] = self
             return Trace(reward=1.0, done=True, content="Done")
+
         return _mock
 
     @pytest.fixture
     def mock_call_tools(self):
         """Fixture for mocking call_tools."""
+
         async def _mock(self, tool_call=None):
             return []
+
         return _mock
 
     @pytest.mark.asyncio
@@ -430,29 +477,30 @@ class TestSystemPromptHandling:
         from hud.agents import ClaudeAgent
         from hud.agents.base import GLOBAL_SYSTEM_PROMPT
         from hud.datasets.runner import run_dataset
-
+
         task_system_prompt = "Task prompt"
-
+
         # Create a task with its own system_prompt in agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
                 "system_prompt": task_system_prompt,
-            }
+            },
         }
-
+
         # Agent config with no custom system_prompt (will use default)
         agent_init_config = {
             "validate_api_key": False,
         }
-
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
-
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -461,11 +509,11 @@ class TestSystemPromptHandling:
                 agent_config=agent_init_config,
                 max_steps=10,
             )
-
+
         # Verify agent was created and ran
         captured_agent = captured_agent_fixture["agent"]
         assert captured_agent is not None
-
+
         # Verify the task system prompt was appended
         assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
         # Verify it starts with the base global system prompt
@@ -478,31 +526,32 @@ class TestSystemPromptHandling:
         """Test that both agent init and task system prompts are present when both are set."""
         from hud.agents import ClaudeAgent
         from hud.datasets.runner import run_dataset
-
+
         agent_custom_prompt = "Agent init prompt"
         task_system_prompt = "Task prompt"
-
+
         # Create a task with its own system_prompt in agent_config
         task_dict = {
             "prompt": "Test task",
             "mcp_config": {"local": {"url": "http://localhost:8765/mcp"}},
             "agent_config": {
                 "system_prompt": task_system_prompt,
-            }
+            },
         }
-
+
         # Agent config WITH custom system_prompt
         agent_init_config = {
             "system_prompt": agent_custom_prompt,
             "validate_api_key": False,
         }
-
-        with patch("hud.job"), \
-             patch("hud.trace"), \
-             patch.object(ClaudeAgent, "_run_context", mock_run_context), \
-             patch.object(ClaudeAgent, "call_tools", mock_call_tools), \
-             patch("hud.clients.MCPClient", return_value=mock_mcp_client):
-
+
+        with (
+            patch("hud.job"),
+            patch("hud.trace"),
+            patch.object(ClaudeAgent, "_run_context", mock_run_context),
+            patch.object(ClaudeAgent, "call_tools", mock_call_tools),
+            patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+        ):
             # Run the dataset
             await run_dataset(
                 name="test_job",
@@ -511,11 +560,11 @@ class TestSystemPromptHandling:
                 agent_config=agent_init_config,
                 max_steps=10,
            )
-
+
         # Verify agent was created and ran
         captured_agent = captured_agent_fixture["agent"]
         assert captured_agent is not None
-
+
         # Verify the task system prompt was appended at the end
         assert captured_agent.system_prompt.endswith(f"\n\n{task_system_prompt}")
         # Verify it starts with the agent custom prompt
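
Taken together, the system-prompt assertions describe layering rather than substitution: the task's `system_prompt` is appended after a blank line, on top of either `GLOBAL_SYSTEM_PROMPT` or the init-time prompt. Whether a custom init prompt replaces the global default or is combined with it is not visible in this diff; the sketch below (with an invented `resolve_system_prompt` helper) assumes replacement and still satisfies both tests:

    GLOBAL_SYSTEM_PROMPT = "You are a helpful agent."  # stand-in for hud.agents.base

    def resolve_system_prompt(init_prompt: str | None, task_prompt: str | None) -> str:
        base = init_prompt or GLOBAL_SYSTEM_PROMPT  # assumption: init prompt replaces default
        if task_prompt:
            base = f"{base}\n\n{task_prompt}"  # task prompt is always appended last
        return base

    assert resolve_system_prompt(None, "Task prompt").endswith("\n\nTask prompt")
    assert resolve_system_prompt(None, "Task prompt").startswith(GLOBAL_SYSTEM_PROMPT)
    assert resolve_system_prompt("Agent init prompt", "Task prompt").startswith("Agent init prompt")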