hud-python 0.4.45-py3-none-any.whl → 0.5.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -1,738 +1,416 @@
-"""Tests for BaseMCPAgent using simulated actions."""
+"""Tests for MCPAgent base class with v5 EvalContext pattern."""
 
 from __future__ import annotations
 
 from typing import Any, ClassVar
-from unittest.mock import MagicMock
-
-# Import AsyncMock from unittest.mock if available (Python 3.8+)
-try:
-    from unittest.mock import AsyncMock
-except ImportError:
-    # Fallback for older Python versions
-    from unittest.mock import MagicMock as AsyncMock
 
 import pytest
 from mcp import types
 
 from hud.agents import MCPAgent
-from hud.datasets import Task
-from hud.tools.executors.base import BaseExecutor
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.agents.base import BaseCreateParams
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
+
+
+class MockConfig(BaseAgentConfig):
+    model_name: str = "MockAgent"
+    model: str = "mock-model"
+
+
+class MockCreateParams(BaseCreateParams, MockConfig):
+    pass
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing."""
+
+    def __init__(
+        self,
+        prompt: str = "Test prompt",
+        tools: list[types.Tool] | None = None,
+    ) -> None:
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or [
+            types.Tool(name="test_tool", description="A test tool", inputSchema={}),
+            types.Tool(name="another_tool", description="Another tool", inputSchema={}),
+        ]
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._tool_calls: list[tuple[str, dict[str, Any]]] = []
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        # Parse the call
+        if isinstance(call, tuple):
+            name, args = call[0], call[1] if len(call) > 1 else {}
+        elif hasattr(call, "name"):
+            name, args = call.name, getattr(call, "arguments", {}) or {}
+        else:
+            name, args = str(call), kwargs
+        self._tool_calls.append((name, args))
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=f"Result from {name}")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
 
 
 class MockMCPAgent(MCPAgent):
-    """Concrete implementation of BaseMCPAgent for testing."""
-
-    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config
-
-    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
-        if mcp_client is None:
-            # Create a mock client if none provided
-            mcp_client = MagicMock()
-            mcp_client.get_available_tools = MagicMock(return_value=[])
-            mcp_client.initialize = AsyncMock()
-            mcp_client.list_tools = AsyncMock(return_value=[])
-            mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
-        super().__init__(mcp_client=mcp_client, **kwargs)
-        self.executor = BaseExecutor()  # Use simulated executor
-        self._messages = []
-
-    async def run(self, task: Task) -> list[dict[str, Any]]:
-        """Mock run method."""
-        return self._messages
-
-    async def create_initial_messages(
-        self, prompt: str, initial_screenshot: bool = False
-    ) -> list[dict[str, Any]]:
-        """Mock create initial messages."""
-        messages = [{"role": "user", "content": prompt}]
-        if initial_screenshot:
-            messages.append({"role": "assistant", "content": "Screenshot: mock_screenshot"})
-        return messages
+    """Concrete implementation of MCPAgent for testing."""
+
+    metadata: ClassVar[dict[str, Any] | None] = {}
+    config_cls: ClassVar[type[BaseAgentConfig]] = MockConfig
+
+    def __init__(self, **kwargs: Any) -> None:
+        params = MockCreateParams(**kwargs)
+        super().__init__(params)
+        self._response = AgentResponse(content="Mock response", tool_calls=[], done=True)
+
+    def set_response(self, response: AgentResponse) -> None:
+        self._response = response
 
     async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
-        """Mock get response."""
-        return AgentResponse(content="Mock response", tool_calls=[], done=True)
+        return self._response
 
     async def format_tool_results(
         self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
     ) -> list[dict[str, Any]]:
-        """Mock format tool results."""
         formatted = []
-        for tool_call, result in zip(tool_calls, tool_results):
+        for tool_call, result in zip(tool_calls, tool_results, strict=True):
             formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
         return formatted
 
-    async def create_user_message(self, text: str) -> Any:
-        """Mock create user message."""
-        return {"role": "user", "content": text}
-
     async def get_system_messages(self) -> list[Any]:
-        """Mock get system messages."""
        return []
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-        """Mock format blocks."""
-        formatted = []
-        for block in blocks:
-            if isinstance(block, types.TextContent):
-                formatted.append({"type": "text", "text": block.text})
-            elif isinstance(block, types.ImageContent):
-                formatted.append({"type": "image", "data": block.data})
-            elif hasattr(block, "type"):
-                formatted.append({"type": getattr(block, "type", "unknown")})
-        return formatted
+        return [{"type": "text", "text": getattr(b, "text", "")} for b in blocks]
 
 
-class TestBaseMCPAgent:
-    """Tests for BaseMCPAgent with simulated actions."""
+class TestMCPAgentInit:
+    """Tests for MCPAgent initialization."""
 
-    def test_init_defaults(self):
-        """Test initialization with default values."""
+    def test_init_defaults(self) -> None:
+        """Test agent initializes with default config."""
         agent = MockMCPAgent()
+        assert agent.ctx is None
+        assert agent._initialized is False
+        assert agent.system_prompt is None
 
-        assert agent.mcp_client is not None
-        assert agent.allowed_tools is None
-        assert agent.disallowed_tools == []
-        assert agent.initial_screenshot is True
-        assert agent.system_prompt is not None  # Default system prompt is set
-
-    def test_init_with_params(self):
-        """Test initialization with custom parameters."""
-        client = MagicMock()
-        agent = MockMCPAgent(
-            mcp_client=client,
-            allowed_tools=["tool1", "tool2"],
-            disallowed_tools=["bad_tool"],
-            initial_screenshot=True,
-            system_prompt="Custom prompt",
-        )
-
-        assert agent.mcp_client == client
-        assert agent.allowed_tools == ["tool1", "tool2"]
-        assert agent.disallowed_tools == ["bad_tool"]
-        assert agent.initial_screenshot is True
+    def test_init_with_system_prompt(self) -> None:
+        """Test agent with custom system prompt."""
+        agent = MockMCPAgent(system_prompt="Custom prompt")
         assert agent.system_prompt == "Custom prompt"
 
-    @pytest.mark.asyncio
-    async def test_init_no_client_no_task(self):
-        """Test initialize fails without client and without task."""
-
-        # Create a minimal concrete implementation to test the ValueError
-        class TestAgent(MCPAgent):
-            async def create_initial_messages(
-                self, prompt: str, initial_screenshot: bool = False
-            ) -> list[dict[str, Any]]:
-                return []
-
-            async def format_tool_results(
-                self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
-            ) -> list[dict[str, Any]]:
-                return []
-
-            async def get_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
-                return {"content": "test"}
-
-            async def get_system_messages(self) -> list[Any]:
-                return []
 
-            async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-                return []
-
-        # Agent can be created with None client
-        agent = TestAgent(mcp_client=None)
-
-        # But initialize should fail without client or task
-        with pytest.raises(ValueError, match="No MCPClient"):
-            await agent.initialize()
+class TestMCPAgentRun:
+    """Tests for MCPAgent.run() with EvalContext."""
 
     @pytest.mark.asyncio
-    async def test_initialize_with_sessions(self):
-        """Test initialize with existing sessions."""
+    async def test_run_basic(self) -> None:
+        """Test basic run flow with EvalContext."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Create proper async mock for session
-        mock_session = MagicMock()
-
-        # Set up the connector and client_session structure
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        # Mock list_tools on the client_session
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                    types.Tool(
-                        name="setup", description="Setup tool", inputSchema={"type": "object"}
-                    ),
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        assert agent.mcp_client is not None
-
-        # Mock the list_tools method on mcp_client to return the tools
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                types.Tool(name="setup", description="Setup tool", inputSchema={"type": "object"}),
-            ]
-        )
-
-        await agent.initialize()
+        result = await agent.run(ctx)
 
-        # Check available tools were populated (excludes lifecycle tools)
-        tools = agent.get_available_tools()
-        assert len(tools) == 3  # All tools (setup is not in default lifecycle tools)
-
-        # Ensure names exist in available tools
-        names = {t.name for t in tools}
-        assert {"tool1", "tool2", "setup"} <= names
-
-    @pytest.mark.asyncio
-    async def test_initialize_with_filtering(self):
-        """Test initialize with tool filtering."""
-        agent = MockMCPAgent(allowed_tools=["tool1"], disallowed_tools=["tool3"])
-
-        # Create proper async mock for session
-        mock_session = MagicMock()
-
-        # Set up the connector and client_session structure
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                    types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                    types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
-                    types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        assert agent.mcp_client is not None
-
-        # Mock the list_tools method on mcp_client to return the tools
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
-                types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-            ]
-        )
-
-        await agent.initialize()
-
-        # Check filtering worked - get_available_tools excludes lifecycle tools
-        tools = agent.get_available_tools()
-        tool_names = [t.name for t in tools]
-        assert len(tools) == 1  # Only tool1 (tool2 and tool3 are filtered out)
-        assert "tool1" in tool_names
-        assert "setup" not in tool_names  # Lifecycle tool excluded from available tools
-        assert "tool2" not in tool_names  # Not in allowed list
-        assert "tool3" not in tool_names  # In disallowed list
+        assert result.done is True
+        assert result.content == "Mock response"
+        assert ctx._submitted == "Mock response"
 
     @pytest.mark.asyncio
-    async def test_call_tool_success(self):
-        """Test successful tool call."""
+    async def test_run_initializes_agent(self) -> None:
+        """Test run() initializes the agent with context."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Initialize with a tool
-        mock_session = MagicMock()
-        mock_session.connector = MagicMock()
-        mock_session.connector.client_session = MagicMock()
-
-        async def mock_list_tools():
-            return types.ListToolsResult(
-                tools=[
-                    types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
-                ]
-            )
-
-        mock_session.connector.client_session.list_tools = mock_list_tools
-
-        # Mock the call_tool method on the client session
-        mock_result = types.CallToolResult(
-            content=[types.TextContent(type="text", text="Tool result")], isError=False
-        )
-
-        async def mock_call_tool(name, args):
-            return mock_result
-
-        mock_session.connector.client_session.call_tool = mock_call_tool
-
-        assert agent.mcp_client is not None
-
-        # Mock the client's call_tool method directly
-        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
-
-        # Mock the list_tools method to return the test tool
-        agent.mcp_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"})
-            ]
-        )
-
-        await agent.initialize()
-
-        # Call the tool
-        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
-        results = await agent.call_tools(tool_call)
-
-        assert len(results) == 1
-        assert results[0] == mock_result
-        assert not results[0].isError
+        assert not agent._initialized
+        await agent.run(ctx)
+        assert agent._initialized
 
     @pytest.mark.asyncio
-    async def test_call_tool_not_found(self):
-        """Test calling non-existent tool."""
+    async def test_run_discovers_tools(self) -> None:
+        """Test run() discovers tools from context."""
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
         agent = MockMCPAgent()
 
-        # Initialize without tools
-        mock_session = MagicMock()
+        # We need to check tools before cleanup
+        # Store a reference to check
+        discovered_tools = []
 
-        async def mock_list_tools():
-            return types.ListToolsResult(tools=[])
+        original_run = agent._run_context
 
-        mock_session.list_tools = mock_list_tools
-        assert agent.mcp_client is not None
+        async def capture_tools(*args: Any, **kwargs: Any) -> Any:
+            discovered_tools.extend(agent.get_available_tools())
+            return await original_run(*args, **kwargs)
 
-        await agent.initialize()
+        agent._run_context = capture_tools  # type: ignore
+        await agent.run(ctx)
 
-        # Try to call unknown tool - call_tools doesn't raise for unknown tools
-        tool_call = MCPToolCall(name="unknown_tool", arguments={})
-        await agent.call_tools(tool_call)
+        assert len(discovered_tools) == 2
+        assert discovered_tools[0].name == "tool1"
+        assert discovered_tools[1].name == "tool2"
 
     @pytest.mark.asyncio
-    async def test_call_tool_no_name(self):
-        """Test calling tool without name."""
-        # MCPToolCall accepts empty names
+    async def test_run_requires_eval_context(self) -> None:
+        """Test run() raises TypeError for non-EvalContext."""
         agent = MockMCPAgent()
-        tool_call = MCPToolCall(name="", arguments={})
 
-        # call_tools doesn't validate empty names, it will return error
-        await agent.call_tools(tool_call)
+        with pytest.raises(TypeError, match="must be EvalContext"):
+            await agent.run("not a context")  # type: ignore
 
-    def test_get_tool_schemas(self):
-        """Test getting tool schemas."""
+    @pytest.mark.asyncio
+    async def test_run_requires_prompt(self) -> None:
+        """Test run() raises ValueError when prompt is empty."""
+        ctx = MockEvalContext(prompt="")
         agent = MockMCPAgent()
 
-        # Add setup to lifecycle tools to test filtering
-        agent.lifecycle_tools = ["setup"]
-
-        agent._available_tools = [
-            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-        ]
-
-        schemas = agent.get_tool_schemas()
-
-        # Should include non-lifecycle tools
-        assert len(schemas) == 1
-        assert schemas[0]["name"] == "tool1"
+        with pytest.raises(ValueError, match="prompt is not set"):
+            await agent.run(ctx)
 
-    def test_get_tools_by_server(self):
-        """Test getting tools grouped by server."""
+    @pytest.mark.asyncio
+    async def test_run_clears_context_after(self) -> None:
+        """Test run() clears ctx after completion."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
 
-        # Set up tools from different servers
-        tool1 = types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"})
-        tool2 = types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"})
-
-        agent._available_tools = [tool1, tool2]
-        tools = agent.get_available_tools()
-        assert {t.name for t in tools} == {"tool1", "tool2"}
+        await agent.run(ctx)
+        assert agent.ctx is None
 
     @pytest.mark.asyncio
-    async def test_executor_integration(self):
-        """Test integration with BaseExecutor for simulated actions."""
+    async def test_run_no_submit_on_empty_content(self) -> None:
+        """Test run() doesn't submit when content is empty."""
+        ctx = MockEvalContext(prompt="Do something")
         agent = MockMCPAgent()
+        agent.set_response(AgentResponse(content="", tool_calls=[], done=True))
 
-        # Test various executor actions
-        click_result = await agent.executor.click(100, 200, take_screenshot=False)
-        assert click_result.output is not None
-        assert "[SIMULATED] Click at (100, 200)" in click_result.output
+        await agent.run(ctx)
+        assert ctx._submitted is None
 
-        type_result = await agent.executor.write("Test input", take_screenshot=False)
-        assert type_result.output is not None
-        assert "[SIMULATED] Type 'Test input'" in type_result.output
 
-        scroll_result = await agent.executor.scroll(x=50, y=50, scroll_y=5, take_screenshot=False)
-        assert scroll_result.output is not None
-        assert "[SIMULATED] Scroll" in scroll_result.output
+class TestMCPAgentToolCalling:
+    """Tests for tool calling through context."""
 
-        # Test screenshot
-        screenshot = await agent.executor.screenshot()
-        assert isinstance(screenshot, str)
-        assert screenshot.startswith("iVBORw0KGgo")  # PNG header
+    @pytest.mark.asyncio
+    async def test_call_tools_uses_context(self) -> None:
+        """Test call_tools routes through ctx.call_tool."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
 
+        # Bind context manually
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-class MockAgentExtended(MCPAgent):
-    """Mock agent for testing with predefined responses."""
+        # Call a tool
+        results = await agent.call_tools(MCPToolCall(name="test_tool", arguments={"arg": "value"}))
 
-    metadata: ClassVar[dict[str, Any]] = {}  # Optional metadata for MCP config
+        assert len(results) == 1
+        assert not results[0].isError
+        assert ("test_tool", {"arg": "value"}) in ctx._tool_calls
 
-    def __init__(self, responses=None, **kwargs):
-        super().__init__(**kwargs)
-        self.responses = responses or []
-        self.call_count = 0
+    @pytest.mark.asyncio
+    async def test_call_tools_without_context_raises(self) -> None:
+        """Test call_tools raises when no context bound."""
+        agent = MockMCPAgent()
 
-    async def create_initial_messages(
-        self, prompt: str, initial_screenshot: bool = False
-    ) -> list[dict[str, Any]]:
-        """Create initial messages."""
-        messages = [{"role": "user", "content": prompt}]
-        if initial_screenshot:
-            # capture_screenshot doesn't exist, just mock it
-            screenshot = "mock_screenshot_data"
-            messages.append({"role": "assistant", "content": f"Screenshot: {screenshot}"})
-        return messages
+        with pytest.raises(ValueError, match="not bound to context"):
+            await agent.call_tools(MCPToolCall(name="test_tool", arguments={}))
 
-    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
-        """Return predefined responses - must be async."""
-        if self.call_count < len(self.responses):
-            response_dict = self.responses[self.call_count]
-            self.call_count += 1
-            # Convert dict to AgentResponse
-            return AgentResponse(
-                content=response_dict.get("content", ""),
-                tool_calls=response_dict.get("tool_calls", []),
-                done=response_dict.get("done", not bool(response_dict.get("tool_calls"))),
-            )
-        return AgentResponse(content="Done", tool_calls=[], done=True)
 
-    async def format_tool_results(
-        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
-    ) -> list[dict[str, Any]]:
-        """Format tool results."""
-        formatted = []
-        for tool_call, result in zip(tool_calls, tool_results):
-            formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
-        return formatted
+class TestMCPAgentRequiredTools:
+    """Tests for required_tools validation."""
 
-    async def create_user_message(self, text: str) -> Any:
-        """Create user message."""
-        return {"role": "user", "content": text}
-
-    async def get_system_messages(self) -> list[Any]:
-        """Mock get system messages."""
-        return []
+    @pytest.mark.asyncio
+    async def test_missing_required_tools_raises(self) -> None:
+        """Test run() raises when required tools are missing."""
 
-    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
-        """Mock format blocks."""
-        formatted = []
-        for block in blocks:
-            if isinstance(block, types.TextContent):
-                formatted.append({"type": "text", "text": block.text})
-            elif isinstance(block, types.ImageContent):
-                formatted.append({"type": "image", "data": block.data})
-            elif hasattr(block, "type"):
-                formatted.append({"type": getattr(block, "type", "unknown")})
-        return formatted
+        class AgentWithRequiredTools(MockMCPAgent):
+            required_tools: ClassVar[list[str]] = ["must_have_tool"]
 
+        ctx = MockEvalContext(prompt="Do something", tools=[])
+        agent = AgentWithRequiredTools()
 
-class TestMCPAgentExtended:
-    """Extended tests for MCPAgent."""
-
-    @pytest.fixture
-    def mock_client(self):
-        """Create a mock MCP client."""
-        client = MagicMock()
-        client.get_all_active_sessions = MagicMock(return_value={})
-        client.initialize = AsyncMock()
-        client.list_tools = AsyncMock(return_value=[])
-        client.call_tool = AsyncMock(
-            return_value=types.CallToolResult(
-                content=[types.TextContent(type="text", text="Success")],
-                isError=False,
-            )
-        )
-        return client
-
-    @pytest.fixture
-    def agent_with_tools(self, mock_client):
-        """Create agent with mock tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
-                types.Tool(name="click", description="Click at coordinates", inputSchema={}),
-                types.Tool(name="type", description="Type text", inputSchema={}),
-                types.Tool(name="bad_tool", description="A tool that fails", inputSchema={}),
-            ]
-        )
-        return MockAgentExtended(mcp_client=mock_client)
+        with pytest.raises(ValueError, match="Required tools are missing"):
+            await agent.run(ctx)
 
     @pytest.mark.asyncio
-    async def test_run_with_task_object(self, agent_with_tools):
-        """Test running agent with Task object."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Click the button",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "navigate", "arguments": {"url": "https://example.com"}},  # type: ignore[arg-type]
-            evaluate_tool={"name": "check_result", "arguments": {}},  # type: ignore[arg-type]
-        )
-
-        # Set up responses
-        agent_with_tools.responses = [
-            {
-                "role": "assistant",
-                "content": "I'll click the button",
-                "tool_calls": [MCPToolCall(name="click", arguments={"x": 100, "y": 200})],
-            }
-        ]
+    async def test_required_tools_present_succeeds(self) -> None:
+        """Test run() succeeds when required tools are present."""
 
-        # Mock the evaluation to return a reward
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=[
-                # Setup tool
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Navigated")],
-                    isError=False,
-                ),
-                # Click tool
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Clicked")],
-                    isError=False,
-                ),
-                # Evaluate tool with reward
-                MCPToolResult(
-                    content=[types.TextContent(type="text", text="Success")],
-                    isError=False,
-                    structuredContent={"reward": 1.0},
-                ),
-            ]
-        )
+        class AgentWithRequiredTools(MockMCPAgent):
+            required_tools: ClassVar[list[str]] = ["required_tool"]
 
-        result = await agent_with_tools.run(task)
+        tools = [types.Tool(name="required_tool", description="Required", inputSchema={})]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = AgentWithRequiredTools()
 
-        assert isinstance(result, Trace)
-        assert result.reward == 1.0
-        assert not result.isError
+        result = await agent.run(ctx)
         assert result.done
 
+
+class TestMCPAgentOnToolsReady:
+    """Tests for _on_tools_ready hook."""
+
     @pytest.mark.asyncio
-    async def test_run_with_setup_error(self, agent_with_tools):
-        """Test task execution with setup phase error."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Do something",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "bad_setup", "arguments": {}},  # type: ignore[arg-type]
-        )
+    async def test_on_tools_ready_called(self) -> None:
+        """Test _on_tools_ready is called during initialization."""
+        hook_called = [False]
 
-        # Mock setup tool to fail
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            return_value=MCPToolResult(
-                content=[types.TextContent(type="text", text="Setup failed")],
-                isError=True,
-            )
-        )
+        class AgentWithHook(MockMCPAgent):
+            def _on_tools_ready(self) -> None:
+                hook_called[0] = True
 
-        result = await agent_with_tools.run(task)
+        ctx = MockEvalContext(prompt="Do something")
+        agent = AgentWithHook()
 
-        assert isinstance(result, Trace)
-        assert result.isError
-        # Error content is the string representation of the MCPToolResult list
-        assert result.content is not None
-        assert "Setup failed" in result.content
-        assert "MCPToolResult" in result.content
+        await agent.run(ctx)
+        assert hook_called[0]
 
     @pytest.mark.asyncio
-    async def test_run_with_multiple_setup_tools(self, agent_with_tools):
-        """Test task with multiple setup tools."""
-
-        task = Task(
-            id="test_task",
-            prompt="Test multiple setup",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool=[
-                MCPToolCall(name="setup1", arguments={}),
-                MCPToolCall(name="setup2", arguments={}),
-            ],
-        )
-
-        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
+    async def test_on_tools_ready_has_access_to_tools(self) -> None:
+        """Test _on_tools_ready can access discovered tools."""
+        captured_tools: list[types.Tool] = []
 
-        setup_calls = []
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=lambda tool_call: setup_calls.append(tool_call)
-            or MCPToolResult(
-                content=[types.TextContent(type="text", text=f"{tool_call.name} done")],
-                isError=False,
-            )
-        )
+        class AgentWithHook(MockMCPAgent):
+            def _on_tools_ready(self) -> None:
+                captured_tools.extend(self.get_available_tools())
 
-        result = await agent_with_tools.run(task)
+        tools = [
+            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
+            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = AgentWithHook()
 
-        # Check that the tool names match
-        setup_names = [call.name for call in setup_calls]
-        assert "setup1" in setup_names
-        assert "setup2" in setup_names
-        assert not result.isError
+        await agent.run(ctx)
 
-    @pytest.mark.asyncio
-    async def test_allowed_tools_filtering(self, mock_client):
-        """Test that allowed_tools filters available tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
-            ]
-        )
+        assert len(captured_tools) == 2
+        assert captured_tools[0].name == "tool1"
 
-        agent = MockAgentExtended(mcp_client=mock_client, allowed_tools=["tool1", "tool3"])
-        await agent.initialize("test")
 
-        available_names = [tool.name for tool in agent._available_tools]
-        assert "tool1" in available_names
-        assert "tool3" in available_names
-        assert "tool2" not in available_names
+class TestMCPAgentToolSchemas:
+    """Tests for tool schema generation."""
 
     @pytest.mark.asyncio
-    async def test_disallowed_tools_filtering(self, mock_client):
-        """Test that disallowed_tools filters available tools."""
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="tool1", description="Tool 1", inputSchema={}),
-                types.Tool(name="tool2", description="Tool 2", inputSchema={}),
-                types.Tool(name="tool3", description="Tool 3", inputSchema={}),
-            ]
-        )
-
-        agent = MockAgentExtended(mcp_client=mock_client, disallowed_tools=["tool2"])
-        await agent.initialize("test")
+    async def test_get_tool_schemas(self) -> None:
+        """Test get_tool_schemas returns correct format."""
+        tools = [
+            types.Tool(
+                name="my_tool",
+                description="My tool description",
+                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
+            )
+        ]
+        ctx = MockEvalContext(prompt="Do something", tools=tools)
+        agent = MockMCPAgent()
 
-        available_names = [tool.name for tool in agent._available_tools]
-        assert "tool1" in available_names
-        assert "tool3" in available_names
-        assert "tool2" not in available_names
+        # Initialize agent
+        agent.ctx = ctx
+        await agent._initialize_from_ctx(ctx)
 
-    @pytest.mark.asyncio
-    async def test_lifecycle_tools(self, mock_client):
-        """Test lifecycle tools are called in run_prompt."""
-        # Lifecycle tools are specified by name, not as objects
-        agent = MockAgentExtended(
-            mcp_client=mock_client,
-            responses=[{"role": "assistant", "content": "Done", "tool_calls": []}],
-        )
+        schemas = agent.get_tool_schemas()
+        assert len(schemas) == 1
+        assert schemas[0]["name"] == "my_tool"
+        assert schemas[0]["description"] == "My tool description"
 
-        # Add screenshot tool to available tools
-        mock_client.list_tools = AsyncMock(
-            return_value=[
-                types.Tool(name="screenshot", description="Take screenshot", inputSchema={})
-            ]
-        )
 
-        # Initialize to make tools available
-        await agent.initialize()
-
-        result = await agent.run("Test lifecycle", max_steps=1)
-        assert not result.isError
-
-    # This test is commented out as screenshot history management may have changed
-    # @pytest.mark.asyncio
-    # async def test_screenshot_history_management(self, agent_with_tools):
-    #     """Test screenshot history is maintained."""
-    #     agent_with_tools.initial_screenshot = True
-
-    #     # Set up responses with tool calls
-    #     agent_with_tools.responses = [
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 1",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 1, "y": 1})],
-    #         },
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 2",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 2, "y": 2})],
-    #         },
-    #         {
-    #             "role": "assistant",
-    #             "content": "Action 3",
-    #             "tool_calls": [MCPToolCall(name="click", arguments={"x": 3, "y": 3})],
-    #         },
-    #     ]
-
-    #     await agent_with_tools.run("Test screenshots", max_steps=3)
-
-    #     # Should have screenshots in history
-    #     assert len(agent_with_tools.screenshot_history) > 0
+class TestMCPAgentErrorPropagation:
+    """Tests for error propagation to EvalContext."""
 
     @pytest.mark.asyncio
-    async def test_run_with_invalid_prompt_type(self, agent_with_tools):
-        """Test run with invalid prompt type raises TypeError."""
-        with pytest.raises(TypeError, match="prompt_or_task must be str or Task"):
-            await agent_with_tools.run(123)  # Invalid type
+    async def test_exception_propagates_to_ctx_error(self) -> None:
+        """Test that exceptions during run() set ctx.error for platform visibility."""
 
-    @pytest.mark.asyncio
-    async def test_evaluate_phase_with_multiple_tools(self, agent_with_tools):
-        """Test evaluation phase with multiple evaluation tools."""
-        from hud.types import MCPToolResult
-
-        task = Task(
-            id="test_task",
-            prompt="Test evaluation",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            evaluate_tool=[
-                MCPToolCall(name="eval1", arguments={}),
-                MCPToolCall(name="eval2", arguments={"reward": True}),
-            ],
-        )
+        class FailingAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                raise RuntimeError("Agent crashed")
 
-        agent_with_tools.responses = [{"role": "assistant", "content": "Done", "tool_calls": []}]
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailingAgent()
 
-        eval_calls = []
-        agent_with_tools.mcp_client.call_tool = AsyncMock(
-            side_effect=lambda tool_call: eval_calls.append(tool_call)
-            or MCPToolResult(
-                content=[types.TextContent(type="text", text=f"{tool_call.name} result")],
-                isError=False,
-                structuredContent={"reward": 0.5} if tool_call.name == "eval1" else {"reward": 1.0},
-            )
-        )
+        result = await agent.run(ctx)
 
-        result = await agent_with_tools.run(task)
+        # Should return error trace
+        assert result.isError is True
+        assert result.content is not None
+        assert "Agent crashed" in result.content
 
-        # Check that the tool names match
-        eval_names = [call.name for call in eval_calls]
-        assert "eval1" in eval_names
-        assert "eval2" in eval_names
-        assert result.reward == 0.5  # From eval1 (first evaluation tool)
+        assert ctx.error is not None
+        assert isinstance(ctx.error, BaseException)
+        assert "Agent crashed" in str(ctx.error)
 
     @pytest.mark.asyncio
-    async def test_trace_population_on_error(self, agent_with_tools):
-        """Test that trace is populated on task execution error."""
-
-        task = Task(
-            id="test_task",
-            prompt="Test error",
-            mcp_config={"test_server": {"url": "http://localhost:8080"}},
-            setup_tool={"name": "failing_setup", "arguments": {}},  # type: ignore[arg-type]
-        )
+    async def test_step_error_propagates_to_ctx_error(self) -> None:
+        """Test that step-level errors (caught internally) set ctx.error."""
+        step_count = [0]
+
+        class FailOnSecondStepAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                step_count[0] += 1
+                if step_count[0] == 1:
+                    return AgentResponse(
+                        content="",
+                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
+                        done=False,
+                    )
+                else:
+                    raise ValueError("Step 2 failed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailOnSecondStepAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert ctx.error is not None
+        assert "Step 2 failed" in str(ctx.error)
 
-        # Make setup fail with exception
-        agent_with_tools.mcp_client.call_tool = AsyncMock(side_effect=Exception("Setup explosion"))
+    @pytest.mark.asyncio
+    async def test_no_error_when_successful(self) -> None:
+        """Test that ctx.error remains None on successful run."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
 
-        result = await agent_with_tools.run(task)
+        result = await agent.run(ctx)
 
-        assert result.isError
-        # Error content is the string representation of the MCPToolResult list
-        assert "Setup explosion" in result.content
-        assert "MCPToolResult" in result.content
-        assert result.done
+        assert result.isError is False
+        assert ctx.error is None
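Taken together, the new tests pin down the v5 contract: an agent is no longer constructed around an mcp_client; MCPAgent.run() instead receives an EvalContext that exposes list_tools, call_tool, and submit, binds it for the duration of the run, discovers tools from it, and submits the final content. Below is a minimal sketch of that flow written as one more test in the style of the file above; it reuses the MockEvalContext and MockMCPAgent helpers defined there and is illustrative only, not part of the package.

import pytest

from hud.types import AgentResponse


@pytest.mark.asyncio
async def test_run_submits_final_content_sketch() -> None:
    """Illustrative sketch: run() binds the context, discovers tools, submits the answer."""
    ctx = MockEvalContext(prompt="Click the button")
    agent = MockMCPAgent()

    # The canned reply finishes in one step; a multi-step agent would first return
    # tool_calls, which run() routes through ctx.call_tool (see the tests above).
    agent.set_response(AgentResponse(content="Clicked it", tool_calls=[], done=True))

    result = await agent.run(ctx)

    assert result.done and result.isError is False
    assert ctx._submitted == "Clicked it"  # non-empty final content is submitted to the context
    assert agent.ctx is None  # the context is cleared once the run completes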