hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
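Most of the churn in the base-agent diff below follows one API change: the v4 entry point `agent.run(prompt_or_task)` is replaced by a v5 flow in which `hud.eval()` yields an `EvalContext` that the agent consumes. A minimal before/after sketch — the `hud.eval(task)` context manager and `ClaudeAgent.create()` factory come from the diff's own docstring example, while the v4 lines, the import paths, and the `task` value are illustrative assumptions:

```python
import hud
from hud.agents import ClaudeAgent  # assumed export; see hud/agents/__init__.py changes

async def main(task) -> None:
    # v4 (0.4.x, removed below): the agent owned the client and the
    # setup/evaluate lifecycle.
    #   agent = ClaudeAgent(mcp_client=client)
    #   trace = await agent.run(task, max_steps=10)

    # v5 (0.5.x): hud.eval() owns the lifecycle; the agent is bound to the
    # context and routes all tool execution through ctx.call_tool().
    async with hud.eval(task) as ctx:
        agent = ClaudeAgent.create()  # typed params instead of raw kwargs
        await agent.run(ctx, max_steps=10)
        # ctx.reward is set by the scenario's evaluate phase
```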
hud/agents/base.py CHANGED
@@ -10,335 +10,235 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import mcp.types as types
 
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
-from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
 
-if TYPE_CHECKING:
-    from hud.clients.base import AgentMCPClient
-    from hud.datasets import Task
+from .types import BaseCreateParams
 
-    from .misc import ResponseAgent
+if TYPE_CHECKING:
+    from hud.environment import Environment
+    from hud.eval.context import EvalContext
 
 
 logger = logging.getLogger(__name__)
 
-GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task."  # noqa: E501
-
 
 class MCPAgent(ABC):
     """
     Base class for MCP-enabled agents.
 
-    Provides common behavior for agents that interact with MCP servers, including:
-    - Client management: accepts an `AgentMCPClient` or auto-creates one at
-      runtime when `run()` is called with a `Task` that includes `mcp_config`.
-    - Tool lifecycle: discovery, filtering (`allowed_tools`, `disallowed_tools`),
-      and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
-    - Messaging: system prompt handling, optional inclusion of setup output on
-      the first turn, and control over initial screenshots.
-    - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
-      automatic tracing (`auto_trace`).
+    Agents interact with MCP servers through an EvalContext:
+    - run(ctx): Main entry point - takes EvalContext from hud.eval()
+    - ctx.call_tool(): Used internally for all tool execution
+    - ctx.submit(): Called automatically with agent's final response
 
     Subclasses implement provider-specific formatting and response fetching
-    by overriding these abstract methods: `get_system_messages`, `get_response`,
-    `format_blocks`, and `format_tool_results`.
+    by overriding: `get_system_messages`, `get_response`, `format_blocks`,
+    and `format_tool_results`.
    """
 
-    metadata: dict[str, Any] | None = None
+    metadata: ClassVar[dict[str, Any] | None] = None
     required_tools: ClassVar[list[str]] = []  # Tools that must be available
+    config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
 
-    def __init__(
-        self,
-        mcp_client: AgentMCPClient | None = None,
-        # Filtering
-        allowed_tools: list[str] | None = None,
-        disallowed_tools: list[str] | None = None,
-        # Messages
-        system_prompt: str = GLOBAL_SYSTEM_PROMPT,
-        append_setup_output: bool = True,
-        initial_screenshot: bool = True,
-        # Misc
-        model_name: str = "mcp-agent",
-        response_agent: ResponseAgent | None = None,
-        auto_trace: bool = True,
-        verbose: bool = False,
-    ) -> None:
-        """
-        Initialize the base MCP agent.
+    def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
+        if params is None:
+            import warnings
 
-        Args:
-            mcp_client: Client for connecting to MCP servers. If None, a client
-                is auto-created at runtime when `run()` is called with a `Task`
-                that provides `mcp_config`.
-            allowed_tools: Names of tools to allow (None means allow all).
-            disallowed_tools: Names of tools to always exclude.
-            system_prompt: System prompt to seed the conversation.
-            append_setup_output: Whether to append setup tool output to the
-                first turn's messages.
-            initial_screenshot: Whether to include an initial screenshot before
-                the first prompt (when supported by the environment).
-            model_name: Label used in telemetry/logging to identify the model.
-            response_agent: Optional automation that can respond to the model's
-                outputs to keep the loop going (e.g., auto-continue/stop).
-            auto_trace: If True, automatically creates a trace/span for runs.
-            verbose: If True, increases logging verbosity for developer UX.
-        """
+            warnings.warn(
+                f"Passing kwargs to {self.__class__.__name__}() is deprecated. "
+                f"Use {self.__class__.__name__}.create(...) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            CreateParams = type(
+                f"{self.config_cls.__name__}CreateParams",
+                (BaseCreateParams, self.config_cls),
+                {"__module__": self.config_cls.__module__},
+            )
+            params = CreateParams(**kwargs)
 
-        self.mcp_client = mcp_client
-        self._auto_created_client = False  # Track if we created the client
+        config_kwargs = {
+            k: getattr(params, k) for k in self.config_cls.model_fields if hasattr(params, k)
+        }
+        self.config = self.config_cls(**config_kwargs)
 
-        self.model_name = model_name
-        self.console = HUDConsole(logger=logger)
+        # v5: Store execution context (EvalContext/Environment) - agent uses ctx.call_tool()
+        self.ctx: EvalContext | Environment | None = params.ctx
 
-        # Set verbose mode if requested
-        if verbose:
-            self.console.set_verbose(True)
+        self.model_name: str = getattr(params, "model_name", "MCPAgent")
+        self.model: str = getattr(params, "model", None) or "unknown"
+        self.auto_respond = params.auto_respond
 
-        # User filtering
-        self.allowed_tools = allowed_tools
-        self.disallowed_tools = disallowed_tools or []
+        self.console = HUDConsole(logger=logger)
 
-        # Task filtering
-        self.agent_tools = None
-        self.lifecycle_tools = []
+        if params.verbose:
+            self.console.set_verbose(True)
 
-        # Messages
-        self.system_prompt = system_prompt
-        self.append_setup_output = append_setup_output
-        self.initial_screenshot = initial_screenshot
+        self.system_prompt = self.config.system_prompt
 
-        # Initialize these here so methods can be called before initialize()
-        self._available_tools: list[types.Tool] = []
-        self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
-        self.response_tool_name = None
+        self._available_tools: list[types.Tool] | None = None
+        self._tool_map: dict[str, types.Tool] = {}
+        self._initialized: bool = False
 
-        # Trace
-        self._auto_trace = auto_trace
-        self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
+    @classmethod
+    def create(cls, **kwargs: Any) -> MCPAgent:
+        """
+        Factory method to create an agent with typed parameters.
+        """
+        CreateParams = type(
+            f"{cls.config_cls.__name__}CreateParams",
+            (BaseCreateParams, cls.config_cls),
+            {"__module__": cls.config_cls.__module__},
+        )
+        return cls(params=CreateParams(**kwargs))
 
-        # Response agent to automatically interact with the model
-        self.response_agent = response_agent
+    async def _initialize_from_ctx(self, ctx: EvalContext) -> None:
+        """Initialize agent from EvalContext - discovers tools and sets up state.
 
-    async def initialize(self, task: str | Task | None = None) -> None:
-        """Initialize the agent with task-specific configuration."""
-        from hud.datasets import Task
+        This is the v5 initialization path. The agent uses ctx.call_tool() directly
+        for tool execution (no EnvironmentClient wrapper needed).
+        """
+        from hud.eval.context import EvalContext
 
-        # Create client if needed
-        if self.mcp_client is None and isinstance(task, Task) and task.mcp_config:
-            from hud.clients import MCPClient
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
-            self.mcp_client = MCPClient(mcp_config=task.mcp_config)
-            self._auto_created_client = True
-            self.console.debug("Auto-created MCPClient from task.mcp_config")
+        # Refresh tools from connections, then get filtered list for agent
+        await ctx.list_tools()
+        self._available_tools = ctx.as_tools()
+        self._tool_map = {t.name: t for t in self._available_tools}
 
-        # Ensure we have a client
-        if self.mcp_client is None:
+        # Validate required tools are present
+        available_tool_names = {t.name for t in self._available_tools}
+        missing_tools = [tool for tool in self.required_tools if tool not in available_tool_names]
+        if missing_tools:
             raise ValueError(
-                "No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config."  # noqa: E501
+                f"Required tools are missing: {missing_tools}. "
+                f"Available tools: {sorted(available_tool_names)}"
             )
 
-        await self._setup_config(self.mcp_client.mcp_config)
+        self.console.info(
+            f"Agent initialized with {len(self._available_tools)} tools: "
+            f"{', '.join([t.name for t in self._available_tools])}"
+        )
 
-        # Initialize client if needed
-        try:
-            await self.mcp_client.initialize()
-        except Exception as e:
-            self._handle_connection_error(e)
-
-        # If task is provided, add lifecycle tools
-        if isinstance(task, Task):
-            if task.agent_tools:
-                self.agent_tools = task.agent_tools
-            if task.setup_tool:
-                if isinstance(task.setup_tool, list):
-                    for tool in task.setup_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.setup_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.setup_tool.name)
-            if task.evaluate_tool:
-                if isinstance(task.evaluate_tool, list):
-                    for tool in task.evaluate_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.evaluate_tool.name)
-            if task.system_prompt:
-                self.system_prompt += "\n\n" + task.system_prompt
-
-        # Re-apply filtering with updated lifecycle tools
-        await self._filter_tools()
-
-    async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
-        """
-        Run the agent with the given prompt or task.
+        # Call hook for subclass-specific initialization (e.g., tool format conversion)
+        self._on_tools_ready()
 
-        Args:
-            prompt_or_task: Either a string prompt for simple execution or a Task object
-            max_steps: Maximum number of steps (-1 for infinite)
+        self._initialized = True
 
-        Returns:
-            Trace with reward, done, content, isError fields and trace steps
+    def _on_tools_ready(self) -> None:
+        """Hook called after tools are discovered and validated.
+
+        Subclasses can override this to perform provider-specific setup,
+        such as converting MCP tools to the provider's format.
+
+        Called by _initialize_from_ctx() after _available_tools is populated.
         """
-        # Import here to avoid circular imports
-        from hud.datasets import Task
+        return  # Default no-op - subclasses override for provider-specific setup
 
-        if isinstance(prompt_or_task, dict):
-            prompt_or_task = Task(**prompt_or_task)
-        elif not isinstance(prompt_or_task, str) and not isinstance(prompt_or_task, Task):
-            raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
+    async def run(
+        self,
+        ctx: EvalContext,
+        *,
+        max_steps: int = 10,
+    ) -> Trace:
+        """
+        Run the agent on the given evaluation context.
 
-        try:
-            # Establish the connection with the MCP server/Environment
-            await self.initialize(prompt_or_task)
+        The agent uses ctx.prompt as the task and ctx.call_tool() for tool execution.
+        Automatically calls ctx.submit() with the final answer.
 
-            # Handle Task objects with full lifecycle
-            if isinstance(prompt_or_task, Task):
-                return await self.run_task(prompt_or_task, max_steps)
+        Args:
+            ctx: EvalContext from hud.eval() - contains prompt and tools
+            max_steps: Maximum number of agent steps (-1 for infinite)
 
-            # Handle simple string prompts
-            elif isinstance(prompt_or_task, str):
-                context = text_to_blocks(prompt_or_task)
-                return await self._run_context(context, max_steps=max_steps)
+        Returns:
+            Trace with done, content, isError fields
+
+        Example:
+            ```python
+            async with hud.eval(task) as ctx:
+                agent = ClaudeAgent.create()
+                await agent.run(ctx)
+                # ctx.reward is set by the scenario's evaluate phase
+            ```
+        """
+        from hud.eval.context import EvalContext
 
-        except Exception as e:
-            # Always return a Trace object for any exception
-            if self._is_connection_error(e):
-                # Return error trace for connection failures
-                return Trace(
-                    reward=0.0,
-                    done=True,
-                    content=self._get_connection_error_message(e),
-                    isError=True,
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
+
+        if not ctx.prompt:
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
                 )
             else:
-                # Return error trace for any other exception
-                return Trace(
-                    reward=0.0,
-                    done=True,
-                    content=f"Task failed with error: {e}",
-                    isError=True,
-                    info={"error": str(e)},
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
                 )
-        finally:
-            # Cleanup auto-created resources
-            await self._cleanup()
 
-    async def run_task(self, task: Task, max_steps: int = 10) -> Trace:
-        """
-        Execute a task with setup and evaluate phases.
+        # Store context for tool calls
+        self.ctx = ctx
 
-        Args:
-            task: Task object with prompt, setup, and evaluate configs
-            max_steps: Maximum steps for task execution (-1 for infinite)
+        # Initialize tools from context
+        if not self._initialized:
+            await self._initialize_from_ctx(ctx)
 
-        Returns:
-            Trace with reward from evaluation
-        """
         try:
-            # Setup phase
-            start_context: list[types.ContentBlock] = []
-
-            # Extract the initial task information
-            if task.prompt:
-                start_context.extend(text_to_blocks(task.prompt))
-
-            # Execute the setup tool and append the initial observation to the context
-            if task.setup_tool is not None:
-                self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
-                results = await self.call_tools(task.setup_tool)
-                if any(result.isError for result in results):
-                    return Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Setup tool failed: {results}",
-                        isError=True,
-                        task=task,
-                    )
-
-                if self.append_setup_output and isinstance(results[0].content, list):
-                    start_context.extend(results[0].content)
-            if not self.initial_screenshot:
-                start_context = await self._filter_messages(start_context, include_types=["text"])
-
-            # Execute the task (agent loop) - this returns a empty trace object with the final response  # noqa: E501
-            prompt_result = await self._run_context(start_context, max_steps=max_steps)
+            # Build initial context - optionally append setup tool output
+            # Check ctx first (task-level override), then fall back to agent config
+            append_setup = getattr(ctx, "append_setup_output", False) or getattr(
+                self.config, "append_setup_output", False
+            )
+            initial_prompt = ctx.prompt
+            if append_setup:
+                setup_output = getattr(ctx, "setup_output", None)
+                if setup_output:
+                    initial_prompt = f"{initial_prompt}\n\n{setup_output}"
 
-        except Exception as e:
-            self.console.error_log(f"Task execution failed: {e}")
-            # Create an error result but don't return yet - we still want to evaluate
-            prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True, task=task)
-            prompt_result.populate_from_context()
+            # Build initial blocks (text prompt + optional screenshot)
+            initial_blocks = text_to_blocks(initial_prompt)
 
-        # Always evaluate if we have evaluate tool, regardless of errors
-        if task.evaluate_tool is not None:
-            try:
-                results = await self.call_tools(task.evaluate_tool)
-
-                if any(result.isError for result in results):
-                    self.console.warning_log(f"Evaluate tool returned error: {results}")
-                    # Still extract what we can from the error response
-                    if prompt_result is None:
-                        prompt_result = Trace(
-                            reward=0.0,
-                            done=True,
-                            content="Task failed before evaluation",
-                            isError=True,
-                            task=task,
-                        )
-                    prompt_result.reward = 0.0  # Default to 0 on error
-                else:
-                    # Extract reward and content from evaluation
-                    if results:
-                        reward = find_reward(results[0])
-                        self.console.info_log(f"Eval: {reward:.4f} {task.evaluate_tool}")
-                        eval_content = find_content(results[0])
-
-                        # Update the prompt result with evaluation reward
-                        if prompt_result is None:
-                            prompt_result = Trace(
-                                reward=reward,
-                                done=True,
-                                content=eval_content or "",
-                                isError=False,
-                                task=task,
-                            )
-                        else:
-                            prompt_result.reward = reward
+            result = await self._run_context(initial_blocks, max_steps=max_steps)
 
-                        # Update the prompt result with evaluation content (if available)
-                        if eval_content:
-                            # Prompt result may already have final response content,
-                            # so we append to it
-                            if prompt_result.content:
-                                prompt_result.content += "\n\n" + eval_content
-                            else:
-                                prompt_result.content = eval_content
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
 
-            except Exception as e:
-                self.console.error_log(f"Evaluation phase failed: {e}")
-                # Ensure we have a result even if evaluation failed
-                if prompt_result is None:
-                    prompt_result = Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Evaluation failed: {e}",
-                        isError=True,
-                        task=task,
-                    )
+            # Submit final answer to context (only if scenario is running)
+            if result.content and ctx.has_scenario:
+                await ctx.submit(result.content)
 
-        prompt_result.task = task
+            return result
 
-        return prompt_result
+        except Exception as e:
+            logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
+            return Trace(
+                reward=0.0,
+                done=True,
+                content=f"Agent failed with error: {e}",
+                isError=True,
+                info={"error": str(e)},
+            )
+        finally:
+            # Cleanup auto-created resources
+            await self._cleanup()
 
     async def _run_context(
         self, context: list[types.ContentBlock], *, max_steps: int = 10
@@ -356,6 +256,8 @@ class MCPAgent(ABC):
         final_response = None
         error = None
 
+        messages: list[Any] = []
+
         try:
             # Start with system messages
             messages = await self.get_system_messages()
@@ -380,19 +282,17 @@
 
                 # Check if we should stop
                 if response.done or not response.tool_calls:
-                    # Optional external ResponseAgent to decide whether to stop
-                    decision = "STOP"
-                    if self.response_agent is not None and response.content:
+                    # Use auto_respond to decide whether to stop
+                    decision: Literal["STOP", "CONTINUE"] = "STOP"
+                    if self.auto_respond and response.content:
                         try:
-                            decision = await self.response_agent.determine_response(
-                                response.content
-                            )
+                            from hud.agents.misc import ResponseAgent
+
+                            response_agent = ResponseAgent()
+                            decision = await response_agent.determine_response(response.content)
                         except Exception as e:
-                            self.console.warning_log(f"ResponseAgent failed: {e}")
+                            self.console.warning_log(f"Auto-respond failed: {e}")
                     if decision == "STOP":
-                        # Try to submit response through lifecycle tool
-                        await self._maybe_submit_response(response, messages)
-
                         self.console.debug("Stopping execution")
                         final_response = response
                         break
@@ -403,11 +303,7 @@
 
                 # 2. Execute tools
                 tool_calls = response.tool_calls
-                for tool_call in tool_calls:
-                    self.console.info_log(f"{tool_call}")
                 tool_results = await self.call_tools(tool_calls)
-                for tool_result in tool_results:
-                    self.console.info_log(f"{tool_result}")
 
                 # 3. Format tool results and add to messages
                 tool_messages = await self.format_tool_results(tool_calls, tool_results)
@@ -449,8 +345,17 @@
             is_error = False
 
         # Ensure all parameters are the correct type
+        # Use ctx.reward if already set (e.g., from scenario evaluate), otherwise 0.0
+        # Note: For v4 tasks with evaluate_tool, reward is set in __aexit__ after this returns,
+        # so callers should prefer ctx.reward over Trace.reward for the final result.
+        reward = 0.0
+        if self.ctx is not None:
+            ctx_reward = getattr(self.ctx, "reward", None)
+            if ctx_reward is not None:
+                reward = ctx_reward
+
         trace_params = {
-            "reward": 0.0,
+            "reward": reward,
            "done": True,
            "messages": messages,
            "content": final_response.content if final_response else error,
@@ -459,16 +364,13 @@
         }
         trace_result = Trace(**trace_params)
 
-        # Populate trace steps from current context
-        trace_result.populate_from_context()
-
         return trace_result
 
     async def call_tools(
         self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
     ) -> list[MCPToolResult]:
         """
-        Call a tool through the MCP client.
+        Call tools through the bound EvalContext.
 
         Args:
             tool_call: MCPToolCall or list of MCPToolCall
@@ -482,20 +384,17 @@
         if isinstance(tool_call, MCPToolCall):
             tool_call = [tool_call]
 
-        if self.mcp_client is None:
-            raise ValueError("Client is not initialized")
+        if self.ctx is None:
+            raise ValueError("Agent not bound to context - call run(ctx) first")
 
         results: list[MCPToolResult] = []
         for tc in tool_call:
             try:
                 self.console.debug(f"Calling tool: {tc}")
-                results.append(await self.mcp_client.call_tool(tc))
+                result = await self.ctx.call_tool(tc)
+                results.append(MCPToolResult(content=result.content, isError=result.isError))
             except TimeoutError as e:
                 self.console.error_log(f"Tool execution timed out: {e}")
-                try:
-                    await self.mcp_client.shutdown()
-                except Exception as close_err:
-                    self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
                 raise
             except Exception as e:
                 self.console.error_log(f"Tool execution failed: {e}")
@@ -514,8 +413,6 @@
         """
         Get response from the model including any tool calls.
 
-        NOTE: Subclasses should decorate this method with:
-        @hud.instrument(span_type="agent", record_args=False, record_result=True)
 
         Args:
             messages: Current conversation messages
@@ -575,148 +472,13 @@
 
         return await self.format_blocks(blocks)
 
-    async def _filter_tools(self) -> None:
-        """Apply tool filtering based on allowed/disallowed lists."""
-        # Get all tools from client
-        if self.mcp_client is None:
-            raise ValueError("MCP client is not initialized")
-
-        all_tools = await self.mcp_client.list_tools()
-
-        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-        for tool in all_tools:
-            if "response" in tool.name or tool.name == "response":
-                self.console.debug(f"Found response tool: '{tool.name}'")
-                # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                if "_" in tool.name:
-                    server_name = tool.name.split("_", 1)[0]
-                    response_tools_by_server[server_name] = tool.name
-                else:
-                    response_tools_by_server["_default"] = tool.name
-
-        # Add response tool to lifecycle tools BEFORE filtering
-        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-            # Get server names in order from mcp_config
-            server_names = list(self.mcp_client.mcp_config.keys())
-            self.console.debug(f"Server names: {server_names}")
-
-            # Try to find response tool from last server first
-            response_tool_name = None
-            for server_name in reversed(server_names):
-                if server_name in response_tools_by_server:
-                    response_tool_name = response_tools_by_server[server_name]
-                    self.console.debug(
-                        f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                    )
-                    break
-
-            # Fallback to any response tool
-            if not response_tool_name and response_tools_by_server:
-                response_tool_name = next(iter(response_tools_by_server.values()))
-                self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-
-            # Add to lifecycle tools if found
-            if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                self.response_tool_name = response_tool_name
-                self.lifecycle_tools.append(response_tool_name)
-            elif response_tool_name:
-                self.console.debug(
-                    f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                )
-                self.response_tool_name = response_tool_name
-            else:
-                self.console.debug("No response tools found or no mcp_config")
-
-        # Filter tools
-        self._available_tools = []
-        self._tool_map = {}
-
-        self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-        self.console.debug(f"Allowed tools: {self.allowed_tools}")
-        self.console.debug(f"Agent tools: {self.agent_tools}")
-        self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-        self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-
-        for tool in all_tools:
-            # Lifecycle tools (setup, evaluate, response) should always be included
-            is_lifecycle = tool.name in self.lifecycle_tools
-
-            # Check if tool should be included
-            if not is_lifecycle:
-                if self.allowed_tools and tool.name not in self.allowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                    continue
-                if self.agent_tools and tool.name not in self.agent_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                    continue
-                if tool.name in self.disallowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                    continue
-
-            self.console.debug(
-                f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-            )
-            self._available_tools.append(tool)
-            self._tool_map[tool.name] = tool
-
-        # Check if all required tools are available
-        if self.required_tools:
-            available_tool_names = {tool.name for tool in self._available_tools}
-            missing_tools = [
-                tool for tool in self.required_tools if tool not in available_tool_names
-            ]
-            if missing_tools:
-                raise ValueError(
-                    f"Required tools not available: {missing_tools}. "
-                    f"Available tools: {list(available_tool_names)}"
-                )
-
-        available_tools = self.get_available_tools()
-        self.console.info(
-            f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-        )
-
-    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
-        """Submit response through lifecycle tool if available.
-
-        Args:
-            response: The agent's response
-            messages: The current message history (will be modified in-place)
-        """
-        if self.response_tool_name:
-            self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
-            try:
-                # Call the response tool with the agent's response
-                response_tool_call = MCPToolCall(
-                    name=self.response_tool_name, arguments={"response": response.content}
-                )
-                response_results = await self.call_tools(response_tool_call)
-
-                # Format and add the response tool results to messages
-                response_messages = await self.format_tool_results(
-                    [response_tool_call], response_results
-                )
-                messages.extend(response_messages)
-
-                # Mark the task as done
-                self.console.debug("Response lifecycle tool executed, marking task as done")
-            except Exception as e:
-                self.console.error_log(f"Response lifecycle tool failed: {e}")
-
-    async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
-        """Inject metadata into the metadata of the initialize request."""
-        if self.metadata:
-            patch_mcp_config(
-                mcp_config,
-                MCPConfigPatch(meta=self.metadata),
-            )
-        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
-
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
-        lifecycle_tool_names = self.lifecycle_tools
-        return [tool for tool in self._available_tools if tool.name not in lifecycle_tool_names]
+        if self._available_tools is None:
+            raise RuntimeError(
+                "Tools have not been initialized. Call initialize() before accessing available tools."  # noqa: E501
+            )
+        return self._available_tools
 
     def get_tool_schemas(self) -> list[dict]:
         """Get tool schemas in a format suitable for the model."""
@@ -752,65 +514,8 @@
 
     async def _cleanup(self) -> None:
         """Cleanup resources."""
-        # Clean up auto-created trace if any
-        if self._auto_trace_cm:
-            try:
-                self._auto_trace_cm.__exit__(None, None, None)
-                self.console.debug("Closed auto-created trace")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created trace: {e}")
-            finally:
-                self._auto_trace_cm = None
-
-        # Clean up auto-created client
-        if self._auto_created_client and self.mcp_client:
-            try:
-                await self.mcp_client.shutdown()
-                self.console.debug("Closed auto-created MCPClient")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created client: {e}")
-            finally:
-                self.mcp_client = None
-                self._auto_created_client = False
-
-    def _is_connection_error(self, e: Exception) -> bool:
-        """Check if an exception is a connection error."""
-        error_msg = str(e).lower()
-        return any(
-            pattern in error_msg
-            for pattern in [
-                "connection",
-                "connect",
-                "refused",
-                "failed",
-                "could not connect",
-                "mcp server",
-            ]
-        )
-
-    def _get_connection_error_message(self, e: Exception) -> str:
-        """Extract a helpful connection error message."""
-        import re
-
-        url_match = re.search(r"https?://[^\s]+", str(e))
-        url = url_match.group(0) if url_match else "the MCP server"
-        return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
-
-    def _handle_connection_error(self, e: Exception) -> None:
-        """Handle connection errors with helpful messages."""
-        if self._is_connection_error(e):
-            msg = self._get_connection_error_message(e)
-            # Always show connection errors, not just when logging is enabled
-            self.console.error(f"❌ {msg}")
-            self.console.info("💡 Make sure the MCP server is started before running the agent.")
-
-            # For localhost, provide specific instructions
-            error_str = str(e).lower()
-            if "localhost" in error_str or "127.0.0.1" in error_str:
-                self.console.info("   Run 'hud dev' in another terminal to start the MCP server")
-
-            raise RuntimeError(msg) from e
-        raise
+        # Clear context reference
+        self.ctx = None
 
 
 def _format_error_result(error_message: str) -> MCPToolResult:
@@ -824,14 +529,45 @@ def text_to_blocks(text: str) -> list[types.ContentBlock]:
 def find_reward(result: MCPToolResult) -> float:
     """Find the reward in the result.
 
-    Agent accepts "reward", "grade", "score"
+    Agent accepts "reward", "grade", "score", or weighted subscores
 
+    If isError is True, return 0.0 (error results should not contribute positive reward).
     If not found, return 0.0
     """
+    # Error results should return 0.0 - don't extract reward from error responses
+    if result.isError:
+        logger.warning("Evaluate tool returned error, using reward=0.0")
+        return 0.0
+
     accept_keys = ["reward", "grade", "score"]
+
+    # Check for direct reward/grade/score keys
     for key in accept_keys:
         if isinstance(result.structuredContent, dict) and key in result.structuredContent:
             return result.structuredContent[key]
+
+    # Check for subscores and weights format
+    if (
+        isinstance(result.structuredContent, dict)
+        and "subscores" in result.structuredContent
+        and "weights" in result.structuredContent
+    ):
+        subscores = result.structuredContent["subscores"]
+        weights = result.structuredContent["weights"]
+        if isinstance(subscores, dict) and isinstance(weights, dict):
+            try:
+                # Multiply each subscore by its corresponding weight and sum
+                reward = sum(
+                    float(subscores[key]) * float(weights.get(key, 0.0))
+                    for key in subscores
+                    if key in weights
+                )
+                return reward
+            except (ValueError, TypeError) as e:
+                logger.error("Failed to parse subscores/weights: %s", e)
+                return 0.0
+
+    # Check for reward in JSON text content
     if isinstance(result.content, list):
         for content in result.content:
             if isinstance(content, types.TextContent):
@@ -842,6 +578,8 @@ def find_reward(result: MCPToolResult) -> float:
                     return value
                 except json.JSONDecodeError:
                     pass
+
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0
 
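One behavioral addition worth calling out from the diff above: `find_reward` now also accepts a weighted-subscores payload and computes a plain weighted sum, returning 0.0 for error results or parse failures. A worked example of that arithmetic, using hypothetical subscore names:

```python
# Hypothetical structuredContent an evaluate tool might return:
structured = {
    "subscores": {"accuracy": 0.8, "completeness": 0.5},
    "weights": {"accuracy": 0.75, "completeness": 0.25},
}

# find_reward pairs each subscore with its weight and sums the products:
#   0.8 * 0.75 + 0.5 * 0.25 = 0.6 + 0.125 = 0.725
reward = sum(
    float(structured["subscores"][k]) * float(structured["weights"].get(k, 0.0))
    for k in structured["subscores"]
    if k in structured["weights"]
)
assert abs(reward - 0.725) < 1e-9
```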