hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py CHANGED
@@ -9,336 +9,233 @@ from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING, Any, ClassVar, Literal

  import mcp.types as types
+ from pydantic import BaseModel, ConfigDict

- from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
  from hud.utils.hud_console import HUDConsole
- from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry

  if TYPE_CHECKING:
-     from hud.clients.base import AgentMCPClient
-     from hud.datasets import Task
-
-     from .misc import ResponseAgent
+     from hud.environment import Environment
+     from hud.eval.context import EvalContext


  logger = logging.getLogger(__name__)

- GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task."  # noqa: E501
+
+ class BaseCreateParams(BaseModel):
+     """Runtime parameters for agent creation."""
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     # Primary way to bind agent to execution context (v5)
+     ctx: Any | None = None  # EvalContext or Environment - agent uses this for tool calls
+
+     auto_respond: bool = False
+     verbose: bool = False


  class MCPAgent(ABC):
      """
      Base class for MCP-enabled agents.

-     Provides common behavior for agents that interact with MCP servers, including:
-     - Client management: accepts an `AgentMCPClient` or auto-creates one at
-       runtime when `run()` is called with a `Task` that includes `mcp_config`.
-     - Tool lifecycle: discovery, filtering (`allowed_tools`, `disallowed_tools`),
-       and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
-     - Messaging: system prompt handling, optional inclusion of setup output on
-       the first turn, and control over initial screenshots.
-     - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
-       automatic tracing (`auto_trace`).
+     Agents interact with MCP servers through an EvalContext:
+     - run(ctx): Main entry point - takes EvalContext from hud.eval()
+     - ctx.call_tool(): Used internally for all tool execution
+     - ctx.submit(): Called automatically with agent's final response

      Subclasses implement provider-specific formatting and response fetching
-     by overriding these abstract methods: `get_system_messages`, `get_response`,
-     `format_blocks`, and `format_tool_results`.
+     by overriding: `get_system_messages`, `get_response`, `format_blocks`,
+     and `format_tool_results`.
      """

-     metadata: dict[str, Any] | None = None
+     metadata: ClassVar[dict[str, Any] | None] = None
      required_tools: ClassVar[list[str]] = []  # Tools that must be available
+     config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig

-     def __init__(
-         self,
-         mcp_client: AgentMCPClient | None = None,
-         # Filtering
-         allowed_tools: list[str] | None = None,
-         disallowed_tools: list[str] | None = None,
-         # Messages
-         system_prompt: str = GLOBAL_SYSTEM_PROMPT,
-         append_setup_output: bool = True,
-         initial_screenshot: bool = True,
-         # Misc
-         model_name: str = "mcp-agent",
-         response_agent: ResponseAgent | None = None,
-         auto_trace: bool = True,
-         verbose: bool = False,
-     ) -> None:
-         """
-         Initialize the base MCP agent.
+     def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
+         if params is None:
+             import warnings

-         Args:
-             mcp_client: Client for connecting to MCP servers. If None, a client
-                 is auto-created at runtime when `run()` is called with a `Task`
-                 that provides `mcp_config`.
-             allowed_tools: Names of tools to allow (None means allow all).
-             disallowed_tools: Names of tools to always exclude.
-             system_prompt: System prompt to seed the conversation.
-             append_setup_output: Whether to append setup tool output to the
-                 first turn's messages.
-             initial_screenshot: Whether to include an initial screenshot before
-                 the first prompt (when supported by the environment).
-             model_name: Label used in telemetry/logging to identify the model.
-             response_agent: Optional automation that can respond to the model's
-                 outputs to keep the loop going (e.g., auto-continue/stop).
-             auto_trace: If True, automatically creates a trace/span for runs.
-             verbose: If True, increases logging verbosity for developer UX.
-         """
+             warnings.warn(
+                 f"Passing kwargs to {self.__class__.__name__}() is deprecated. "
+                 f"Use {self.__class__.__name__}.create(...) instead.",
+                 DeprecationWarning,
+                 stacklevel=2,
+             )
+             CreateParams = type(
+                 f"{self.config_cls.__name__}CreateParams",
+                 (BaseCreateParams, self.config_cls),
+                 {"__module__": self.config_cls.__module__},
+             )
+             params = CreateParams(**kwargs)

-         self.mcp_client = mcp_client
-         self._auto_created_client = False  # Track if we created the client
+         config_kwargs = {
+             k: getattr(params, k) for k in self.config_cls.model_fields if hasattr(params, k)
+         }
+         self.config = self.config_cls(**config_kwargs)

-         self.model_name = model_name
-         self.console = HUDConsole(logger=logger)
+         # v5: Store execution context (EvalContext/Environment) - agent uses ctx.call_tool()
+         self.ctx: EvalContext | Environment | None = params.ctx

-         # Set verbose mode if requested
-         if verbose:
-             self.console.set_verbose(True)
+         self.model_name: str = getattr(params, "model_name", "MCPAgent")
+         self.model: str = getattr(params, "model", None) or "unknown"
+         self.auto_respond = params.auto_respond

-         # User filtering
-         self.allowed_tools = allowed_tools
-         self.disallowed_tools = disallowed_tools or []
+         self.console = HUDConsole(logger=logger)

-         # Task filtering
-         self.agent_tools = None
-         self.lifecycle_tools = []
+         if params.verbose:
+             self.console.set_verbose(True)

-         # Messages
-         self.system_prompt = system_prompt
-         self.append_setup_output = append_setup_output
-         self.initial_screenshot = initial_screenshot
+         self.system_prompt = self.config.system_prompt

-         # Initialize these here so methods can be called before initialize()
-         self._available_tools: list[types.Tool] = []
-         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
-         self.response_tool_name = None
+         self._available_tools: list[types.Tool] | None = None
+         self._tool_map: dict[str, types.Tool] = {}
+         self._initialized: bool = False

-         # Trace
-         self._auto_trace = auto_trace
-         self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
+     @classmethod
+     def create(cls, **kwargs: Any) -> MCPAgent:
+         """
+         Factory method to create an agent with typed parameters.
+         """
+         CreateParams = type(
+             f"{cls.config_cls.__name__}CreateParams",
+             (BaseCreateParams, cls.config_cls),
+             {"__module__": cls.config_cls.__module__},
+         )
+         return cls(params=CreateParams(**kwargs))

-         # Response agent to automatically interact with the model
-         self.response_agent = response_agent
+     async def _initialize_from_ctx(self, ctx: EvalContext) -> None:
+         """Initialize agent from EvalContext - discovers tools and sets up state.

-     async def initialize(self, task: str | Task | None = None) -> None:
-         """Initialize the agent with task-specific configuration."""
-         from hud.datasets import Task
+         This is the v5 initialization path. The agent uses ctx.call_tool() directly
+         for tool execution (no EnvironmentClient wrapper needed).
+         """
+         from hud.eval.context import EvalContext

-         # Create client if needed
-         if self.mcp_client is None and isinstance(task, Task) and task.mcp_config:
-             from hud.clients import MCPClient
+         if not isinstance(ctx, EvalContext):
+             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

-             self.mcp_client = MCPClient(mcp_config=task.mcp_config)
-             self._auto_created_client = True
-             self.console.debug("Auto-created MCPClient from task.mcp_config")
+         # Refresh tools from connections, then get filtered list for agent
+         await ctx.list_tools()
+         self._available_tools = ctx.as_tools()
+         self._tool_map = {t.name: t for t in self._available_tools}

-         # Ensure we have a client
-         if self.mcp_client is None:
+         # Validate required tools are present
+         available_tool_names = {t.name for t in self._available_tools}
+         missing_tools = [tool for tool in self.required_tools if tool not in available_tool_names]
+         if missing_tools:
              raise ValueError(
-                 "No MCPClient. Please provide one when initializing the agent or pass a Task with mcp_config."  # noqa: E501
+                 f"Required tools are missing: {missing_tools}. "
+                 f"Available tools: {sorted(available_tool_names)}"
              )

-         await self._setup_config(self.mcp_client.mcp_config)
+         self.console.info(
+             f"Agent initialized with {len(self._available_tools)} tools: "
+             f"{', '.join([t.name for t in self._available_tools])}"
+         )

-         # Initialize client if needed
-         try:
-             await self.mcp_client.initialize()
-         except Exception as e:
-             self._handle_connection_error(e)
-
-         # If task is provided, add lifecycle tools
-         if isinstance(task, Task):
-             if task.agent_tools:
-                 self.agent_tools = task.agent_tools
-             if task.setup_tool:
-                 if isinstance(task.setup_tool, list):
-                     for tool in task.setup_tool:
-                         if not self.agent_tools or (
-                             self.agent_tools and tool.name not in self.agent_tools
-                         ):
-                             self.lifecycle_tools.append(tool.name)
-                 elif not self.agent_tools or (
-                     self.agent_tools and task.setup_tool.name not in self.agent_tools
-                 ):
-                     self.lifecycle_tools.append(task.setup_tool.name)
-             if task.evaluate_tool:
-                 if isinstance(task.evaluate_tool, list):
-                     for tool in task.evaluate_tool:
-                         if not self.agent_tools or (
-                             self.agent_tools and tool.name not in self.agent_tools
-                         ):
-                             self.lifecycle_tools.append(tool.name)
-                 elif not self.agent_tools or (
-                     self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                 ):
-                     self.lifecycle_tools.append(task.evaluate_tool.name)
-             if task.system_prompt:
-                 self.system_prompt += "\n\n" + task.system_prompt
-
-         # Re-apply filtering with updated lifecycle tools
-         await self._filter_tools()
-
-     async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
-         """
-         Run the agent with the given prompt or task.
+         # Call hook for subclass-specific initialization (e.g., tool format conversion)
+         self._on_tools_ready()

-         Args:
-             prompt_or_task: Either a string prompt for simple execution or a Task object
-             max_steps: Maximum number of steps (-1 for infinite)
+         self._initialized = True

-         Returns:
-             Trace with reward, done, content, isError fields and trace steps
+     def _on_tools_ready(self) -> None:
+         """Hook called after tools are discovered and validated.
+
+         Subclasses can override this to perform provider-specific setup,
+         such as converting MCP tools to the provider's format.
+
+         Called by _initialize_from_ctx() after _available_tools is populated.
          """
-         # Import here to avoid circular imports
-         from hud.datasets import Task
+         return  # Default no-op - subclasses override for provider-specific setup

-         if isinstance(prompt_or_task, dict):
-             prompt_or_task = Task(**prompt_or_task)
-         elif not isinstance(prompt_or_task, str) and not isinstance(prompt_or_task, Task):
-             raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
+     async def run(
+         self,
+         ctx: EvalContext,
+         *,
+         max_steps: int = 10,
+     ) -> Trace:
+         """
+         Run the agent on the given evaluation context.

-         try:
-             # Establish the connection with the MCP server/Environment
-             await self.initialize(prompt_or_task)
+         The agent uses ctx.prompt as the task and ctx.call_tool() for tool execution.
+         Automatically calls ctx.submit() with the final answer.

-             # Handle Task objects with full lifecycle
-             if isinstance(prompt_or_task, Task):
-                 return await self.run_task(prompt_or_task, max_steps)
+         Args:
+             ctx: EvalContext from hud.eval() - contains prompt and tools
+             max_steps: Maximum number of agent steps (-1 for infinite)
+
+         Returns:
+             Trace with done, content, isError fields
+
+         Example:
+             ```python
+             async with hud.eval(task) as ctx:
+                 agent = ClaudeAgent.create()
+                 await agent.run(ctx)
+             # ctx.reward is set by the scenario's evaluate phase
+             ```
+         """
+         from hud.eval.context import EvalContext

-             # Handle simple string prompts
-             elif isinstance(prompt_or_task, str):
-                 context = text_to_blocks(prompt_or_task)
-                 return await self._run_context(context, max_steps=max_steps)
+         if not isinstance(ctx, EvalContext):
+             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

-         except Exception as e:
-             # Always return a Trace object for any exception
-             if self._is_connection_error(e):
-                 # Return error trace for connection failures
-                 return Trace(
-                     reward=0.0,
-                     done=True,
-                     content=self._get_connection_error_message(e),
-                     isError=True,
+         if not ctx.prompt:
+             if ctx.has_scenario:
+                 # Scenario was specified but prompt is still empty
+                 # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                 scenario = ctx._task.scenario if ctx._task else "unknown"
+                 raise ValueError(
+                     f"ctx.prompt is not set.\n\n"
+                     f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                     f"Check that the scenario's setup function returns a non-empty string."
                  )
              else:
-                 # Return error trace for any other exception
-                 return Trace(
-                     reward=0.0,
-                     done=True,
-                     content=f"Task failed with error: {e}",
-                     isError=True,
-                     info={"error": str(e)},
+                 # No scenario specified at all
+                 raise ValueError(
+                     "ctx.prompt is not set.\n\n"
+                     "No scenario was specified in your task file.\n"
+                     "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                     "before running the agent."
                  )
-         finally:
-             # Cleanup auto-created resources
-             await self._cleanup()

-     async def run_task(self, task: Task, max_steps: int = 10) -> Trace:
-         """
-         Execute a task with setup and evaluate phases.
+         # Store context for tool calls
+         self.ctx = ctx

-         Args:
-             task: Task object with prompt, setup, and evaluate configs
-             max_steps: Maximum steps for task execution (-1 for infinite)
+         # Initialize tools from context
+         if not self._initialized:
+             await self._initialize_from_ctx(ctx)

-         Returns:
-             Trace with reward from evaluation
-         """
          try:
-             # Setup phase
-             start_context: list[types.ContentBlock] = []
-
-             # Extract the initial task information
-             if task.prompt:
-                 start_context.extend(text_to_blocks(task.prompt))
-
-             # Execute the setup tool and append the initial observation to the context
-             if task.setup_tool is not None:
-                 self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
-                 results = await self.call_tools(task.setup_tool)
-                 if any(result.isError for result in results):
-                     return Trace(
-                         reward=0.0,
-                         done=True,
-                         content=f"Setup tool failed: {results}",
-                         isError=True,
-                         task=task,
-                     )
-
-                 if self.append_setup_output and isinstance(results[0].content, list):
-                     start_context.extend(results[0].content)
-             if not self.initial_screenshot:
-                 start_context = await self._filter_messages(start_context, include_types=["text"])
-
-             # Execute the task (agent loop) - this returns a empty trace object with the final response  # noqa: E501
-             prompt_result = await self._run_context(start_context, max_steps=max_steps)
-
-         except Exception as e:
-             self.console.error_log(f"Task execution failed: {e}")
-             # Create an error result but don't return yet - we still want to evaluate
-             prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True, task=task)
-             prompt_result.populate_from_context()
-
-         # Always evaluate if we have evaluate tool, regardless of errors
-         if task.evaluate_tool is not None:
-             try:
-                 results = await self.call_tools(task.evaluate_tool)
-
-                 if any(result.isError for result in results):
-                     self.console.warning_log(f"Evaluate tool returned error: {results}")
-                     # Still extract what we can from the error response
-                     if prompt_result is None:
-                         prompt_result = Trace(
-                             reward=0.0,
-                             done=True,
-                             content="Task failed before evaluation",
-                             isError=True,
-                             task=task,
-                         )
-                     prompt_result.reward = 0.0  # Default to 0 on error
-                 else:
-                     # Extract reward and content from evaluation
-                     if results:
-                         reward = find_reward(results[0])
-                         self.console.info_log(f"Eval: {reward:.4f} {task.evaluate_tool}")
-                         eval_content = find_content(results[0])
-
-                         # Update the prompt result with evaluation reward
-                         if prompt_result is None:
-                             prompt_result = Trace(
-                                 reward=reward,
-                                 done=True,
-                                 content=eval_content or "",
-                                 isError=False,
-                                 task=task,
-                             )
-                         else:
-                             prompt_result.reward = reward
+             result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)

-                         # Update the prompt result with evaluation content (if available)
-                         if eval_content:
-                             # Prompt result may already have final response content,
-                             # so we append to it
-                             if prompt_result.content:
-                                 prompt_result.content += "\n\n" + eval_content
-                             else:
-                                 prompt_result.content = eval_content
+             # Propagate error state to context for platform visibility
+             if result.isError and hasattr(ctx, "error"):
+                 error_msg = result.info.get("error") if result.info else result.content
+                 ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")

-             except Exception as e:
-                 self.console.error_log(f"Evaluation phase failed: {e}")
-                 # Ensure we have a result even if evaluation failed
-                 if prompt_result is None:
-                     prompt_result = Trace(
-                         reward=0.0,
-                         done=True,
-                         content=f"Evaluation failed: {e}",
-                         isError=True,
-                         task=task,
-                     )
+             # Submit final answer to context (only if scenario is running)
+             if result.content and ctx.has_scenario:
+                 await ctx.submit(result.content)

-         prompt_result.task = task
+             return result

-         return prompt_result
+         except Exception as e:
+             logger.exception("Error while running agent:")
+             # Propagate error to context for platform visibility
+             if hasattr(ctx, "error"):
+                 ctx.error = e
+             return Trace(
+                 reward=0.0,
+                 done=True,
+                 content=f"Agent failed with error: {e}",
+                 isError=True,
+                 info={"error": str(e)},
+             )
+         finally:
+             # Cleanup auto-created resources
+             await self._cleanup()

      async def _run_context(
          self, context: list[types.ContentBlock], *, max_steps: int = 10
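
The hunk above replaces the 0.4.x entry points (initialize(), run(prompt_or_task), run_task()) with a single context-bound run(ctx). A minimal usage sketch, assuming the hud.eval() context manager and the ClaudeAgent.create() factory shown in the docstring above; the task value and the hud.agents import path are illustrative:

import hud
from hud.agents import ClaudeAgent  # assumed export path

async def evaluate_task(task) -> None:
    # 0.4.x: agent = ClaudeAgent(mcp_client=client); await agent.run(task)
    # 0.5.x: hud.eval() owns the environment; the agent binds to the context
    async with hud.eval(task) as ctx:
        agent = ClaudeAgent.create(auto_respond=False, verbose=True)
        trace = await agent.run(ctx, max_steps=10)
        print(trace.content, trace.isError)
    # ctx.reward is populated by the scenario's evaluate phase on exit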
@@ -356,6 +253,8 @@ class MCPAgent(ABC):
          final_response = None
          error = None

+         messages: list[Any] = []
+
          try:
              # Start with system messages
              messages = await self.get_system_messages()
@@ -380,19 +279,17 @@ class MCPAgent(ABC):

                  # Check if we should stop
                  if response.done or not response.tool_calls:
-                     # Optional external ResponseAgent to decide whether to stop
-                     decision = "STOP"
-                     if self.response_agent is not None and response.content:
+                     # Use auto_respond to decide whether to stop
+                     decision: Literal["STOP", "CONTINUE"] = "STOP"
+                     if self.auto_respond and response.content:
                          try:
-                             decision = await self.response_agent.determine_response(
-                                 response.content
-                             )
+                             from hud.agents.misc import ResponseAgent
+
+                             response_agent = ResponseAgent()
+                             decision = await response_agent.determine_response(response.content)
                          except Exception as e:
-                             self.console.warning_log(f"ResponseAgent failed: {e}")
+                             self.console.warning_log(f"Auto-respond failed: {e}")
                      if decision == "STOP":
-                         # Try to submit response through lifecycle tool
-                         await self._maybe_submit_response(response, messages)
-
                          self.console.debug("Stopping execution")
                          final_response = response
                          break
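
Note the dependency inversion in this hunk: the injected response_agent is gone, and auto_respond=True makes the loop construct hud.agents.misc.ResponseAgent itself when the model stops with content. A one-line sketch of opting in via the same create() factory:

# The base loop instantiates ResponseAgent internally (see the hunk above),
# so nothing is injected; callers only flip the flag at creation time.
agent = ClaudeAgent.create(auto_respond=True)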
@@ -403,11 +300,7 @@ class MCPAgent(ABC):

                  # 2. Execute tools
                  tool_calls = response.tool_calls
-                 for tool_call in tool_calls:
-                     self.console.info_log(f"{tool_call}")
                  tool_results = await self.call_tools(tool_calls)
-                 for tool_result in tool_results:
-                     self.console.info_log(f"{tool_result}")

                  # 3. Format tool results and add to messages
                  tool_messages = await self.format_tool_results(tool_calls, tool_results)
@@ -459,16 +352,13 @@ class MCPAgent(ABC):
              }
              trace_result = Trace(**trace_params)

-             # Populate trace steps from current context
-             trace_result.populate_from_context()
-
              return trace_result

      async def call_tools(
          self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
      ) -> list[MCPToolResult]:
          """
-         Call a tool through the MCP client.
+         Call tools through the bound EvalContext.

          Args:
              tool_call: MCPToolCall or list of MCPToolCall
@@ -482,20 +372,17 @@ class MCPAgent(ABC):
          if isinstance(tool_call, MCPToolCall):
              tool_call = [tool_call]

-         if self.mcp_client is None:
-             raise ValueError("Client is not initialized")
+         if self.ctx is None:
+             raise ValueError("Agent not bound to context - call run(ctx) first")

          results: list[MCPToolResult] = []
          for tc in tool_call:
              try:
                  self.console.debug(f"Calling tool: {tc}")
-                 results.append(await self.mcp_client.call_tool(tc))
+                 result = await self.ctx.call_tool(tc)
+                 results.append(MCPToolResult(content=result.content, isError=result.isError))
              except TimeoutError as e:
                  self.console.error_log(f"Tool execution timed out: {e}")
-                 try:
-                     await self.mcp_client.shutdown()
-                 except Exception as close_err:
-                     self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
                  raise
              except Exception as e:
                  self.console.error_log(f"Tool execution failed: {e}")
@@ -514,8 +401,6 @@ class MCPAgent(ABC):
          """
          Get response from the model including any tool calls.

-         NOTE: Subclasses should decorate this method with:
-         @hud.instrument(span_type="agent", record_args=False, record_result=True)

          Args:
              messages: Current conversation messages
@@ -575,148 +460,13 @@ class MCPAgent(ABC):

          return await self.format_blocks(blocks)

-     async def _filter_tools(self) -> None:
-         """Apply tool filtering based on allowed/disallowed lists."""
-         # Get all tools from client
-         if self.mcp_client is None:
-             raise ValueError("MCP client is not initialized")
-
-         all_tools = await self.mcp_client.list_tools()
-
-         response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-         for tool in all_tools:
-             if "response" in tool.name or tool.name == "response":
-                 self.console.debug(f"Found response tool: '{tool.name}'")
-                 # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                 if "_" in tool.name:
-                     server_name = tool.name.split("_", 1)[0]
-                     response_tools_by_server[server_name] = tool.name
-                 else:
-                     response_tools_by_server["_default"] = tool.name
-
-         # Add response tool to lifecycle tools BEFORE filtering
-         if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-             # Get server names in order from mcp_config
-             server_names = list(self.mcp_client.mcp_config.keys())
-             self.console.debug(f"Server names: {server_names}")
-
-             # Try to find response tool from last server first
-             response_tool_name = None
-             for server_name in reversed(server_names):
-                 if server_name in response_tools_by_server:
-                     response_tool_name = response_tools_by_server[server_name]
-                     self.console.debug(
-                         f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                     )
-                     break
-
-             # Fallback to any response tool
-             if not response_tool_name and response_tools_by_server:
-                 response_tool_name = next(iter(response_tools_by_server.values()))
-                 self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-
-             # Add to lifecycle tools if found
-             if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                 self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                 self.response_tool_name = response_tool_name
-                 self.lifecycle_tools.append(response_tool_name)
-             elif response_tool_name:
-                 self.console.debug(
-                     f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                 )
-                 self.response_tool_name = response_tool_name
-             else:
-                 self.console.debug("No response tools found or no mcp_config")
-
-         # Filter tools
-         self._available_tools = []
-         self._tool_map = {}
-
-         self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-         self.console.debug(f"Allowed tools: {self.allowed_tools}")
-         self.console.debug(f"Agent tools: {self.agent_tools}")
-         self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-         self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-
-         for tool in all_tools:
-             # Lifecycle tools (setup, evaluate, response) should always be included
-             is_lifecycle = tool.name in self.lifecycle_tools
-
-             # Check if tool should be included
-             if not is_lifecycle:
-                 if self.allowed_tools and tool.name not in self.allowed_tools:
-                     self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                     continue
-                 if self.agent_tools and tool.name not in self.agent_tools:
-                     self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                     continue
-                 if tool.name in self.disallowed_tools:
-                     self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                     continue
-
-             self.console.debug(
-                 f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-             )
-             self._available_tools.append(tool)
-             self._tool_map[tool.name] = tool
-
-         # Check if all required tools are available
-         if self.required_tools:
-             available_tool_names = {tool.name for tool in self._available_tools}
-             missing_tools = [
-                 tool for tool in self.required_tools if tool not in available_tool_names
-             ]
-             if missing_tools:
-                 raise ValueError(
-                     f"Required tools not available: {missing_tools}. "
-                     f"Available tools: {list(available_tool_names)}"
-                 )
-
-         available_tools = self.get_available_tools()
-         self.console.info(
-             f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-         )
-
-     async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
-         """Submit response through lifecycle tool if available.
-
-         Args:
-             response: The agent's response
-             messages: The current message history (will be modified in-place)
-         """
-         if self.response_tool_name:
-             self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
-             try:
-                 # Call the response tool with the agent's response
-                 response_tool_call = MCPToolCall(
-                     name=self.response_tool_name, arguments={"response": response.content}
-                 )
-                 response_results = await self.call_tools(response_tool_call)
-
-                 # Format and add the response tool results to messages
-                 response_messages = await self.format_tool_results(
-                     [response_tool_call], response_results
-                 )
-                 messages.extend(response_messages)
-
-                 # Mark the task as done
-                 self.console.debug("Response lifecycle tool executed, marking task as done")
-             except Exception as e:
-                 self.console.error_log(f"Response lifecycle tool failed: {e}")
-
-     async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
-         """Inject metadata into the metadata of the initialize request."""
-         if self.metadata:
-             patch_mcp_config(
-                 mcp_config,
-                 MCPConfigPatch(meta=self.metadata),
-             )
-         self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
-
      def get_available_tools(self) -> list[types.Tool]:
          """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
-         lifecycle_tool_names = self.lifecycle_tools
-         return [tool for tool in self._available_tools if tool.name not in lifecycle_tool_names]
+         if self._available_tools is None:
+             raise RuntimeError(
+                 "Tools have not been initialized. Call initialize() before accessing available tools."  # noqa: E501
+             )
+         return self._available_tools

      def get_tool_schemas(self) -> list[dict]:
          """Get tool schemas in a format suitable for the model."""
@@ -752,65 +502,8 @@ class MCPAgent(ABC):

      async def _cleanup(self) -> None:
          """Cleanup resources."""
-         # Clean up auto-created trace if any
-         if self._auto_trace_cm:
-             try:
-                 self._auto_trace_cm.__exit__(None, None, None)
-                 self.console.debug("Closed auto-created trace")
-             except Exception as e:
-                 self.console.warning_log(f"Failed to close auto-created trace: {e}")
-             finally:
-                 self._auto_trace_cm = None
-
-         # Clean up auto-created client
-         if self._auto_created_client and self.mcp_client:
-             try:
-                 await self.mcp_client.shutdown()
-                 self.console.debug("Closed auto-created MCPClient")
-             except Exception as e:
-                 self.console.warning_log(f"Failed to close auto-created client: {e}")
-             finally:
-                 self.mcp_client = None
-                 self._auto_created_client = False
-
-     def _is_connection_error(self, e: Exception) -> bool:
-         """Check if an exception is a connection error."""
-         error_msg = str(e).lower()
-         return any(
-             pattern in error_msg
-             for pattern in [
-                 "connection",
-                 "connect",
-                 "refused",
-                 "failed",
-                 "could not connect",
-                 "mcp server",
-             ]
-         )
-
-     def _get_connection_error_message(self, e: Exception) -> str:
-         """Extract a helpful connection error message."""
-         import re
-
-         url_match = re.search(r"https?://[^\s]+", str(e))
-         url = url_match.group(0) if url_match else "the MCP server"
-         return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
-
-     def _handle_connection_error(self, e: Exception) -> None:
-         """Handle connection errors with helpful messages."""
-         if self._is_connection_error(e):
-             msg = self._get_connection_error_message(e)
-             # Always show connection errors, not just when logging is enabled
-             self.console.error(f"❌ {msg}")
-             self.console.info("💡 Make sure the MCP server is started before running the agent.")
-
-             # For localhost, provide specific instructions
-             error_str = str(e).lower()
-             if "localhost" in error_str or "127.0.0.1" in error_str:
-                 self.console.info("   Run 'hud dev' in another terminal to start the MCP server")
-
-             raise RuntimeError(msg) from e
-         raise
+         # Clear context reference
+         self.ctx = None


  def _format_error_result(error_message: str) -> MCPToolResult:
@@ -824,14 +517,39 @@ def text_to_blocks(text: str) -> list[types.ContentBlock]:
  def find_reward(result: MCPToolResult) -> float:
      """Find the reward in the result.

-     Agent accepts "reward", "grade", "score"
+     Agent accepts "reward", "grade", "score", or weighted subscores

      If not found, return 0.0
      """
      accept_keys = ["reward", "grade", "score"]
+
+     # Check for direct reward/grade/score keys
      for key in accept_keys:
          if isinstance(result.structuredContent, dict) and key in result.structuredContent:
              return result.structuredContent[key]
+
+     # Check for subscores and weights format
+     if (
+         isinstance(result.structuredContent, dict)
+         and "subscores" in result.structuredContent
+         and "weights" in result.structuredContent
+     ):
+         subscores = result.structuredContent["subscores"]
+         weights = result.structuredContent["weights"]
+         if isinstance(subscores, dict) and isinstance(weights, dict):
+             try:
+                 # Multiply each subscore by its corresponding weight and sum
+                 reward = sum(
+                     float(subscores[key]) * float(weights.get(key, 0.0))
+                     for key in subscores
+                     if key in weights
+                 )
+                 return reward
+             except (ValueError, TypeError) as e:
+                 logger.error("Failed to parse subscores/weights: %s", e)
+                 return 0.0
+
+     # Check for reward in JSON text content
      if isinstance(result.content, list):
          for content in result.content:
              if isinstance(content, types.TextContent):
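
A worked example of the new weighted-subscores branch: for the payload below, find_reward() returns 0.8 × 0.75 + 1.0 × 0.25 = 0.85. The same arithmetic as a standalone sketch (key names and values are illustrative):

structured = {
    "subscores": {"accuracy": 0.8, "formatting": 1.0},
    "weights": {"accuracy": 0.75, "formatting": 0.25},
}
reward = sum(
    float(structured["subscores"][k]) * float(structured["weights"].get(k, 0.0))
    for k in structured["subscores"]
    if k in structured["weights"]
)
assert abs(reward - 0.85) < 1e-9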
@@ -842,6 +560,8 @@ def find_reward(result: MCPToolResult) -> float:
                          return value
                  except json.JSONDecodeError:
                      pass
+
+     logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
      return 0.0
