hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
@@ -10,335 +10,235 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import mcp.types as types
 
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
-from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
 
-from hud.clients.base import AgentMCPClient
-from hud.datasets import Task
+from .types import BaseCreateParams
 
+if TYPE_CHECKING:
+    from hud.environment import Environment
+    from hud.eval.context import EvalContext
 
 
 logger = logging.getLogger(__name__)
 
-GLOBAL_SYSTEM_PROMPT = "You are an assistant that can use tools to help the user. You will be given a task and you will need to use the tools to complete the task."  # noqa: E501
-
 
 class MCPAgent(ABC):
     """
     Base class for MCP-enabled agents.
 
-    and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
-    - Messaging: system prompt handling, optional inclusion of setup output on
-      the first turn, and control over initial screenshots.
-    - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
-      automatic tracing (`auto_trace`).
+    Agents interact with MCP servers through an EvalContext:
+    - run(ctx): Main entry point - takes EvalContext from hud.eval()
+    - ctx.call_tool(): Used internally for all tool execution
+    - ctx.submit(): Called automatically with agent's final response
 
     Subclasses implement provider-specific formatting and response fetching
-    by overriding
+    by overriding: `get_system_messages`, `get_response`, `format_blocks`,
+    and `format_tool_results`.
     """
 
-    metadata: dict[str, Any] | None = None
+    metadata: ClassVar[dict[str, Any] | None] = None
     required_tools: ClassVar[list[str]] = []  # Tools that must be available
+    config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
 
-    def __init__(
-        # Filtering
-        allowed_tools: list[str] | None = None,
-        disallowed_tools: list[str] | None = None,
-        # Messages
-        system_prompt: str = GLOBAL_SYSTEM_PROMPT,
-        append_setup_output: bool = True,
-        initial_screenshot: bool = True,
-        # Misc
-        model_name: str = "mcp-agent",
-        response_agent: ResponseAgent | None = None,
-        auto_trace: bool = True,
-        verbose: bool = False,
-    ) -> None:
-        """
-        Initialize the base MCP agent.
-
-            response_agent: Optional automation that can respond to the model's
-                outputs to keep the loop going (e.g., auto-continue/stop).
-            auto_trace: If True, automatically creates a trace/span for runs.
-            verbose: If True, increases logging verbosity for developer UX.
-        """
+    def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
+        if params is None:
+            import warnings
 
+            warnings.warn(
+                f"Passing kwargs to {self.__class__.__name__}() is deprecated. "
+                f"Use {self.__class__.__name__}.create(...) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            CreateParams = type(
+                f"{self.config_cls.__name__}CreateParams",
+                (BaseCreateParams, self.config_cls),
+                {"__module__": self.config_cls.__module__},
+            )
+            params = CreateParams(**kwargs)
 
+        config_kwargs = {
+            k: getattr(params, k) for k in self.config_cls.model_fields if hasattr(params, k)
+        }
+        self.config = self.config_cls(**config_kwargs)
 
+        # v5: Store execution context (EvalContext/Environment) - agent uses ctx.call_tool()
+        self.ctx: EvalContext | Environment | None = params.ctx
 
+        self.model_name: str = getattr(params, "model_name", "MCPAgent")
+        self.model: str = getattr(params, "model", None) or "unknown"
+        self.auto_respond = params.auto_respond
 
-        self.allowed_tools = allowed_tools
-        self.disallowed_tools = disallowed_tools or []
+        self.console = HUDConsole(logger=logger)
 
-        self.lifecycle_tools = []
+        if params.verbose:
+            self.console.set_verbose(True)
 
-        self.system_prompt = system_prompt
-        self.append_setup_output = append_setup_output
-        self.initial_screenshot = initial_screenshot
+        self.system_prompt = self.config.system_prompt
 
-        self.response_tool_name = None
+        self._available_tools: list[types.Tool] | None = None
+        self._tool_map: dict[str, types.Tool] = {}
+        self._initialized: bool = False
 
+    @classmethod
+    def create(cls, **kwargs: Any) -> MCPAgent:
+        """
+        Factory method to create an agent with typed parameters.
+        """
+        CreateParams = type(
+            f"{cls.config_cls.__name__}CreateParams",
+            (BaseCreateParams, cls.config_cls),
+            {"__module__": cls.config_cls.__module__},
+        )
+        return cls(params=CreateParams(**kwargs))
 
+    async def _initialize_from_ctx(self, ctx: EvalContext) -> None:
+        """Initialize agent from EvalContext - discovers tools and sets up state.
 
+        This is the v5 initialization path. The agent uses ctx.call_tool() directly
+        for tool execution (no EnvironmentClient wrapper needed).
+        """
+        from hud.eval.context import EvalContext
 
-        from hud.clients import MCPClient
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
+        # Refresh tools from connections, then get filtered list for agent
+        await ctx.list_tools()
+        self._available_tools = ctx.as_tools()
+        self._tool_map = {t.name: t for t in self._available_tools}
 
+        # Validate required tools are present
+        available_tool_names = {t.name for t in self._available_tools}
+        missing_tools = [tool for tool in self.required_tools if tool not in available_tool_names]
+        if missing_tools:
             raise ValueError(
+                f"Required tools are missing: {missing_tools}. "
+                f"Available tools: {sorted(available_tool_names)}"
             )
 
+        self.console.info(
+            f"Agent initialized with {len(self._available_tools)} tools: "
+            f"{', '.join([t.name for t in self._available_tools])}"
+        )
 
-            await self.mcp_client.initialize()
-        except Exception as e:
-            self._handle_connection_error(e)
-
-        # If task is provided, add lifecycle tools
-        if isinstance(task, Task):
-            if task.agent_tools:
-                self.agent_tools = task.agent_tools
-            if task.setup_tool:
-                if isinstance(task.setup_tool, list):
-                    for tool in task.setup_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.setup_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.setup_tool.name)
-            if task.evaluate_tool:
-                if isinstance(task.evaluate_tool, list):
-                    for tool in task.evaluate_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.evaluate_tool.name)
-            if task.system_prompt:
-                self.system_prompt += "\n\n" + task.system_prompt
-
-        # Re-apply filtering with updated lifecycle tools
-        await self._filter_tools()
-
-    async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
-        """
-        Run the agent with the given prompt or task.
+        # Call hook for subclass-specific initialization (e.g., tool format conversion)
+        self._on_tools_ready()
 
-            prompt_or_task: Either a string prompt for simple execution or a Task object
-            max_steps: Maximum number of steps (-1 for infinite)
+        self._initialized = True
 
+    def _on_tools_ready(self) -> None:
+        """Hook called after tools are discovered and validated.
+
+        Subclasses can override this to perform provider-specific setup,
+        such as converting MCP tools to the provider's format.
+
+        Called by _initialize_from_ctx() after _available_tools is populated.
         """
-        from hud.datasets import Task
+        return  # Default no-op - subclasses override for provider-specific setup
 
+    async def run(
+        self,
+        ctx: EvalContext,
+        *,
+        max_steps: int = 10,
+    ) -> Trace:
+        """
+        Run the agent on the given evaluation context.
 
-        await self.initialize(prompt_or_task)
+        The agent uses ctx.prompt as the task and ctx.call_tool() for tool execution.
+        Automatically calls ctx.submit() with the final answer.
 
+        Args:
+            ctx: EvalContext from hud.eval() - contains prompt and tools
+            max_steps: Maximum number of agent steps (-1 for infinite)
+
+        Returns:
+            Trace with done, content, isError fields
+
+        Example:
+            ```python
+            async with hud.eval(task) as ctx:
+                agent = ClaudeAgent.create()
+                await agent.run(ctx)
+                # ctx.reward is set by the scenario's evaluate phase
+            ```
+        """
+        from hud.eval.context import EvalContext
 
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
+
+        if not ctx.prompt:
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
                 )
             else:
-                info={"error": str(e)},
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
                 )
-        finally:
-            # Cleanup auto-created resources
-            await self._cleanup()
 
-        Execute a task with setup and evaluate phases.
+        # Store context for tool calls
+        self.ctx = ctx
 
+        # Initialize tools from context
+        if not self._initialized:
+            await self._initialize_from_ctx(ctx)
 
-        Returns:
-            Trace with reward from evaluation
-        """
         try:
-            results = await self.call_tools(task.setup_tool)
-            if any(result.isError for result in results):
-                return Trace(
-                    reward=0.0,
-                    done=True,
-                    content=f"Setup tool failed: {results}",
-                    isError=True,
-                    task=task,
-                )
-
-            if self.append_setup_output and isinstance(results[0].content, list):
-                start_context.extend(results[0].content)
-            if not self.initial_screenshot:
-                start_context = await self._filter_messages(start_context, include_types=["text"])
-
-            # Execute the task (agent loop) - this returns a empty trace object with the final response  # noqa: E501
-            prompt_result = await self._run_context(start_context, max_steps=max_steps)
+            # Build initial context - optionally append setup tool output
+            # Check ctx first (task-level override), then fall back to agent config
+            append_setup = getattr(ctx, "append_setup_output", False) or getattr(
+                self.config, "append_setup_output", False
+            )
+            initial_prompt = ctx.prompt
+            if append_setup:
+                setup_output = getattr(ctx, "setup_output", None)
+                if setup_output:
+                    initial_prompt = f"{initial_prompt}\n\n{setup_output}"
 
-            # Create an error result but don't return yet - we still want to evaluate
-            prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True, task=task)
-            prompt_result.populate_from_context()
+            # Build initial blocks (text prompt + optional screenshot)
+            initial_blocks = text_to_blocks(initial_prompt)
 
-        if task.evaluate_tool is not None:
-            try:
-                results = await self.call_tools(task.evaluate_tool)
-
-                if any(result.isError for result in results):
-                    self.console.warning_log(f"Evaluate tool returned error: {results}")
-                    # Still extract what we can from the error response
-                    if prompt_result is None:
-                        prompt_result = Trace(
-                            reward=0.0,
-                            done=True,
-                            content="Task failed before evaluation",
-                            isError=True,
-                            task=task,
-                        )
-                    prompt_result.reward = 0.0  # Default to 0 on error
-                else:
-                    # Extract reward and content from evaluation
-                    if results:
-                        reward = find_reward(results[0])
-                        self.console.info_log(f"Eval: {reward:.4f} {task.evaluate_tool}")
-                        eval_content = find_content(results[0])
-
-                        # Update the prompt result with evaluation reward
-                        if prompt_result is None:
-                            prompt_result = Trace(
-                                reward=reward,
-                                done=True,
-                                content=eval_content or "",
-                                isError=False,
-                                task=task,
-                            )
-                        else:
-                            prompt_result.reward = reward
+            result = await self._run_context(initial_blocks, max_steps=max_steps)
 
-                        if prompt_result.content:
-                            prompt_result.content += "\n\n" + eval_content
-                        else:
-                            prompt_result.content = eval_content
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
 
-            if prompt_result is None:
-                prompt_result = Trace(
-                    reward=0.0,
-                    done=True,
-                    content=f"Evaluation failed: {e}",
-                    isError=True,
-                    task=task,
-                )
+            # Submit final answer to context (only if scenario is running)
+            if result.content and ctx.has_scenario:
+                await ctx.submit(result.content)
 
+            return result
 
+        except Exception as e:
+            logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
+            return Trace(
+                reward=0.0,
+                done=True,
+                content=f"Agent failed with error: {e}",
+                isError=True,
+                info={"error": str(e)},
+            )
+        finally:
+            # Cleanup auto-created resources
+            await self._cleanup()
 
     async def _run_context(
         self, context: list[types.ContentBlock], *, max_steps: int = 10
@@ -356,6 +256,8 @@
         final_response = None
         error = None
 
+        messages: list[Any] = []
+
         try:
             # Start with system messages
             messages = await self.get_system_messages()
@@ -380,19 +282,17 @@
 
                 # Check if we should stop
                 if response.done or not response.tool_calls:
-                    decision = "STOP"
+                    # Use auto_respond to decide whether to stop
+                    decision: Literal["STOP", "CONTINUE"] = "STOP"
+                    if self.auto_respond and response.content:
                         try:
+                            from hud.agents.misc import ResponseAgent
+
+                            response_agent = ResponseAgent()
+                            decision = await response_agent.determine_response(response.content)
                         except Exception as e:
-                            self.console.warning_log(f"
+                            self.console.warning_log(f"Auto-respond failed: {e}")
                     if decision == "STOP":
-                        # Try to submit response through lifecycle tool
-                        await self._maybe_submit_response(response, messages)
-
                         self.console.debug("Stopping execution")
                         final_response = response
                         break
@@ -403,11 +303,7 @@
 
                 # 2. Execute tools
                 tool_calls = response.tool_calls
-                for tool_call in tool_calls:
-                    self.console.info_log(f"{tool_call}")
                 tool_results = await self.call_tools(tool_calls)
-                for tool_result in tool_results:
-                    self.console.info_log(f"{tool_result}")
 
                 # 3. Format tool results and add to messages
                 tool_messages = await self.format_tool_results(tool_calls, tool_results)
@@ -449,8 +345,17 @@
             is_error = False
 
         # Ensure all parameters are the correct type
+        # Use ctx.reward if already set (e.g., from scenario evaluate), otherwise 0.0
+        # Note: For v4 tasks with evaluate_tool, reward is set in __aexit__ after this returns,
+        # so callers should prefer ctx.reward over Trace.reward for the final result.
+        reward = 0.0
+        if self.ctx is not None:
+            ctx_reward = getattr(self.ctx, "reward", None)
+            if ctx_reward is not None:
+                reward = ctx_reward
+
         trace_params = {
-            "reward":
+            "reward": reward,
             "done": True,
             "messages": messages,
             "content": final_response.content if final_response else error,
@@ -459,16 +364,13 @@
         }
         trace_result = Trace(**trace_params)
 
-        # Populate trace steps from current context
-        trace_result.populate_from_context()
-
        return trace_result
 
     async def call_tools(
         self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
     ) -> list[MCPToolResult]:
         """
+        Call tools through the bound EvalContext.
 
         Args:
             tool_call: MCPToolCall or list of MCPToolCall
@@ -482,20 +384,17 @@
         if isinstance(tool_call, MCPToolCall):
             tool_call = [tool_call]
 
+        if self.ctx is None:
+            raise ValueError("Agent not bound to context - call run(ctx) first")
 
         results: list[MCPToolResult] = []
         for tc in tool_call:
             try:
                 self.console.debug(f"Calling tool: {tc}")
+                result = await self.ctx.call_tool(tc)
+                results.append(MCPToolResult(content=result.content, isError=result.isError))
             except TimeoutError as e:
                 self.console.error_log(f"Tool execution timed out: {e}")
-                try:
-                    await self.mcp_client.shutdown()
-                except Exception as close_err:
-                    self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
                 raise
             except Exception as e:
                 self.console.error_log(f"Tool execution failed: {e}")
@@ -514,8 +413,6 @@
         """
         Get response from the model including any tool calls.
 
-        NOTE: Subclasses should decorate this method with:
-        @hud.instrument(span_type="agent", record_args=False, record_result=True)
 
         Args:
             messages: Current conversation messages
@@ -575,148 +472,13 @@
 
         return await self.format_blocks(blocks)
 
-    async def _filter_tools(self) -> None:
-        """Apply tool filtering based on allowed/disallowed lists."""
-        # Get all tools from client
-        if self.mcp_client is None:
-            raise ValueError("MCP client is not initialized")
-
-        all_tools = await self.mcp_client.list_tools()
-
-        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-        for tool in all_tools:
-            if "response" in tool.name or tool.name == "response":
-                self.console.debug(f"Found response tool: '{tool.name}'")
-                # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                if "_" in tool.name:
-                    server_name = tool.name.split("_", 1)[0]
-                    response_tools_by_server[server_name] = tool.name
-                else:
-                    response_tools_by_server["_default"] = tool.name
-
-        # Add response tool to lifecycle tools BEFORE filtering
-        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-            # Get server names in order from mcp_config
-            server_names = list(self.mcp_client.mcp_config.keys())
-            self.console.debug(f"Server names: {server_names}")
-
-            # Try to find response tool from last server first
-            response_tool_name = None
-            for server_name in reversed(server_names):
-                if server_name in response_tools_by_server:
-                    response_tool_name = response_tools_by_server[server_name]
-                    self.console.debug(
-                        f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                    )
-                    break
-
-            # Fallback to any response tool
-            if not response_tool_name and response_tools_by_server:
-                response_tool_name = next(iter(response_tools_by_server.values()))
-                self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-
-            # Add to lifecycle tools if found
-            if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                self.response_tool_name = response_tool_name
-                self.lifecycle_tools.append(response_tool_name)
-            elif response_tool_name:
-                self.console.debug(
-                    f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                )
-                self.response_tool_name = response_tool_name
-        else:
-            self.console.debug("No response tools found or no mcp_config")
-
-        # Filter tools
-        self._available_tools = []
-        self._tool_map = {}
-
-        self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-        self.console.debug(f"Allowed tools: {self.allowed_tools}")
-        self.console.debug(f"Agent tools: {self.agent_tools}")
-        self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-        self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-
-        for tool in all_tools:
-            # Lifecycle tools (setup, evaluate, response) should always be included
-            is_lifecycle = tool.name in self.lifecycle_tools
-
-            # Check if tool should be included
-            if not is_lifecycle:
-                if self.allowed_tools and tool.name not in self.allowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                    continue
-                if self.agent_tools and tool.name not in self.agent_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                    continue
-                if tool.name in self.disallowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                    continue
-
-            self.console.debug(
-                f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-            )
-            self._available_tools.append(tool)
-            self._tool_map[tool.name] = tool
-
-        # Check if all required tools are available
-        if self.required_tools:
-            available_tool_names = {tool.name for tool in self._available_tools}
-            missing_tools = [
-                tool for tool in self.required_tools if tool not in available_tool_names
-            ]
-            if missing_tools:
-                raise ValueError(
-                    f"Required tools not available: {missing_tools}. "
-                    f"Available tools: {list(available_tool_names)}"
-                )
-
-        available_tools = self.get_available_tools()
-        self.console.info(
-            f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-        )
-
-    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
-        """Submit response through lifecycle tool if available.
-
-        Args:
-            response: The agent's response
-            messages: The current message history (will be modified in-place)
-        """
-        if self.response_tool_name:
-            self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
-            try:
-                # Call the response tool with the agent's response
-                response_tool_call = MCPToolCall(
-                    name=self.response_tool_name, arguments={"response": response.content}
-                )
-                response_results = await self.call_tools(response_tool_call)
-
-                # Format and add the response tool results to messages
-                response_messages = await self.format_tool_results(
-                    [response_tool_call], response_results
-                )
-                messages.extend(response_messages)
-
-                # Mark the task as done
-                self.console.debug("Response lifecycle tool executed, marking task as done")
-            except Exception as e:
-                self.console.error_log(f"Response lifecycle tool failed: {e}")
-
-    async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
-        """Inject metadata into the metadata of the initialize request."""
-        if self.metadata:
-            patch_mcp_config(
-                mcp_config,
-                MCPConfigPatch(meta=self.metadata),
-            )
-        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
-
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
+        if self._available_tools is None:
+            raise RuntimeError(
+                "Tools have not been initialized. Call initialize() before accessing available tools."  # noqa: E501
+            )
+        return self._available_tools
 
     def get_tool_schemas(self) -> list[dict]:
         """Get tool schemas in a format suitable for the model."""
@@ -752,65 +514,8 @@
 
     async def _cleanup(self) -> None:
         """Cleanup resources."""
-        try:
-            self._auto_trace_cm.__exit__(None, None, None)
-            self.console.debug("Closed auto-created trace")
-        except Exception as e:
-            self.console.warning_log(f"Failed to close auto-created trace: {e}")
-        finally:
-            self._auto_trace_cm = None
-
-        # Clean up auto-created client
-        if self._auto_created_client and self.mcp_client:
-            try:
-                await self.mcp_client.shutdown()
-                self.console.debug("Closed auto-created MCPClient")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created client: {e}")
-            finally:
-                self.mcp_client = None
-                self._auto_created_client = False
-
-    def _is_connection_error(self, e: Exception) -> bool:
-        """Check if an exception is a connection error."""
-        error_msg = str(e).lower()
-        return any(
-            pattern in error_msg
-            for pattern in [
-                "connection",
-                "connect",
-                "refused",
-                "failed",
-                "could not connect",
-                "mcp server",
-            ]
-        )
-
-    def _get_connection_error_message(self, e: Exception) -> str:
-        """Extract a helpful connection error message."""
-        import re
-
-        url_match = re.search(r"https?://[^\s]+", str(e))
-        url = url_match.group(0) if url_match else "the MCP server"
-        return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
-
-    def _handle_connection_error(self, e: Exception) -> None:
-        """Handle connection errors with helpful messages."""
-        if self._is_connection_error(e):
-            msg = self._get_connection_error_message(e)
-            # Always show connection errors, not just when logging is enabled
-            self.console.error(f"❌ {msg}")
-            self.console.info("💡 Make sure the MCP server is started before running the agent.")
-
-            # For localhost, provide specific instructions
-            error_str = str(e).lower()
-            if "localhost" in error_str or "127.0.0.1" in error_str:
-                self.console.info("   Run 'hud dev' in another terminal to start the MCP server")
-
-            raise RuntimeError(msg) from e
-        raise
+        # Clear context reference
+        self.ctx = None
 
 
 def _format_error_result(error_message: str) -> MCPToolResult:
@@ -824,14 +529,45 @@ def text_to_blocks(text: str) -> list[types.ContentBlock]:
 def find_reward(result: MCPToolResult) -> float:
     """Find the reward in the result.
 
-    Agent accepts "reward", "grade", "score"
+    Agent accepts "reward", "grade", "score", or weighted subscores
 
+    If isError is True, return 0.0 (error results should not contribute positive reward).
     If not found, return 0.0
     """
+    # Error results should return 0.0 - don't extract reward from error responses
+    if result.isError:
+        logger.warning("Evaluate tool returned error, using reward=0.0")
+        return 0.0
+
     accept_keys = ["reward", "grade", "score"]
+
+    # Check for direct reward/grade/score keys
     for key in accept_keys:
        if isinstance(result.structuredContent, dict) and key in result.structuredContent:
            return result.structuredContent[key]
+
+    # Check for subscores and weights format
+    if (
+        isinstance(result.structuredContent, dict)
+        and "subscores" in result.structuredContent
+        and "weights" in result.structuredContent
+    ):
+        subscores = result.structuredContent["subscores"]
+        weights = result.structuredContent["weights"]
+        if isinstance(subscores, dict) and isinstance(weights, dict):
+            try:
+                # Multiply each subscore by its corresponding weight and sum
+                reward = sum(
+                    float(subscores[key]) * float(weights.get(key, 0.0))
+                    for key in subscores
+                    if key in weights
+                )
+                return reward
+            except (ValueError, TypeError) as e:
+                logger.error("Failed to parse subscores/weights: %s", e)
+                return 0.0
+
+    # Check for reward in JSON text content
     if isinstance(result.content, list):
        for content in result.content:
            if isinstance(content, types.TextContent):
@@ -842,6 +578,8 @@ def find_reward(result: MCPToolResult) -> float:
                    return value
                except json.JSONDecodeError:
                    pass
+
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
    return 0.0
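Taken together, the first hunk replaces the v4 `__init__(...)`/`run(prompt_or_task)` flow with a typed `create()` factory and a `run(ctx)` entry point bound to an `EvalContext`. A minimal sketch of the new call pattern, following the docstring example embedded in the diff; the `ClaudeAgent` import path, the `verbose` option, and passing a task object straight to `hud.eval()` are assumptions not confirmed by this diff:

```python
import hud
from hud.agents import ClaudeAgent  # assumed export path


async def evaluate_task(task) -> float:
    # hud.eval() yields an EvalContext that carries the prompt, tools, and reward.
    async with hud.eval(task) as ctx:
        # create() is the typed factory; passing kwargs to __init__ is deprecated.
        agent = ClaudeAgent.create(verbose=True)  # 'verbose' assumed to be a config field
        trace = await agent.run(ctx, max_steps=20)
        if trace.isError:
            print(f"Agent error: {trace.content}")
    # The scenario's evaluate phase populates ctx.reward on exit; prefer it over trace.reward.
    return ctx.reward or 0.0
```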
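The auto-respond change in the loop (hunk `@@ -380,19 +282,17 @@`) defaults to stopping and only continues when a `ResponseAgent` says otherwise. A stand-alone sketch of that decision, assuming `determine_response()` returns the literal strings "STOP" or "CONTINUE" as the surrounding code implies; the helper function name is illustrative:

```python
from typing import Literal

from hud.agents.misc import ResponseAgent


async def should_stop(last_message: str) -> bool:
    # Mirrors the agent loop: default to STOP unless the ResponseAgent asks to continue.
    decision: Literal["STOP", "CONTINUE"] = "STOP"
    if last_message:
        try:
            decision = await ResponseAgent().determine_response(last_message)
        except Exception:
            pass  # on failure the agent loop also falls back to STOP
    return decision == "STOP"
```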
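With the MCP client removed from the agent, `call_tools()` (hunk `@@ -482,20 +384,17 @@`) now routes every call through the bound context. A hedged sketch of a single call from outside the loop, assuming the agent was already bound via `run(ctx)`; the tool name and arguments are placeholders:

```python
from hud.types import MCPToolCall


async def take_screenshot(agent) -> str:
    # call_tools() accepts one MCPToolCall or a list and delegates to agent.ctx.call_tool();
    # it raises ValueError if the agent has not been bound to a context via run(ctx).
    call = MCPToolCall(name="computer", arguments={"action": "screenshot"})  # placeholder tool
    (result,) = await agent.call_tools(call)
    if result.isError:
        raise RuntimeError(f"Tool call failed: {result.content}")
    return str(result.content)
```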
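The new subscores/weights branch in `find_reward()` computes a weighted sum over the keys present in both dictionaries. A small worked example of that arithmetic, with made-up keys and values:

```python
# Weighted-subscore branch of find_reward(): reward = sum(subscores[k] * weights[k])
# over the keys present in both dicts; keys without a weight are skipped.
subscores = {"accuracy": 0.8, "completeness": 0.5, "latency": 1.0}
weights = {"accuracy": 0.6, "completeness": 0.4}  # "latency" has no weight, so it is ignored

reward = sum(
    float(subscores[key]) * float(weights.get(key, 0.0))
    for key in subscores
    if key in weights
)
assert abs(reward - 0.68) < 1e-9  # 0.8 * 0.6 + 0.5 * 0.4
```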