hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
@@ -9,336 +9,233 @@ from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, ClassVar, Literal

 import mcp.types as types
+from pydantic import BaseModel, ConfigDict

-from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
-from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry

 if TYPE_CHECKING:
-    from hud.
-    from hud.
-
-    from .misc import ResponseAgent
+    from hud.environment import Environment
+    from hud.eval.context import EvalContext


 logger = logging.getLogger(__name__)

-
+
+class BaseCreateParams(BaseModel):
+    """Runtime parameters for agent creation."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # Primary way to bind agent to execution context (v5)
+    ctx: Any | None = None  # EvalContext or Environment - agent uses this for tool calls
+
+    auto_respond: bool = False
+    verbose: bool = False


 class MCPAgent(ABC):
     """
     Base class for MCP-enabled agents.

-
-    -
-
-    -
-      and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
-    - Messaging: system prompt handling, optional inclusion of setup output on
-      the first turn, and control over initial screenshots.
-    - Telemetry & UX: standardized logging/printing via `HUDConsole` and optional
-      automatic tracing (`auto_trace`).
+    Agents interact with MCP servers through an EvalContext:
+    - run(ctx): Main entry point - takes EvalContext from hud.eval()
+    - ctx.call_tool(): Used internally for all tool execution
+    - ctx.submit(): Called automatically with agent's final response

     Subclasses implement provider-specific formatting and response fetching
-    by overriding
-
+    by overriding: `get_system_messages`, `get_response`, `format_blocks`,
+    and `format_tool_results`.
     """

-    metadata: dict[str, Any] | None = None
+    metadata: ClassVar[dict[str, Any] | None] = None
     required_tools: ClassVar[list[str]] = []  # Tools that must be available
+    config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig

-    def __init__(
-
-
-        # Filtering
-        allowed_tools: list[str] | None = None,
-        disallowed_tools: list[str] | None = None,
-        # Messages
-        system_prompt: str = GLOBAL_SYSTEM_PROMPT,
-        append_setup_output: bool = True,
-        initial_screenshot: bool = True,
-        # Misc
-        model_name: str = "mcp-agent",
-        response_agent: ResponseAgent | None = None,
-        auto_trace: bool = True,
-        verbose: bool = False,
-    ) -> None:
-        """
-        Initialize the base MCP agent.
+    def __init__(self, params: BaseCreateParams | None = None, **kwargs: Any) -> None:
+        if params is None:
+            import warnings

-
-
-
-
-
-
-
-
-
-
-
-
-            response_agent: Optional automation that can respond to the model's
-                outputs to keep the loop going (e.g., auto-continue/stop).
-            auto_trace: If True, automatically creates a trace/span for runs.
-            verbose: If True, increases logging verbosity for developer UX.
-        """
+            warnings.warn(
+                f"Passing kwargs to {self.__class__.__name__}() is deprecated. "
+                f"Use {self.__class__.__name__}.create(...) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            CreateParams = type(
+                f"{self.config_cls.__name__}CreateParams",
+                (BaseCreateParams, self.config_cls),
+                {"__module__": self.config_cls.__module__},
+            )
+            params = CreateParams(**kwargs)

-
-
+        config_kwargs = {
+            k: getattr(params, k) for k in self.config_cls.model_fields if hasattr(params, k)
+        }
+        self.config = self.config_cls(**config_kwargs)

-
-        self.
+        # v5: Store execution context (EvalContext/Environment) - agent uses ctx.call_tool()
+        self.ctx: EvalContext | Environment | None = params.ctx

-
-
-
+        self.model_name: str = getattr(params, "model_name", "MCPAgent")
+        self.model: str = getattr(params, "model", None) or "unknown"
+        self.auto_respond = params.auto_respond

-
-        self.allowed_tools = allowed_tools
-        self.disallowed_tools = disallowed_tools or []
+        self.console = HUDConsole(logger=logger)

-
-
-        self.lifecycle_tools = []
+        if params.verbose:
+            self.console.set_verbose(True)

-
-        self.system_prompt = system_prompt
-        self.append_setup_output = append_setup_output
-        self.initial_screenshot = initial_screenshot
+        self.system_prompt = self.config.system_prompt

-
-        self.
-        self.
-        self.response_tool_name = None
+        self._available_tools: list[types.Tool] | None = None
+        self._tool_map: dict[str, types.Tool] = {}
+        self._initialized: bool = False

-
-
-
+    @classmethod
+    def create(cls, **kwargs: Any) -> MCPAgent:
+        """
+        Factory method to create an agent with typed parameters.
+        """
+        CreateParams = type(
+            f"{cls.config_cls.__name__}CreateParams",
+            (BaseCreateParams, cls.config_cls),
+            {"__module__": cls.config_cls.__module__},
+        )
+        return cls(params=CreateParams(**kwargs))

-
-
+    async def _initialize_from_ctx(self, ctx: EvalContext) -> None:
+        """Initialize agent from EvalContext - discovers tools and sets up state.

-
-
-
+        This is the v5 initialization path. The agent uses ctx.call_tool() directly
+        for tool execution (no EnvironmentClient wrapper needed).
+        """
+        from hud.eval.context import EvalContext

-
-
-        from hud.clients import MCPClient
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

-
-
-
+        # Refresh tools from connections, then get filtered list for agent
+        await ctx.list_tools()
+        self._available_tools = ctx.as_tools()
+        self._tool_map = {t.name: t for t in self._available_tools}

-        #
-
+        # Validate required tools are present
+        available_tool_names = {t.name for t in self._available_tools}
+        missing_tools = [tool for tool in self.required_tools if tool not in available_tool_names]
+        if missing_tools:
             raise ValueError(
-                "
+                f"Required tools are missing: {missing_tools}. "
+                f"Available tools: {sorted(available_tool_names)}"
             )

-
+        self.console.info(
+            f"Agent initialized with {len(self._available_tools)} tools: "
+            f"{', '.join([t.name for t in self._available_tools])}"
+        )

-        #
-
-            await self.mcp_client.initialize()
-        except Exception as e:
-            self._handle_connection_error(e)
-
-        # If task is provided, add lifecycle tools
-        if isinstance(task, Task):
-            if task.agent_tools:
-                self.agent_tools = task.agent_tools
-            if task.setup_tool:
-                if isinstance(task.setup_tool, list):
-                    for tool in task.setup_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.setup_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.setup_tool.name)
-            if task.evaluate_tool:
-                if isinstance(task.evaluate_tool, list):
-                    for tool in task.evaluate_tool:
-                        if not self.agent_tools or (
-                            self.agent_tools and tool.name not in self.agent_tools
-                        ):
-                            self.lifecycle_tools.append(tool.name)
-                elif not self.agent_tools or (
-                    self.agent_tools and task.evaluate_tool.name not in self.agent_tools
-                ):
-                    self.lifecycle_tools.append(task.evaluate_tool.name)
-            if task.system_prompt:
-                self.system_prompt += "\n\n" + task.system_prompt
-
-        # Re-apply filtering with updated lifecycle tools
-        await self._filter_tools()
-
-    async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
-        """
-        Run the agent with the given prompt or task.
+        # Call hook for subclass-specific initialization (e.g., tool format conversion)
+        self._on_tools_ready()

-
-            prompt_or_task: Either a string prompt for simple execution or a Task object
-            max_steps: Maximum number of steps (-1 for infinite)
+        self._initialized = True

-
-
+    def _on_tools_ready(self) -> None:
+        """Hook called after tools are discovered and validated.
+
+        Subclasses can override this to perform provider-specific setup,
+        such as converting MCP tools to the provider's format.
+
+        Called by _initialize_from_ctx() after _available_tools is populated.
         """
-        #
-        from hud.datasets import Task
+        return  # Default no-op - subclasses override for provider-specific setup

-
-
-
-
+    async def run(
+        self,
+        ctx: EvalContext,
+        *,
+        max_steps: int = 10,
+    ) -> Trace:
+        """
+        Run the agent on the given evaluation context.

-
-
-        await self.initialize(prompt_or_task)
+        The agent uses ctx.prompt as the task and ctx.call_tool() for tool execution.
+        Automatically calls ctx.submit() with the final answer.

-
-
-
+        Args:
+            ctx: EvalContext from hud.eval() - contains prompt and tools
+            max_steps: Maximum number of agent steps (-1 for infinite)
+
+        Returns:
+            Trace with done, content, isError fields
+
+        Example:
+            ```python
+            async with hud.eval(task) as ctx:
+                agent = ClaudeAgent.create()
+                await agent.run(ctx)
+                # ctx.reward is set by the scenario's evaluate phase
+            ```
+        """
+        from hud.eval.context import EvalContext

-
-
-        context = text_to_blocks(prompt_or_task)
-        return await self._run_context(context, max_steps=max_steps)
+        if not isinstance(ctx, EvalContext):
+            raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

-
-
-
-        #
-
-
-
-
-
+        if not ctx.prompt:
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
                 )
             else:
-                #
-
-
-
-
-
-                    info={"error": str(e)},
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
                 )
-        finally:
-            # Cleanup auto-created resources
-            await self._cleanup()

-
-
-        Execute a task with setup and evaluate phases.
+        # Store context for tool calls
+        self.ctx = ctx

-
-
-
+        # Initialize tools from context
+        if not self._initialized:
+            await self._initialize_from_ctx(ctx)

-        Returns:
-            Trace with reward from evaluation
-        """
         try:
-
-            start_context: list[types.ContentBlock] = []
-
-            # Extract the initial task information
-            if task.prompt:
-                start_context.extend(text_to_blocks(task.prompt))
-
-            # Execute the setup tool and append the initial observation to the context
-            if task.setup_tool is not None:
-                self.console.progress_log(f"Setting up tool phase: {task.setup_tool}")
-                results = await self.call_tools(task.setup_tool)
-                if any(result.isError for result in results):
-                    return Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Setup tool failed: {results}",
-                        isError=True,
-                        task=task,
-                    )
-
-                if self.append_setup_output and isinstance(results[0].content, list):
-                    start_context.extend(results[0].content)
-                if not self.initial_screenshot:
-                    start_context = await self._filter_messages(start_context, include_types=["text"])
-
-            # Execute the task (agent loop) - this returns a empty trace object with the final response  # noqa: E501
-            prompt_result = await self._run_context(start_context, max_steps=max_steps)
-
-        except Exception as e:
-            self.console.error_log(f"Task execution failed: {e}")
-            # Create an error result but don't return yet - we still want to evaluate
-            prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True, task=task)
-            prompt_result.populate_from_context()
-
-        # Always evaluate if we have evaluate tool, regardless of errors
-        if task.evaluate_tool is not None:
-            try:
-                results = await self.call_tools(task.evaluate_tool)
-
-                if any(result.isError for result in results):
-                    self.console.warning_log(f"Evaluate tool returned error: {results}")
-                    # Still extract what we can from the error response
-                    if prompt_result is None:
-                        prompt_result = Trace(
-                            reward=0.0,
-                            done=True,
-                            content="Task failed before evaluation",
-                            isError=True,
-                            task=task,
-                        )
-                    prompt_result.reward = 0.0  # Default to 0 on error
-                else:
-                    # Extract reward and content from evaluation
-                    if results:
-                        reward = find_reward(results[0])
-                        self.console.info_log(f"Eval: {reward:.4f} {task.evaluate_tool}")
-                        eval_content = find_content(results[0])
-
-                        # Update the prompt result with evaluation reward
-                        if prompt_result is None:
-                            prompt_result = Trace(
-                                reward=reward,
-                                done=True,
-                                content=eval_content or "",
-                                isError=False,
-                                task=task,
-                            )
-                        else:
-                            prompt_result.reward = reward
+            result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)

-
-
-
-
-                        if prompt_result.content:
-                            prompt_result.content += "\n\n" + eval_content
-                        else:
-                            prompt_result.content = eval_content
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")

-
-
-
-                if prompt_result is None:
-                    prompt_result = Trace(
-                        reward=0.0,
-                        done=True,
-                        content=f"Evaluation failed: {e}",
-                        isError=True,
-                        task=task,
-                    )
+            # Submit final answer to context (only if scenario is running)
+            if result.content and ctx.has_scenario:
+                await ctx.submit(result.content)

-
+            return result

-
+        except Exception as e:
+            logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
+            return Trace(
+                reward=0.0,
+                done=True,
+                content=f"Agent failed with error: {e}",
+                isError=True,
+                info={"error": str(e)},
+            )
+        finally:
+            # Cleanup auto-created resources
+            await self._cleanup()

     async def _run_context(
         self, context: list[types.ContentBlock], *, max_steps: int = 10
@@ -356,6 +253,8 @@ class MCPAgent(ABC):
         final_response = None
         error = None

+        messages: list[Any] = []
+
         try:
             # Start with system messages
             messages = await self.get_system_messages()
@@ -380,19 +279,17 @@ class MCPAgent(ABC):

                # Check if we should stop
                if response.done or not response.tool_calls:
-                    #
-                    decision = "STOP"
-                    if self.
+                    # Use auto_respond to decide whether to stop
+                    decision: Literal["STOP", "CONTINUE"] = "STOP"
+                    if self.auto_respond and response.content:
                        try:
-
-
-                            )
+                            from hud.agents.misc import ResponseAgent
+
+                            response_agent = ResponseAgent()
+                            decision = await response_agent.determine_response(response.content)
                        except Exception as e:
-                            self.console.warning_log(f"
+                            self.console.warning_log(f"Auto-respond failed: {e}")
                    if decision == "STOP":
-                        # Try to submit response through lifecycle tool
-                        await self._maybe_submit_response(response, messages)
-
                        self.console.debug("Stopping execution")
                        final_response = response
                        break
@@ -403,11 +300,7 @@ class MCPAgent(ABC):

                # 2. Execute tools
                tool_calls = response.tool_calls
-                for tool_call in tool_calls:
-                    self.console.info_log(f"{tool_call}")
                tool_results = await self.call_tools(tool_calls)
-                for tool_result in tool_results:
-                    self.console.info_log(f"{tool_result}")

                # 3. Format tool results and add to messages
                tool_messages = await self.format_tool_results(tool_calls, tool_results)
@@ -459,16 +352,13 @@ class MCPAgent(ABC):
         }
         trace_result = Trace(**trace_params)

-        # Populate trace steps from current context
-        trace_result.populate_from_context()
-
         return trace_result

     async def call_tools(
         self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
     ) -> list[MCPToolResult]:
         """
-        Call
+        Call tools through the bound EvalContext.

         Args:
             tool_call: MCPToolCall or list of MCPToolCall
@@ -482,20 +372,17 @@ class MCPAgent(ABC):
         if isinstance(tool_call, MCPToolCall):
             tool_call = [tool_call]

-        if self.
-            raise ValueError("
+        if self.ctx is None:
+            raise ValueError("Agent not bound to context - call run(ctx) first")

         results: list[MCPToolResult] = []
         for tc in tool_call:
             try:
                 self.console.debug(f"Calling tool: {tc}")
-
+                result = await self.ctx.call_tool(tc)
+                results.append(MCPToolResult(content=result.content, isError=result.isError))
             except TimeoutError as e:
                 self.console.error_log(f"Tool execution timed out: {e}")
-                try:
-                    await self.mcp_client.shutdown()
-                except Exception as close_err:
-                    self.console.debug(f"Failed to close MCP client cleanly: {close_err}")
                 raise
             except Exception as e:
                 self.console.error_log(f"Tool execution failed: {e}")
@@ -514,8 +401,6 @@ class MCPAgent(ABC):
         """
         Get response from the model including any tool calls.

-        NOTE: Subclasses should decorate this method with:
-        @hud.instrument(span_type="agent", record_args=False, record_result=True)

         Args:
             messages: Current conversation messages
@@ -575,148 +460,13 @@ class MCPAgent(ABC):

         return await self.format_blocks(blocks)

-    async def _filter_tools(self) -> None:
-        """Apply tool filtering based on allowed/disallowed lists."""
-        # Get all tools from client
-        if self.mcp_client is None:
-            raise ValueError("MCP client is not initialized")
-
-        all_tools = await self.mcp_client.list_tools()
-
-        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
-        for tool in all_tools:
-            if "response" in tool.name or tool.name == "response":
-                self.console.debug(f"Found response tool: '{tool.name}'")
-                # Extract server name from tool name (e.g., "grader_response" -> "grader")
-                if "_" in tool.name:
-                    server_name = tool.name.split("_", 1)[0]
-                    response_tools_by_server[server_name] = tool.name
-                else:
-                    response_tools_by_server["_default"] = tool.name
-
-        # Add response tool to lifecycle tools BEFORE filtering
-        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
-            # Get server names in order from mcp_config
-            server_names = list(self.mcp_client.mcp_config.keys())
-            self.console.debug(f"Server names: {server_names}")
-
-            # Try to find response tool from last server first
-            response_tool_name = None
-            for server_name in reversed(server_names):
-                if server_name in response_tools_by_server:
-                    response_tool_name = response_tools_by_server[server_name]
-                    self.console.debug(
-                        f"Found response tool '{response_tool_name}' from server '{server_name}'"
-                    )
-                    break
-
-            # Fallback to any response tool
-            if not response_tool_name and response_tools_by_server:
-                response_tool_name = next(iter(response_tools_by_server.values()))
-                self.console.debug(f"Using fallback response tool '{response_tool_name}'")
-
-            # Add to lifecycle tools if found
-            if response_tool_name and response_tool_name not in self.lifecycle_tools:
-                self.console.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
-                self.response_tool_name = response_tool_name
-                self.lifecycle_tools.append(response_tool_name)
-            elif response_tool_name:
-                self.console.debug(
-                    f"Response tool '{response_tool_name}' already in lifecycle_tools"
-                )
-                self.response_tool_name = response_tool_name
-        else:
-            self.console.debug("No response tools found or no mcp_config")
-
-        # Filter tools
-        self._available_tools = []
-        self._tool_map = {}
-
-        self.console.debug(f"All tools: {[t.name for t in all_tools]}")
-        self.console.debug(f"Allowed tools: {self.allowed_tools}")
-        self.console.debug(f"Agent tools: {self.agent_tools}")
-        self.console.debug(f"Disallowed tools: {self.disallowed_tools}")
-        self.console.debug(f"Lifecycle tools: {self.lifecycle_tools}")
-
-        for tool in all_tools:
-            # Lifecycle tools (setup, evaluate, response) should always be included
-            is_lifecycle = tool.name in self.lifecycle_tools
-
-            # Check if tool should be included
-            if not is_lifecycle:
-                if self.allowed_tools and tool.name not in self.allowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in allowed_tools")
-                    continue
-                if self.agent_tools and tool.name not in self.agent_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - not in agent_tools")
-                    continue
-                if tool.name in self.disallowed_tools:
-                    self.console.debug(f"Skipping tool '{tool.name}' - in disallowed_tools")
-                    continue
-
-            self.console.debug(
-                f"Adding tool '{tool.name}' to available tools (lifecycle={is_lifecycle})"
-            )
-            self._available_tools.append(tool)
-            self._tool_map[tool.name] = tool
-
-        # Check if all required tools are available
-        if self.required_tools:
-            available_tool_names = {tool.name for tool in self._available_tools}
-            missing_tools = [
-                tool for tool in self.required_tools if tool not in available_tool_names
-            ]
-            if missing_tools:
-                raise ValueError(
-                    f"Required tools not available: {missing_tools}. "
-                    f"Available tools: {list(available_tool_names)}"
-                )
-
-        available_tools = self.get_available_tools()
-        self.console.info(
-            f"Agent initialized with {len(available_tools)} tools: {', '.join([t.name for t in available_tools])}"  # noqa: E501
-        )
-
-    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
-        """Submit response through lifecycle tool if available.
-
-        Args:
-            response: The agent's response
-            messages: The current message history (will be modified in-place)
-        """
-        if self.response_tool_name:
-            self.console.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
-            try:
-                # Call the response tool with the agent's response
-                response_tool_call = MCPToolCall(
-                    name=self.response_tool_name, arguments={"response": response.content}
-                )
-                response_results = await self.call_tools(response_tool_call)
-
-                # Format and add the response tool results to messages
-                response_messages = await self.format_tool_results(
-                    [response_tool_call], response_results
-                )
-                messages.extend(response_messages)
-
-                # Mark the task as done
-                self.console.debug("Response lifecycle tool executed, marking task as done")
-            except Exception as e:
-                self.console.error_log(f"Response lifecycle tool failed: {e}")
-
-    async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
-        """Inject metadata into the metadata of the initialize request."""
-        if self.metadata:
-            patch_mcp_config(
-                mcp_config,
-                MCPConfigPatch(meta=self.metadata),
-            )
-        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
-
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
-
-
+        if self._available_tools is None:
+            raise RuntimeError(
+                "Tools have not been initialized. Call initialize() before accessing available tools."  # noqa: E501
+            )
+        return self._available_tools

     def get_tool_schemas(self) -> list[dict]:
         """Get tool schemas in a format suitable for the model."""
@@ -752,65 +502,8 @@ class MCPAgent(ABC):

     async def _cleanup(self) -> None:
         """Cleanup resources."""
-        #
-
-        try:
-            self._auto_trace_cm.__exit__(None, None, None)
-            self.console.debug("Closed auto-created trace")
-        except Exception as e:
-            self.console.warning_log(f"Failed to close auto-created trace: {e}")
-        finally:
-            self._auto_trace_cm = None
-
-        # Clean up auto-created client
-        if self._auto_created_client and self.mcp_client:
-            try:
-                await self.mcp_client.shutdown()
-                self.console.debug("Closed auto-created MCPClient")
-            except Exception as e:
-                self.console.warning_log(f"Failed to close auto-created client: {e}")
-            finally:
-                self.mcp_client = None
-                self._auto_created_client = False
-
-    def _is_connection_error(self, e: Exception) -> bool:
-        """Check if an exception is a connection error."""
-        error_msg = str(e).lower()
-        return any(
-            pattern in error_msg
-            for pattern in [
-                "connection",
-                "connect",
-                "refused",
-                "failed",
-                "could not connect",
-                "mcp server",
-            ]
-        )
-
-    def _get_connection_error_message(self, e: Exception) -> str:
-        """Extract a helpful connection error message."""
-        import re
-
-        url_match = re.search(r"https?://[^\s]+", str(e))
-        url = url_match.group(0) if url_match else "the MCP server"
-        return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
-
-    def _handle_connection_error(self, e: Exception) -> None:
-        """Handle connection errors with helpful messages."""
-        if self._is_connection_error(e):
-            msg = self._get_connection_error_message(e)
-            # Always show connection errors, not just when logging is enabled
-            self.console.error(f"❌ {msg}")
-            self.console.info("💡 Make sure the MCP server is started before running the agent.")
-
-            # For localhost, provide specific instructions
-            error_str = str(e).lower()
-            if "localhost" in error_str or "127.0.0.1" in error_str:
-                self.console.info("  Run 'hud dev' in another terminal to start the MCP server")
-
-            raise RuntimeError(msg) from e
-        raise
+        # Clear context reference
+        self.ctx = None


 def _format_error_result(error_message: str) -> MCPToolResult:
@@ -824,14 +517,39 @@ def text_to_blocks(text: str) -> list[types.ContentBlock]:
 def find_reward(result: MCPToolResult) -> float:
     """Find the reward in the result.

-    Agent accepts "reward", "grade", "score"
+    Agent accepts "reward", "grade", "score", or weighted subscores

     If not found, return 0.0
     """
     accept_keys = ["reward", "grade", "score"]
+
+    # Check for direct reward/grade/score keys
     for key in accept_keys:
         if isinstance(result.structuredContent, dict) and key in result.structuredContent:
             return result.structuredContent[key]
+
+    # Check for subscores and weights format
+    if (
+        isinstance(result.structuredContent, dict)
+        and "subscores" in result.structuredContent
+        and "weights" in result.structuredContent
+    ):
+        subscores = result.structuredContent["subscores"]
+        weights = result.structuredContent["weights"]
+        if isinstance(subscores, dict) and isinstance(weights, dict):
+            try:
+                # Multiply each subscore by its corresponding weight and sum
+                reward = sum(
+                    float(subscores[key]) * float(weights.get(key, 0.0))
+                    for key in subscores
+                    if key in weights
+                )
+                return reward
+            except (ValueError, TypeError) as e:
+                logger.error("Failed to parse subscores/weights: %s", e)
+                return 0.0
+
+    # Check for reward in JSON text content
     if isinstance(result.content, list):
         for content in result.content:
             if isinstance(content, types.TextContent):
@@ -842,6 +560,8 @@ def find_reward(result: MCPToolResult) -> float:
                     return value
                 except json.JSONDecodeError:
                     pass
+
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0


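The net effect of this diff is that `MCPAgent` no longer owns an MCP client, task setup/evaluate phases, or a response lifecycle tool; it is bound to an `EvalContext` and drives everything through `ctx.call_tool()` and `ctx.submit()`. A minimal usage sketch of the 0.5 flow, based on the docstring example shown in the diff above; the `ClaudeAgent` import path and the shape of the `task` object are assumptions, not verified against this release:

```python
import asyncio

import hud
from hud.agents import ClaudeAgent  # assumed export path for the Claude agent


async def main(task) -> None:
    # hud.eval() yields an EvalContext carrying the prompt and MCP tools (per the run() docstring).
    async with hud.eval(task) as ctx:
        # 0.5 style: typed factory instead of __init__ kwargs (passing kwargs to __init__ now warns).
        agent = ClaudeAgent.create(auto_respond=False, verbose=True)

        # The agent discovers tools from ctx, loops for at most max_steps,
        # and calls ctx.submit() with its final answer when a scenario is running.
        trace = await agent.run(ctx, max_steps=10)

        print(trace.done, trace.isError, trace.content)
        # ctx.reward is set by the scenario's evaluate phase.
        print(ctx.reward)


# asyncio.run(main(task))  # `task` is whatever your dataset/task loader provides
```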