hud_python-0.4.45-py3-none-any.whl → hud_python-0.5.13-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/tests/test_base.py
CHANGED
@@ -1,738 +1,416 @@
New file contents (0.5.13):

"""Tests for MCPAgent base class with v5 EvalContext pattern."""

from __future__ import annotations

from typing import Any, ClassVar

import pytest
from mcp import types

from hud.agents import MCPAgent
from hud.agents.base import BaseCreateParams
from hud.environment.router import ToolRouter
from hud.eval.context import EvalContext
from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult


class MockConfig(BaseAgentConfig):
    model_name: str = "MockAgent"
    model: str = "mock-model"


class MockCreateParams(BaseCreateParams, MockConfig):
    pass


class MockEvalContext(EvalContext):
    """Mock EvalContext for testing."""

    def __init__(
        self,
        prompt: str = "Test prompt",
        tools: list[types.Tool] | None = None,
    ) -> None:
        # Core attributes
        self.prompt = prompt
        self._tools = tools or [
            types.Tool(name="test_tool", description="A test tool", inputSchema={}),
            types.Tool(name="another_tool", description="Another tool", inputSchema={}),
        ]
        self._submitted: str | None = None
        self.reward: float | None = None
        self._tool_calls: list[tuple[str, dict[str, Any]]] = []

        # Environment attributes
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # EvalContext attributes
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self.results: list[Any] = []
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        return self._tools

    @property
    def has_scenario(self) -> bool:
        return True

    async def list_tools(self) -> list[types.Tool]:
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        # Parse the call
        if isinstance(call, tuple):
            name, args = call[0], call[1] if len(call) > 1 else {}
        elif hasattr(call, "name"):
            name, args = call.name, getattr(call, "arguments", {}) or {}
        else:
            name, args = str(call), kwargs
        self._tool_calls.append((name, args))
        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"Result from {name}")],
            isError=False,
        )

    async def submit(self, answer: str) -> None:
        self._submitted = answer


class MockMCPAgent(MCPAgent):
    """Concrete implementation of MCPAgent for testing."""

    metadata: ClassVar[dict[str, Any] | None] = {}
    config_cls: ClassVar[type[BaseAgentConfig]] = MockConfig

    def __init__(self, **kwargs: Any) -> None:
        params = MockCreateParams(**kwargs)
        super().__init__(params)
        self._response = AgentResponse(content="Mock response", tool_calls=[], done=True)

    def set_response(self, response: AgentResponse) -> None:
        self._response = response

    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        return self._response

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        formatted = []
        for tool_call, result in zip(tool_calls, tool_results, strict=True):
            formatted.append({"role": "tool", "name": tool_call.name, "content": str(result)})
        return formatted

    async def get_system_messages(self) -> list[Any]:
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        return [{"type": "text", "text": getattr(b, "text", "")} for b in blocks]


class TestMCPAgentInit:
    """Tests for MCPAgent initialization."""

    def test_init_defaults(self) -> None:
        """Test agent initializes with default config."""
        agent = MockMCPAgent()
        assert agent.ctx is None
        assert agent._initialized is False
        assert agent.system_prompt is None

    def test_init_with_system_prompt(self) -> None:
        """Test agent with custom system prompt."""
        agent = MockMCPAgent(system_prompt="Custom prompt")
        assert agent.system_prompt == "Custom prompt"


class TestMCPAgentRun:
    """Tests for MCPAgent.run() with EvalContext."""

    @pytest.mark.asyncio
    async def test_run_basic(self) -> None:
        """Test basic run flow with EvalContext."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()

        result = await agent.run(ctx)

        assert result.done is True
        assert result.content == "Mock response"
        assert ctx._submitted == "Mock response"

    @pytest.mark.asyncio
    async def test_run_initializes_agent(self) -> None:
        """Test run() initializes the agent with context."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()

        assert not agent._initialized
        await agent.run(ctx)
        assert agent._initialized

    @pytest.mark.asyncio
    async def test_run_discovers_tools(self) -> None:
        """Test run() discovers tools from context."""
        tools = [
            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
        ]
        ctx = MockEvalContext(prompt="Do something", tools=tools)
        agent = MockMCPAgent()

        # We need to check tools before cleanup
        # Store a reference to check
        discovered_tools = []

        original_run = agent._run_context

        async def capture_tools(*args: Any, **kwargs: Any) -> Any:
            discovered_tools.extend(agent.get_available_tools())
            return await original_run(*args, **kwargs)

        agent._run_context = capture_tools  # type: ignore
        await agent.run(ctx)

        assert len(discovered_tools) == 2
        assert discovered_tools[0].name == "tool1"
        assert discovered_tools[1].name == "tool2"

    @pytest.mark.asyncio
    async def test_run_requires_eval_context(self) -> None:
        """Test run() raises TypeError for non-EvalContext."""
        agent = MockMCPAgent()

        with pytest.raises(TypeError, match="must be EvalContext"):
            await agent.run("not a context")  # type: ignore

    @pytest.mark.asyncio
    async def test_run_requires_prompt(self) -> None:
        """Test run() raises ValueError when prompt is empty."""
        ctx = MockEvalContext(prompt="")
        agent = MockMCPAgent()

        with pytest.raises(ValueError, match="prompt is not set"):
            await agent.run(ctx)

    @pytest.mark.asyncio
    async def test_run_clears_context_after(self) -> None:
        """Test run() clears ctx after completion."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()

        await agent.run(ctx)
        assert agent.ctx is None

    @pytest.mark.asyncio
    async def test_run_no_submit_on_empty_content(self) -> None:
        """Test run() doesn't submit when content is empty."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()
        agent.set_response(AgentResponse(content="", tool_calls=[], done=True))

        await agent.run(ctx)
        assert ctx._submitted is None


class TestMCPAgentToolCalling:
    """Tests for tool calling through context."""

    @pytest.mark.asyncio
    async def test_call_tools_uses_context(self) -> None:
        """Test call_tools routes through ctx.call_tool."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()

        # Bind context manually
        agent.ctx = ctx
        await agent._initialize_from_ctx(ctx)

        # Call a tool
        results = await agent.call_tools(MCPToolCall(name="test_tool", arguments={"arg": "value"}))

        assert len(results) == 1
        assert not results[0].isError
        assert ("test_tool", {"arg": "value"}) in ctx._tool_calls

    @pytest.mark.asyncio
    async def test_call_tools_without_context_raises(self) -> None:
        """Test call_tools raises when no context bound."""
        agent = MockMCPAgent()

        with pytest.raises(ValueError, match="not bound to context"):
            await agent.call_tools(MCPToolCall(name="test_tool", arguments={}))


class TestMCPAgentRequiredTools:
    """Tests for required_tools validation."""

    @pytest.mark.asyncio
    async def test_missing_required_tools_raises(self) -> None:
        """Test run() raises when required tools are missing."""

        class AgentWithRequiredTools(MockMCPAgent):
            required_tools: ClassVar[list[str]] = ["must_have_tool"]

        ctx = MockEvalContext(prompt="Do something", tools=[])
        agent = AgentWithRequiredTools()

        with pytest.raises(ValueError, match="Required tools are missing"):
            await agent.run(ctx)

    @pytest.mark.asyncio
    async def test_required_tools_present_succeeds(self) -> None:
        """Test run() succeeds when required tools are present."""

        class AgentWithRequiredTools(MockMCPAgent):
            required_tools: ClassVar[list[str]] = ["required_tool"]

        tools = [types.Tool(name="required_tool", description="Required", inputSchema={})]
        ctx = MockEvalContext(prompt="Do something", tools=tools)
        agent = AgentWithRequiredTools()

        result = await agent.run(ctx)
        assert result.done


class TestMCPAgentOnToolsReady:
    """Tests for _on_tools_ready hook."""

    @pytest.mark.asyncio
    async def test_on_tools_ready_called(self) -> None:
        """Test _on_tools_ready is called during initialization."""
        hook_called = [False]

        class AgentWithHook(MockMCPAgent):
            def _on_tools_ready(self) -> None:
                hook_called[0] = True

        ctx = MockEvalContext(prompt="Do something")
        agent = AgentWithHook()

        await agent.run(ctx)
        assert hook_called[0]

    @pytest.mark.asyncio
    async def test_on_tools_ready_has_access_to_tools(self) -> None:
        """Test _on_tools_ready can access discovered tools."""
        captured_tools: list[types.Tool] = []

        class AgentWithHook(MockMCPAgent):
            def _on_tools_ready(self) -> None:
                captured_tools.extend(self.get_available_tools())

        tools = [
            types.Tool(name="tool1", description="Tool 1", inputSchema={}),
            types.Tool(name="tool2", description="Tool 2", inputSchema={}),
        ]
        ctx = MockEvalContext(prompt="Do something", tools=tools)
        agent = AgentWithHook()

        await agent.run(ctx)

        assert len(captured_tools) == 2
        assert captured_tools[0].name == "tool1"


class TestMCPAgentToolSchemas:
    """Tests for tool schema generation."""

    @pytest.mark.asyncio
    async def test_get_tool_schemas(self) -> None:
        """Test get_tool_schemas returns correct format."""
        tools = [
            types.Tool(
                name="my_tool",
                description="My tool description",
                inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
            )
        ]
        ctx = MockEvalContext(prompt="Do something", tools=tools)
        agent = MockMCPAgent()

        # Initialize agent
        agent.ctx = ctx
        await agent._initialize_from_ctx(ctx)

        schemas = agent.get_tool_schemas()
        assert len(schemas) == 1
        assert schemas[0]["name"] == "my_tool"
        assert schemas[0]["description"] == "My tool description"


class TestMCPAgentErrorPropagation:
    """Tests for error propagation to EvalContext."""

    @pytest.mark.asyncio
    async def test_exception_propagates_to_ctx_error(self) -> None:
        """Test that exceptions during run() set ctx.error for platform visibility."""

        class FailingAgent(MockMCPAgent):
            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
                raise RuntimeError("Agent crashed")

        ctx = MockEvalContext(prompt="Do something")
        agent = FailingAgent()

        result = await agent.run(ctx)

        # Should return error trace
        assert result.isError is True
        assert result.content is not None
        assert "Agent crashed" in result.content

        assert ctx.error is not None
        assert isinstance(ctx.error, BaseException)
        assert "Agent crashed" in str(ctx.error)

    @pytest.mark.asyncio
    async def test_step_error_propagates_to_ctx_error(self) -> None:
        """Test that step-level errors (caught internally) set ctx.error."""
        step_count = [0]

        class FailOnSecondStepAgent(MockMCPAgent):
            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
                step_count[0] += 1
                if step_count[0] == 1:
                    return AgentResponse(
                        content="",
                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
                        done=False,
                    )
                else:
                    raise ValueError("Step 2 failed")

        ctx = MockEvalContext(prompt="Do something")
        agent = FailOnSecondStepAgent()

        result = await agent.run(ctx)

        # Should return error trace
        assert result.isError is True
        assert ctx.error is not None
        assert "Step 2 failed" in str(ctx.error)

    @pytest.mark.asyncio
    async def test_no_error_when_successful(self) -> None:
        """Test that ctx.error remains None on successful run."""
        ctx = MockEvalContext(prompt="Do something")
        agent = MockMCPAgent()

        result = await agent.run(ctx)

        assert result.isError is False
        assert ctx.error is None