hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
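
Before the per-file diffs below, note the module-level moves visible in this listing: the `hud/rl/` and `hud/otel/` subpackages are removed, while `hud/environment/` and `hud/eval/` are new. A hedged sanity check (illustrative only; it assumes hud-python 0.5.13 is installed, and nothing beyond the module paths comes from the package itself):

```python
# Illustrative only: checks which subpackages exist after upgrading to 0.5.13.
# Module paths are taken from the file listing above; the rest is assumption.
import importlib.util

for removed in ("hud.rl", "hud.otel"):
    assert importlib.util.find_spec(removed) is None, f"{removed} was removed in 0.5.x"

for added in ("hud.environment", "hud.eval"):
    assert importlib.util.find_spec(added) is not None, f"{added} is new in 0.5.x"
```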

hud/clients/environment.py
ADDED

@@ -0,0 +1,51 @@
+"""Environment-based client adapter for agents."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import mcp.types as types
+
+from hud.types import MCPToolCall, MCPToolResult
+
+if TYPE_CHECKING:
+    from hud.environment import Environment
+    from hud.eval.context import EvalContext
+
+__all__ = ["EnvironmentClient"]
+
+
+class EnvironmentClient:
+    """Adapter wrapping Environment/EvalContext as AgentMCPClient."""
+
+    def __init__(self, env: Environment | EvalContext) -> None:
+        self._env = env
+        self._initialized = False
+
+    @property
+    def mcp_config(self) -> dict[str, dict[str, Any]]:
+        return {}
+
+    @property
+    def is_connected(self) -> bool:
+        return self._initialized
+
+    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
+        if not self._initialized:
+            await self._env.list_tools()
+            self._initialized = True
+
+    async def list_tools(self) -> list[types.Tool]:
+        return await self._env.list_tools()
+
+    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
+        result = await self._env.call_tool(tool_call.name, **(tool_call.arguments or {}))
+        if isinstance(result, MCPToolResult):
+            return result
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=str(result))],
+            isError=False,
+        )
+
+    async def shutdown(self) -> None:
+        self._initialized = False
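
The adapter above delegates to just two awaitable methods on the wrapped object, so a minimal sketch can exercise it with a stub in place of a real `hud.environment.Environment`. The stub and its `echo` tool are invented here for illustration; the `MCPToolCall`/`MCPToolResult` usage follows only the fields visible in the diff and is otherwise assumed.

```python
# Sketch only: StubEnvironment is a hypothetical stand-in, not the real
# hud.environment.Environment; it implements just the two methods the adapter calls.
import asyncio

import mcp.types as types

from hud.clients.environment import EnvironmentClient
from hud.types import MCPToolCall  # assumed constructible with name/arguments


class StubEnvironment:
    async def list_tools(self) -> list[types.Tool]:
        return [types.Tool(name="echo", inputSchema={"type": "object"})]

    async def call_tool(self, name: str, **kwargs: object) -> str:
        return f"{name}: {kwargs}"


async def main() -> None:
    client = EnvironmentClient(StubEnvironment())  # type: ignore[arg-type]
    await client.initialize()  # lists tools once and marks the adapter connected
    print([tool.name for tool in await client.list_tools()])

    # Plain return values are wrapped into MCPToolResult(content=[TextContent], isError=False).
    result = await client.call_tool(MCPToolCall(name="echo", arguments={"x": 1}))
    print(result.content[0].text, result.isError)

    await client.shutdown()


asyncio.run(main())
```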
hud/clients/fastmcp.py
CHANGED

@@ -12,6 +12,7 @@ from fastmcp import Client as FastMCPClient
 from mcp import Implementation, types
 from mcp.shared.exceptions import McpError
 
+from hud.settings import settings
 from hud.types import MCPToolCall, MCPToolResult
 from hud.version import __version__ as hud_version
 
@@ -73,7 +74,7 @@ class FastMCPHUDClient(BaseHUDClient):
             return
 
         # Create FastMCP client with the custom transport
-        timeout =
+        timeout = settings.client_timeout
        os.environ["FASTMCP_CLIENT_INIT_TIMEOUT"] = str(timeout)
 
         # Create custom transport with retry support for HTTP servers
@@ -91,11 +92,11 @@ class FastMCPHUDClient(BaseHUDClient):
             # Check if connecting to HUD API
             for server_config in mcp_config.values():
                 url = server_config.get("url", "")
-                if "mcp.hud.
+                if "mcp.hud.ai" in url:
                     raise RuntimeError(
                         "Authentication failed for HUD API. "
                         "Please ensure your HUD_API_KEY environment variable is set correctly."  # noqa: E501
-                        "You can get an API key at https://hud.
+                        "You can get an API key at https://hud.ai"
                     ) from e
             # Generic 401 error
             raise RuntimeError(
@@ -110,7 +111,7 @@ class FastMCPHUDClient(BaseHUDClient):
                 hasattr(self._client, "_session_state")
                 and self._client._session_state.session is not None
             ):
-                self._client._session_state.session._validate_structured_outputs = (
+                self._client._session_state.session._validate_structured_outputs = (  # type: ignore[attr-defined]
                     self._strict_validation
                 )
         except ImportError:
@@ -124,6 +125,12 @@ class FastMCPHUDClient(BaseHUDClient):
             raise ValueError("Client is not connected, call initialize() first")
         return await self._client.list_tools()
 
+    async def _list_prompts_impl(self) -> list[types.Prompt]:
+        """List all available prompts (FastMCP supports this)."""
+        if self._client is None:
+            raise ValueError("Client is not connected, call initialize() first")
+        return await self._client.list_prompts()
+
     async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
         """Execute a tool by name."""
         if self._client is None:
@@ -143,8 +150,8 @@ class FastMCPHUDClient(BaseHUDClient):
             structuredContent=result.structured_content,
         )
 
-    async def
-    """
+    async def _list_resources_impl(self) -> list[types.Resource]:
+        """Implementation of resource listing for FastMCP client."""
         if self._client is None:
             raise ValueError("Client is not connected, call initialize() first")
         return await self._client.list_resources()
hud/clients/mcp_use.py
CHANGED

@@ -9,9 +9,8 @@ from urllib.parse import urlparse
 
 from mcp import Implementation, types
 from mcp.shared.exceptions import McpError
-from mcp_use.client import MCPClient as MCPUseClient
-from mcp_use.session import MCPSession as MCPUseSession
-from mcp_use.types.http import HttpOptions
+from mcp_use.client.client import MCPClient as MCPUseClient
+from mcp_use.client.session import MCPSession as MCPUseSession
 from pydantic import AnyUrl
 
 from hud.settings import settings
@@ -20,7 +19,6 @@ from hud.utils.hud_console import HUDConsole
 from hud.version import __version__ as hud_version
 
 from .base import BaseHUDClient
-from .utils.retry_transport import create_retry_httpx_client
 
 logger = logging.getLogger(__name__)
 hud_console = HUDConsole(logger=logger)
@@ -58,12 +56,6 @@ class MCPUseHUDClient(BaseHUDClient):
             str, tuple[str, types.Tool, types.Tool]
         ] = {}  # server_name, original_tool, prefixed_tool
         self._client: Any | None = None  # Will be MCPUseClient when available
-        # Transport options for MCP-use (disable_sse_fallback, httpx_client_factory, etc.)
-        # Default to retry-enabled HTTPX client if factory not provided
-        self._http_options: HttpOptions = HttpOptions(
-            httpx_client_factory=create_retry_httpx_client,
-            disable_sse_fallback=True,
-        )
 
     async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
         """Create all sessions for MCP-use client."""
@@ -71,6 +63,16 @@ class MCPUseHUDClient(BaseHUDClient):
             logger.warning("Client is already connected, cannot connect again")
             return
 
+        # Use configurable timeout for SSE read operations to support long-running tool calls.
+        max_request_timeout = 840
+        for server_cfg in mcp_config.values():
+            if "sse_read_timeout" not in server_cfg:
+                server_cfg["sse_read_timeout"] = (
+                    min(settings.client_timeout, max_request_timeout)
+                    if settings.client_timeout > 0
+                    else max_request_timeout
+                )
+
         # If a server target matches HUD's MCP host and no auth is provided,
         # inject the HUD API key as a Bearer token to avoid OAuth browser flow.
         try:
@@ -88,11 +90,13 @@ class MCPUseHUDClient(BaseHUDClient):
         config = {"mcpServers": mcp_config}
         if MCPUseClient is None:
             raise ImportError("MCPUseClient is not available")
-        self._client = MCPUseClient.from_dict(config
+        self._client = MCPUseClient.from_dict(config)
         try:
-            assert self._client is not None
+            assert self._client is not None
             self._sessions = await self._client.create_all_sessions()
-
+            session_count = len(self._sessions)
+            session_text = "session" if session_count == 1 else "sessions"
+            hud_console.info(f"Created {session_count} MCP {session_text}")
 
         # Configure validation for all sessions based on client setting
         try:
@@ -241,8 +245,8 @@ class MCPUseHUDClient(BaseHUDClient):
                 structuredContent=result.structuredContent,
             )
 
-    async def
-    """
+    async def _list_resources_impl(self) -> list[types.Resource]:
+        """Implementation of resource listing for MCP-use client."""
         if self._client is None or not self._sessions:
             raise ValueError("Client is not connected, call initialize() first")
 
@@ -268,6 +272,32 @@ class MCPUseHUDClient(BaseHUDClient):
                 continue
         return []
 
+    async def _list_prompts_impl(self) -> list[types.Prompt]:
+        """Implementation of prompt listing for MCP-use client (best-effort)."""
+        if self._client is None or not self._sessions:
+            raise ValueError("Client is not connected, call initialize() first")
+
+        all_prompts: list[types.Prompt] = []
+        for server_name, session in self._sessions.items():
+            try:
+                if not hasattr(session, "connector") or not hasattr(
+                    session.connector, "client_session"
+                ):
+                    continue
+                if session.connector.client_session is None:
+                    continue
+
+                if not hasattr(session.connector.client_session, "list_prompts"):
+                    continue
+
+                prompts_result = await session.connector.client_session.list_prompts()
+                all_prompts.extend(prompts_result.prompts)
+            except Exception as e:
+                if self.verbose:
+                    hud_console.debug(f"Could not list prompts from server '{server_name}': {e}")
+                continue
+        return all_prompts
+
     async def read_resource(self, uri: str | AnyUrl) -> types.ReadResourceResult | None:
         """Read a resource by URI from any server that provides it."""
         if self._client is None or not self._sessions:
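
The `sse_read_timeout` defaulting added to `_connect()` above is easy to lift out on its own. Below is a standalone sketch of the same clamping rule; `resolve_sse_read_timeout`, `apply_default_sse_read_timeout`, and the constant name are written here for illustration, since the package inlines this logic rather than exposing helpers.

```python
# Standalone sketch of the sse_read_timeout defaulting shown in the diff above.
MAX_REQUEST_TIMEOUT = 840  # seconds; matches max_request_timeout in the diff


def resolve_sse_read_timeout(client_timeout: float) -> float:
    """Clamp a positive configured timeout to the 840 s ceiling; otherwise use the ceiling."""
    if client_timeout > 0:
        return min(client_timeout, MAX_REQUEST_TIMEOUT)
    return MAX_REQUEST_TIMEOUT


def apply_default_sse_read_timeout(mcp_config: dict[str, dict], client_timeout: float) -> None:
    """Per-server defaulting: only fill sse_read_timeout when the config omits it."""
    for server_cfg in mcp_config.values():
        server_cfg.setdefault("sse_read_timeout", resolve_sse_read_timeout(client_timeout))


config = {"local": {"command": "python", "args": ["-m", "server"]}}
apply_default_sse_read_timeout(config, client_timeout=3600)
assert config["local"]["sse_read_timeout"] == 840  # clamped to the ceiling
```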

hud/clients/tests/test_analyze_scenarios.py
ADDED

@@ -0,0 +1,206 @@
+"""Tests for scenario discovery via prompts/resources in analyze_environment()."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pytest
+from mcp import types
+from pydantic import AnyUrl
+
+from hud.clients.base import BaseHUDClient
+
+if TYPE_CHECKING:
+    from hud.types import MCPToolCall, MCPToolResult
+
+
+class _MockClient(BaseHUDClient):
+    """Minimal BaseHUDClient for testing analyze_environment scenario derivation."""
+
+    def __init__(
+        self,
+        *,
+        prompts: list[types.Prompt],
+        resources: list[types.Resource],
+    ) -> None:
+        super().__init__(mcp_config={"test": {"url": "mock://test"}}, verbose=True)
+        self._mock_prompts = prompts
+        self._mock_resources = resources
+        # Skip initialize() (which fetches telemetry); we just need analyze_environment().
+        self._initialized = True
+
+    async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:  # pragma: no cover
+        return None
+
+    async def list_tools(self) -> list[types.Tool]:
+        return []
+
+    async def _list_resources_impl(self) -> list[types.Resource]:
+        return self._mock_resources
+
+    async def _list_prompts_impl(self) -> list[types.Prompt]:
+        return self._mock_prompts
+
+    async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:  # pragma: no cover
+        raise NotImplementedError
+
+    async def read_resource(self, uri: str) -> types.ReadResourceResult | None:  # pragma: no cover
+        return None
+
+    async def _disconnect(self) -> None:  # pragma: no cover
+        return None
+
+
+@pytest.mark.asyncio
+async def test_analyze_environment_derives_scenarios_from_scenario_prompt_and_resource() -> None:
+    prompts = [
+        types.Prompt(
+            name="my-env:checkout",
+            description="[Setup] Checkout flow",
+            arguments=[],
+        )
+    ]
+    resources = [
+        types.Resource(
+            uri=AnyUrl("my-env:checkout"),
+            name="checkout",
+            description="[Evaluate] Checkout flow",
+        )
+    ]
+
+    client = _MockClient(prompts=prompts, resources=resources)
+    analysis = await client.analyze_environment()
+
+    assert "scenarios" in analysis
+    assert len(analysis["scenarios"]) == 1
+    scenario = analysis["scenarios"][0]
+    assert scenario["id"] == "my-env:checkout"
+    assert scenario["env"] == "my-env"
+    assert scenario["name"] == "checkout"
+    assert scenario["has_setup_prompt"] is True
+    assert scenario["has_evaluate_resource"] is True
+
+
+@pytest.mark.asyncio
+async def test_analyze_environment_scenario_from_setup_only() -> None:
+    prompts = [
+        types.Prompt(
+            name="env-x:only_setup",
+            description="[Setup] Setup only scenario",
+            arguments=[],
+        )
+    ]
+    resources: list[types.Resource] = []
+
+    client = _MockClient(prompts=prompts, resources=resources)
+    analysis = await client.analyze_environment()
+
+    assert len(analysis["scenarios"]) == 1
+    scenario = analysis["scenarios"][0]
+    assert scenario["id"] == "env-x:only_setup"
+    assert scenario["has_setup_prompt"] is True
+    assert scenario["has_evaluate_resource"] is False
+
+
+@pytest.mark.asyncio
+async def test_analyze_environment_scenario_from_evaluate_only() -> None:
+    prompts: list[types.Prompt] = []
+    resources = [
+        types.Resource(
+            uri=AnyUrl("env-y:only_eval"),
+            name="only_eval",
+            description="[Evaluate] Evaluate only scenario",
+        )
+    ]
+
+    client = _MockClient(prompts=prompts, resources=resources)
+    analysis = await client.analyze_environment()
+
+    assert len(analysis["scenarios"]) == 1
+    scenario = analysis["scenarios"][0]
+    assert scenario["id"] == "env-y:only_eval"
+    assert scenario["has_setup_prompt"] is False
+    assert scenario["has_evaluate_resource"] is True
+
+
+@pytest.mark.asyncio
+async def test_analyze_environment_extracts_scenario_code_from_meta() -> None:
+    """Test that scenario code is extracted from the meta field."""
+    scenario_code = """@env.scenario()
+async def checkout(product_id: str):
+    await env.call_tool("navigate", url="/checkout")
+    yield "Complete the checkout"
+    result = await env.call_tool("check_order")
+    yield 1.0 if result else 0.0
+"""
+    # Use model_validate with _meta alias (Pydantic alias for the meta field)
+    prompts = [
+        types.Prompt.model_validate(
+            {
+                "name": "my-env:checkout",
+                "description": "[Setup] Checkout flow",
+                "arguments": [{"name": "product_id", "required": True}],
+                "_meta": {"code": scenario_code},
+            }
+        )
+    ]
+    resources = [
+        types.Resource.model_validate(
+            {
+                "uri": "my-env:checkout",
+                "name": "checkout",
+                "description": "[Evaluate] Checkout flow",
+                "_meta": {"code": scenario_code},
+            }
+        )
+    ]
+
+    client = _MockClient(prompts=prompts, resources=resources)
+    analysis = await client.analyze_environment()
+
+    assert len(analysis["scenarios"]) == 1
+    scenario = analysis["scenarios"][0]
+    assert scenario["id"] == "my-env:checkout"
+    assert "code" in scenario
+    assert scenario["code"] == scenario_code
+    assert "async def checkout" in scenario["code"]
+
+
+@pytest.mark.asyncio
+async def test_analyze_environment_extracts_meta_on_prompts_and_resources() -> None:
+    """Test that meta field is included in prompts and resources analysis."""
+    meta_data = {"code": "test code", "extra": "value"}
+    # Use model_validate with _meta alias (Pydantic alias for the meta field)
+    prompts = [
+        types.Prompt.model_validate(
+            {
+                "name": "test-prompt",
+                "description": "A test prompt",
+                "arguments": [],
+                "_meta": meta_data,
+            }
+        )
+    ]
+    resources = [
+        types.Resource.model_validate(
+            {
+                "uri": "file:///test",
+                "name": "test-resource",
+                "description": "A test resource",
+                "_meta": meta_data,
+            }
+        )
+    ]
+
+    client = _MockClient(prompts=prompts, resources=resources)
+    analysis = await client.analyze_environment()
+
+    # Check prompts have meta
+    assert len(analysis["prompts"]) == 1
+    assert "meta" in analysis["prompts"][0]
+    assert analysis["prompts"][0]["meta"] == meta_data
+
+    # Check resources have meta
+    assert len(analysis["resources"]) == 1
+    assert "meta" in analysis["resources"][0]
+    assert analysis["resources"][0]["meta"] == meta_data
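
These tests pin down a naming convention rather than an implementation: a scenario surfaces as a prompt and/or resource identified by "<env>:<name>", whose descriptions start with [Setup] and [Evaluate] and which may carry source code under _meta["code"]. Below is a simplified sketch of that derivation; it is not the package's analyze_environment() (a BaseHUDClient method, which may differ), and it takes plain dicts standing in for mcp.types.Prompt / mcp.types.Resource.

```python
# Simplified sketch, not the package implementation.
def derive_scenarios(prompts: list[dict], resources: list[dict]) -> list[dict]:
    scenarios: dict[str, dict] = {}

    def record(identifier: str) -> dict:
        env, _, name = identifier.partition(":")
        return scenarios.setdefault(
            identifier,
            {
                "id": identifier,
                "env": env,
                "name": name,
                "has_setup_prompt": False,
                "has_evaluate_resource": False,
            },
        )

    for prompt in prompts:
        if (prompt.get("description") or "").startswith("[Setup]"):
            record(prompt["name"])["has_setup_prompt"] = True
    for resource in resources:
        if (resource.get("description") or "").startswith("[Evaluate]"):
            record(str(resource["uri"]))["has_evaluate_resource"] = True
    return list(scenarios.values())


[scenario] = derive_scenarios(
    prompts=[{"name": "my-env:checkout", "description": "[Setup] Checkout flow"}],
    resources=[{"uri": "my-env:checkout", "description": "[Evaluate] Checkout flow"}],
)
assert scenario == {
    "id": "my-env:checkout",
    "env": "my-env",
    "name": "checkout",
    "has_setup_prompt": True,
    "has_evaluate_resource": True,
}
```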

hud/clients/tests/test_protocol.py
CHANGED

@@ -35,9 +35,15 @@ class MockClient(BaseHUDClient):
             raise RuntimeError("Not connected")
         return self._mock_tools
 
-    async def
-    """Minimal
-
+    async def _list_resources_impl(self) -> list[types.Resource]:
+        """Minimal resource listing implementation for tests."""
+        from pydantic import AnyUrl
+
+        return [
+            types.Resource(
+                uri=AnyUrl("telemetry://live"), name="telemetry", description="Live telemetry data"
+            )
+        ]
 
     async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
         if tool_call.name == "test_tool":
hud/datasets/__init__.py
CHANGED

@@ -1,33 +1,36 @@
 """HUD datasets module.
 
-Provides
+Provides unified task loading, saving, and execution for HUD evaluations.
+
+Key functions:
+- load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API
+- save_tasks(): Save tasks to the HUD API
+- run_dataset(): Run an agent on a dataset of tasks
+- submit_rollouts(): Submit tasks for remote execution
+
+Supports both v4 (LegacyTask) and v5 (Task) formats with automatic conversion.
 """
 
-# Data models
-# Execution functions
 from __future__ import annotations
 
-from hud.
+from hud.eval.display import display_results
 
-from .
-
-
-
+from .loader import load_dataset, load_tasks, save_tasks
+from .runner import run_dataset, run_single_task
+from .utils import (
+    BatchRequest,
+    SingleTaskRequest,
+    submit_rollouts,
 )
-from .runner import run_dataset
-
-# Utilities
-from .utils import fetch_system_prompt_from_dataset, save_tasks
 
 __all__ = [
-
-    "
-    "
-    #
-    "
-    # Execution
+    "BatchRequest",
+    "SingleTaskRequest",
+    "display_results",
+    "load_dataset",  # Deprecated alias
+    "load_tasks",
     "run_dataset",
-    "
-    "run_dataset_parallel_manual",
+    "run_single_task",
     "save_tasks",
+    "submit_rollouts",
 ]