hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,51 @@
1
+ """Environment-based client adapter for agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ import mcp.types as types
8
+
9
+ from hud.types import MCPToolCall, MCPToolResult
10
+
11
+ if TYPE_CHECKING:
12
+ from hud.environment import Environment
13
+ from hud.eval.context import EvalContext
14
+
15
+ __all__ = ["EnvironmentClient"]
16
+
17
+
18
+ class EnvironmentClient:
19
+ """Adapter wrapping Environment/EvalContext as AgentMCPClient."""
20
+
21
+ def __init__(self, env: Environment | EvalContext) -> None:
22
+ self._env = env
23
+ self._initialized = False
24
+
25
+ @property
26
+ def mcp_config(self) -> dict[str, dict[str, Any]]:
27
+ return {}
28
+
29
+ @property
30
+ def is_connected(self) -> bool:
31
+ return self._initialized
32
+
33
+ async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
34
+ if not self._initialized:
35
+ await self._env.list_tools()
36
+ self._initialized = True
37
+
38
+ async def list_tools(self) -> list[types.Tool]:
39
+ return await self._env.list_tools()
40
+
41
+ async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
42
+ result = await self._env.call_tool(tool_call.name, **(tool_call.arguments or {}))
43
+ if isinstance(result, MCPToolResult):
44
+ return result
45
+ return MCPToolResult(
46
+ content=[types.TextContent(type="text", text=str(result))],
47
+ isError=False,
48
+ )
49
+
50
+ async def shutdown(self) -> None:
51
+ self._initialized = False
hud/clients/fastmcp.py CHANGED
@@ -12,6 +12,7 @@ from fastmcp import Client as FastMCPClient
12
12
  from mcp import Implementation, types
13
13
  from mcp.shared.exceptions import McpError
14
14
 
15
+ from hud.settings import settings
15
16
  from hud.types import MCPToolCall, MCPToolResult
16
17
  from hud.version import __version__ as hud_version
17
18
 
@@ -73,7 +74,7 @@ class FastMCPHUDClient(BaseHUDClient):
73
74
  return
74
75
 
75
76
  # Create FastMCP client with the custom transport
76
- timeout = 10 * 60 # 5 minutes
77
+ timeout = settings.client_timeout
77
78
  os.environ["FASTMCP_CLIENT_INIT_TIMEOUT"] = str(timeout)
78
79
 
79
80
  # Create custom transport with retry support for HTTP servers
@@ -91,11 +92,11 @@ class FastMCPHUDClient(BaseHUDClient):
91
92
  # Check if connecting to HUD API
92
93
  for server_config in mcp_config.values():
93
94
  url = server_config.get("url", "")
94
- if "mcp.hud.so" in url:
95
+ if "mcp.hud.ai" in url:
95
96
  raise RuntimeError(
96
97
  "Authentication failed for HUD API. "
97
98
  "Please ensure your HUD_API_KEY environment variable is set correctly." # noqa: E501
98
- "You can get an API key at https://hud.so"
99
+ "You can get an API key at https://hud.ai"
99
100
  ) from e
100
101
  # Generic 401 error
101
102
  raise RuntimeError(
@@ -110,7 +111,7 @@ class FastMCPHUDClient(BaseHUDClient):
110
111
  hasattr(self._client, "_session_state")
111
112
  and self._client._session_state.session is not None
112
113
  ):
113
- self._client._session_state.session._validate_structured_outputs = (
114
+ self._client._session_state.session._validate_structured_outputs = ( # type: ignore[attr-defined]
114
115
  self._strict_validation
115
116
  )
116
117
  except ImportError:
@@ -124,6 +125,12 @@ class FastMCPHUDClient(BaseHUDClient):
124
125
  raise ValueError("Client is not connected, call initialize() first")
125
126
  return await self._client.list_tools()
126
127
 
128
+ async def _list_prompts_impl(self) -> list[types.Prompt]:
129
+ """List all available prompts (FastMCP supports this)."""
130
+ if self._client is None:
131
+ raise ValueError("Client is not connected, call initialize() first")
132
+ return await self._client.list_prompts()
133
+
127
134
  async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
128
135
  """Execute a tool by name."""
129
136
  if self._client is None:
@@ -143,8 +150,8 @@ class FastMCPHUDClient(BaseHUDClient):
143
150
  structuredContent=result.structured_content,
144
151
  )
145
152
 
146
- async def list_resources(self) -> list[types.Resource]:
147
- """List all available resources."""
153
+ async def _list_resources_impl(self) -> list[types.Resource]:
154
+ """Implementation of resource listing for FastMCP client."""
148
155
  if self._client is None:
149
156
  raise ValueError("Client is not connected, call initialize() first")
150
157
  return await self._client.list_resources()
hud/clients/mcp_use.py CHANGED
@@ -9,9 +9,8 @@ from urllib.parse import urlparse
9
9
 
10
10
  from mcp import Implementation, types
11
11
  from mcp.shared.exceptions import McpError
12
- from mcp_use.client import MCPClient as MCPUseClient
13
- from mcp_use.session import MCPSession as MCPUseSession
14
- from mcp_use.types.http import HttpOptions
12
+ from mcp_use.client.client import MCPClient as MCPUseClient
13
+ from mcp_use.client.session import MCPSession as MCPUseSession
15
14
  from pydantic import AnyUrl
16
15
 
17
16
  from hud.settings import settings
@@ -20,7 +19,6 @@ from hud.utils.hud_console import HUDConsole
20
19
  from hud.version import __version__ as hud_version
21
20
 
22
21
  from .base import BaseHUDClient
23
- from .utils.retry_transport import create_retry_httpx_client
24
22
 
25
23
  logger = logging.getLogger(__name__)
26
24
  hud_console = HUDConsole(logger=logger)
@@ -58,12 +56,6 @@ class MCPUseHUDClient(BaseHUDClient):
58
56
  str, tuple[str, types.Tool, types.Tool]
59
57
  ] = {} # server_name, original_tool, prefixed_tool
60
58
  self._client: Any | None = None # Will be MCPUseClient when available
61
- # Transport options for MCP-use (disable_sse_fallback, httpx_client_factory, etc.)
62
- # Default to retry-enabled HTTPX client if factory not provided
63
- self._http_options: HttpOptions = HttpOptions(
64
- httpx_client_factory=create_retry_httpx_client,
65
- disable_sse_fallback=True,
66
- )
67
59
 
68
60
  async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
69
61
  """Create all sessions for MCP-use client."""
@@ -71,6 +63,16 @@ class MCPUseHUDClient(BaseHUDClient):
71
63
  logger.warning("Client is already connected, cannot connect again")
72
64
  return
73
65
 
66
+ # Use configurable timeout for SSE read operations to support long-running tool calls.
67
+ max_request_timeout = 840
68
+ for server_cfg in mcp_config.values():
69
+ if "sse_read_timeout" not in server_cfg:
70
+ server_cfg["sse_read_timeout"] = (
71
+ min(settings.client_timeout, max_request_timeout)
72
+ if settings.client_timeout > 0
73
+ else max_request_timeout
74
+ )
75
+
74
76
  # If a server target matches HUD's MCP host and no auth is provided,
75
77
  # inject the HUD API key as a Bearer token to avoid OAuth browser flow.
76
78
  try:
@@ -88,11 +90,13 @@ class MCPUseHUDClient(BaseHUDClient):
88
90
  config = {"mcpServers": mcp_config}
89
91
  if MCPUseClient is None:
90
92
  raise ImportError("MCPUseClient is not available")
91
- self._client = MCPUseClient.from_dict(config, http_options=self._http_options)
93
+ self._client = MCPUseClient.from_dict(config)
92
94
  try:
93
- assert self._client is not None # noqa: S101
95
+ assert self._client is not None
94
96
  self._sessions = await self._client.create_all_sessions()
95
- hud_console.info(f"Created {len(self._sessions)} MCP sessions")
97
+ session_count = len(self._sessions)
98
+ session_text = "session" if session_count == 1 else "sessions"
99
+ hud_console.info(f"Created {session_count} MCP {session_text}")
96
100
 
97
101
  # Configure validation for all sessions based on client setting
98
102
  try:
@@ -241,8 +245,8 @@ class MCPUseHUDClient(BaseHUDClient):
241
245
  structuredContent=result.structuredContent,
242
246
  )
243
247
 
244
- async def list_resources(self) -> list[types.Resource]:
245
- """List all available resources."""
248
+ async def _list_resources_impl(self) -> list[types.Resource]:
249
+ """Implementation of resource listing for MCP-use client."""
246
250
  if self._client is None or not self._sessions:
247
251
  raise ValueError("Client is not connected, call initialize() first")
248
252
 
@@ -268,6 +272,32 @@ class MCPUseHUDClient(BaseHUDClient):
268
272
  continue
269
273
  return []
270
274
 
275
+ async def _list_prompts_impl(self) -> list[types.Prompt]:
276
+ """Implementation of prompt listing for MCP-use client (best-effort)."""
277
+ if self._client is None or not self._sessions:
278
+ raise ValueError("Client is not connected, call initialize() first")
279
+
280
+ all_prompts: list[types.Prompt] = []
281
+ for server_name, session in self._sessions.items():
282
+ try:
283
+ if not hasattr(session, "connector") or not hasattr(
284
+ session.connector, "client_session"
285
+ ):
286
+ continue
287
+ if session.connector.client_session is None:
288
+ continue
289
+
290
+ if not hasattr(session.connector.client_session, "list_prompts"):
291
+ continue
292
+
293
+ prompts_result = await session.connector.client_session.list_prompts()
294
+ all_prompts.extend(prompts_result.prompts)
295
+ except Exception as e:
296
+ if self.verbose:
297
+ hud_console.debug(f"Could not list prompts from server '{server_name}': {e}")
298
+ continue
299
+ return all_prompts
300
+
271
301
  async def read_resource(self, uri: str | AnyUrl) -> types.ReadResourceResult | None:
272
302
  """Read a resource by URI from any server that provides it."""
273
303
  if self._client is None or not self._sessions:
@@ -0,0 +1,206 @@
1
+ """Tests for scenario discovery via prompts/resources in analyze_environment()."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ import pytest
8
+ from mcp import types
9
+ from pydantic import AnyUrl
10
+
11
+ from hud.clients.base import BaseHUDClient
12
+
13
+ if TYPE_CHECKING:
14
+ from hud.types import MCPToolCall, MCPToolResult
15
+
16
+
17
+ class _MockClient(BaseHUDClient):
18
+ """Minimal BaseHUDClient for testing analyze_environment scenario derivation."""
19
+
20
+ def __init__(
21
+ self,
22
+ *,
23
+ prompts: list[types.Prompt],
24
+ resources: list[types.Resource],
25
+ ) -> None:
26
+ super().__init__(mcp_config={"test": {"url": "mock://test"}}, verbose=True)
27
+ self._mock_prompts = prompts
28
+ self._mock_resources = resources
29
+ # Skip initialize() (which fetches telemetry); we just need analyze_environment().
30
+ self._initialized = True
31
+
32
+ async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None: # pragma: no cover
33
+ return None
34
+
35
+ async def list_tools(self) -> list[types.Tool]:
36
+ return []
37
+
38
+ async def _list_resources_impl(self) -> list[types.Resource]:
39
+ return self._mock_resources
40
+
41
+ async def _list_prompts_impl(self) -> list[types.Prompt]:
42
+ return self._mock_prompts
43
+
44
+ async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult: # pragma: no cover
45
+ raise NotImplementedError
46
+
47
+ async def read_resource(self, uri: str) -> types.ReadResourceResult | None: # pragma: no cover
48
+ return None
49
+
50
+ async def _disconnect(self) -> None: # pragma: no cover
51
+ return None
52
+
53
+
54
+ @pytest.mark.asyncio
55
+ async def test_analyze_environment_derives_scenarios_from_scenario_prompt_and_resource() -> None:
56
+ prompts = [
57
+ types.Prompt(
58
+ name="my-env:checkout",
59
+ description="[Setup] Checkout flow",
60
+ arguments=[],
61
+ )
62
+ ]
63
+ resources = [
64
+ types.Resource(
65
+ uri=AnyUrl("my-env:checkout"),
66
+ name="checkout",
67
+ description="[Evaluate] Checkout flow",
68
+ )
69
+ ]
70
+
71
+ client = _MockClient(prompts=prompts, resources=resources)
72
+ analysis = await client.analyze_environment()
73
+
74
+ assert "scenarios" in analysis
75
+ assert len(analysis["scenarios"]) == 1
76
+ scenario = analysis["scenarios"][0]
77
+ assert scenario["id"] == "my-env:checkout"
78
+ assert scenario["env"] == "my-env"
79
+ assert scenario["name"] == "checkout"
80
+ assert scenario["has_setup_prompt"] is True
81
+ assert scenario["has_evaluate_resource"] is True
82
+
83
+
84
+ @pytest.mark.asyncio
85
+ async def test_analyze_environment_scenario_from_setup_only() -> None:
86
+ prompts = [
87
+ types.Prompt(
88
+ name="env-x:only_setup",
89
+ description="[Setup] Setup only scenario",
90
+ arguments=[],
91
+ )
92
+ ]
93
+ resources: list[types.Resource] = []
94
+
95
+ client = _MockClient(prompts=prompts, resources=resources)
96
+ analysis = await client.analyze_environment()
97
+
98
+ assert len(analysis["scenarios"]) == 1
99
+ scenario = analysis["scenarios"][0]
100
+ assert scenario["id"] == "env-x:only_setup"
101
+ assert scenario["has_setup_prompt"] is True
102
+ assert scenario["has_evaluate_resource"] is False
103
+
104
+
105
+ @pytest.mark.asyncio
106
+ async def test_analyze_environment_scenario_from_evaluate_only() -> None:
107
+ prompts: list[types.Prompt] = []
108
+ resources = [
109
+ types.Resource(
110
+ uri=AnyUrl("env-y:only_eval"),
111
+ name="only_eval",
112
+ description="[Evaluate] Evaluate only scenario",
113
+ )
114
+ ]
115
+
116
+ client = _MockClient(prompts=prompts, resources=resources)
117
+ analysis = await client.analyze_environment()
118
+
119
+ assert len(analysis["scenarios"]) == 1
120
+ scenario = analysis["scenarios"][0]
121
+ assert scenario["id"] == "env-y:only_eval"
122
+ assert scenario["has_setup_prompt"] is False
123
+ assert scenario["has_evaluate_resource"] is True
124
+
125
+
126
+ @pytest.mark.asyncio
127
+ async def test_analyze_environment_extracts_scenario_code_from_meta() -> None:
128
+ """Test that scenario code is extracted from the meta field."""
129
+ scenario_code = """@env.scenario()
130
+ async def checkout(product_id: str):
131
+ await env.call_tool("navigate", url="/checkout")
132
+ yield "Complete the checkout"
133
+ result = await env.call_tool("check_order")
134
+ yield 1.0 if result else 0.0
135
+ """
136
+ # Use model_validate with _meta alias (Pydantic alias for the meta field)
137
+ prompts = [
138
+ types.Prompt.model_validate(
139
+ {
140
+ "name": "my-env:checkout",
141
+ "description": "[Setup] Checkout flow",
142
+ "arguments": [{"name": "product_id", "required": True}],
143
+ "_meta": {"code": scenario_code},
144
+ }
145
+ )
146
+ ]
147
+ resources = [
148
+ types.Resource.model_validate(
149
+ {
150
+ "uri": "my-env:checkout",
151
+ "name": "checkout",
152
+ "description": "[Evaluate] Checkout flow",
153
+ "_meta": {"code": scenario_code},
154
+ }
155
+ )
156
+ ]
157
+
158
+ client = _MockClient(prompts=prompts, resources=resources)
159
+ analysis = await client.analyze_environment()
160
+
161
+ assert len(analysis["scenarios"]) == 1
162
+ scenario = analysis["scenarios"][0]
163
+ assert scenario["id"] == "my-env:checkout"
164
+ assert "code" in scenario
165
+ assert scenario["code"] == scenario_code
166
+ assert "async def checkout" in scenario["code"]
167
+
168
+
169
+ @pytest.mark.asyncio
170
+ async def test_analyze_environment_extracts_meta_on_prompts_and_resources() -> None:
171
+ """Test that meta field is included in prompts and resources analysis."""
172
+ meta_data = {"code": "test code", "extra": "value"}
173
+ # Use model_validate with _meta alias (Pydantic alias for the meta field)
174
+ prompts = [
175
+ types.Prompt.model_validate(
176
+ {
177
+ "name": "test-prompt",
178
+ "description": "A test prompt",
179
+ "arguments": [],
180
+ "_meta": meta_data,
181
+ }
182
+ )
183
+ ]
184
+ resources = [
185
+ types.Resource.model_validate(
186
+ {
187
+ "uri": "file:///test",
188
+ "name": "test-resource",
189
+ "description": "A test resource",
190
+ "_meta": meta_data,
191
+ }
192
+ )
193
+ ]
194
+
195
+ client = _MockClient(prompts=prompts, resources=resources)
196
+ analysis = await client.analyze_environment()
197
+
198
+ # Check prompts have meta
199
+ assert len(analysis["prompts"]) == 1
200
+ assert "meta" in analysis["prompts"][0]
201
+ assert analysis["prompts"][0]["meta"] == meta_data
202
+
203
+ # Check resources have meta
204
+ assert len(analysis["resources"]) == 1
205
+ assert "meta" in analysis["resources"][0]
206
+ assert analysis["resources"][0]["meta"] == meta_data
@@ -35,9 +35,15 @@ class MockClient(BaseHUDClient):
35
35
  raise RuntimeError("Not connected")
36
36
  return self._mock_tools
37
37
 
38
- async def list_resources(self) -> list[types.Resource]:
39
- """Minimal list_resources for protocol satisfaction in tests."""
40
- return []
38
+ async def _list_resources_impl(self) -> list[types.Resource]:
39
+ """Minimal resource listing implementation for tests."""
40
+ from pydantic import AnyUrl
41
+
42
+ return [
43
+ types.Resource(
44
+ uri=AnyUrl("telemetry://live"), name="telemetry", description="Live telemetry data"
45
+ )
46
+ ]
41
47
 
42
48
  async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
43
49
  if tool_call.name == "test_tool":
hud/datasets/__init__.py CHANGED
@@ -1,33 +1,36 @@
1
1
  """HUD datasets module.
2
2
 
3
- Provides data models, utilities, and execution functions for working with HUD datasets.
3
+ Provides unified task loading, saving, and execution for HUD evaluations.
4
+
5
+ Key functions:
6
+ - load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API
7
+ - save_tasks(): Save tasks to the HUD API
8
+ - run_dataset(): Run an agent on a dataset of tasks
9
+ - submit_rollouts(): Submit tasks for remote execution
10
+
11
+ Supports both v4 (LegacyTask) and v5 (Task) formats with automatic conversion.
4
12
  """
5
13
 
6
- # Data models
7
- # Execution functions
8
14
  from __future__ import annotations
9
15
 
10
- from hud.types import Task
16
+ from hud.eval.display import display_results
11
17
 
12
- from .parallel import (
13
- calculate_optimal_workers,
14
- run_dataset_parallel,
15
- run_dataset_parallel_manual,
18
+ from .loader import load_dataset, load_tasks, save_tasks
19
+ from .runner import run_dataset, run_single_task
20
+ from .utils import (
21
+ BatchRequest,
22
+ SingleTaskRequest,
23
+ submit_rollouts,
16
24
  )
17
- from .runner import run_dataset
18
-
19
- # Utilities
20
- from .utils import fetch_system_prompt_from_dataset, save_tasks
21
25
 
22
26
  __all__ = [
23
- # Core data model
24
- "Task",
25
- "calculate_optimal_workers",
26
- # Utilities
27
- "fetch_system_prompt_from_dataset",
28
- # Execution
27
+ "BatchRequest",
28
+ "SingleTaskRequest",
29
+ "display_results",
30
+ "load_dataset", # Deprecated alias
31
+ "load_tasks",
29
32
  "run_dataset",
30
- "run_dataset_parallel",
31
- "run_dataset_parallel_manual",
33
+ "run_single_task",
32
34
  "save_tasks",
35
+ "submit_rollouts",
33
36
  ]