hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,51 @@
1
+ """Environment-based client adapter for agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ import mcp.types as types
8
+
9
+ from hud.types import MCPToolCall, MCPToolResult
10
+
11
+ if TYPE_CHECKING:
12
+ from hud.environment import Environment
13
+ from hud.eval.context import EvalContext
14
+
15
+ __all__ = ["EnvironmentClient"]
16
+
17
+
18
class EnvironmentClient:
    """Adapter that exposes an Environment/EvalContext as an AgentMCPClient.

    Every operation is delegated to the wrapped object; the adapter itself
    only tracks whether ``initialize()`` has completed.
    """

    def __init__(self, env: Environment | EvalContext) -> None:
        """Store the wrapped environment and start in the uninitialized state."""
        self._initialized = False
        self._env = env

    @property
    def mcp_config(self) -> dict[str, dict[str, Any]]:
        """Return an empty MCP configuration mapping."""
        return {}

    @property
    def is_connected(self) -> bool:
        """Report whether ``initialize()`` has completed since the last ``shutdown()``."""
        return self._initialized

    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
        """Query the wrapped environment's tools once, then mark the client initialized.

        ``mcp_config`` is accepted but not used. Calling this again while
        already initialized does nothing.
        """
        if self._initialized:
            return
        await self._env.list_tools()
        self._initialized = True

    async def list_tools(self) -> list[types.Tool]:
        """Return the tool list reported by the wrapped environment."""
        tools = await self._env.list_tools()
        return tools

    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
        """Run ``tool_call`` on the wrapped environment.

        The call's arguments are expanded into keyword arguments. A result
        that is not already an ``MCPToolResult`` is stringified and wrapped
        as a single non-error text content item.
        """
        tool_name = tool_call.name
        keyword_args = tool_call.arguments or {}
        outcome = await self._env.call_tool(tool_name, **keyword_args)
        if not isinstance(outcome, MCPToolResult):
            text_item = types.TextContent(type="text", text=str(outcome))
            outcome = MCPToolResult(content=[text_item], isError=False)
        return outcome

    async def shutdown(self) -> None:
        """Reset the initialized flag; the wrapped environment is not touched here."""
        self._initialized = False
hud/clients/fastmcp.py CHANGED
@@ -12,6 +12,7 @@ from fastmcp import Client as FastMCPClient
12
12
  from mcp import Implementation, types
13
13
  from mcp.shared.exceptions import McpError
14
14
 
15
+ from hud.settings import settings
15
16
  from hud.types import MCPToolCall, MCPToolResult
16
17
  from hud.version import __version__ as hud_version
17
18
 
@@ -73,7 +74,7 @@ class FastMCPHUDClient(BaseHUDClient):
73
74
  return
74
75
 
75
76
  # Create FastMCP client with the custom transport
76
- timeout = 10 * 60 # 5 minutes
77
+ timeout = settings.client_timeout
77
78
  os.environ["FASTMCP_CLIENT_INIT_TIMEOUT"] = str(timeout)
78
79
 
79
80
  # Create custom transport with retry support for HTTP servers
@@ -91,11 +92,11 @@ class FastMCPHUDClient(BaseHUDClient):
91
92
  # Check if connecting to HUD API
92
93
  for server_config in mcp_config.values():
93
94
  url = server_config.get("url", "")
94
- if "mcp.hud.so" in url:
95
+ if "mcp.hud.ai" in url:
95
96
  raise RuntimeError(
96
97
  "Authentication failed for HUD API. "
97
98
  "Please ensure your HUD_API_KEY environment variable is set correctly." # noqa: E501
98
- "You can get an API key at https://hud.so"
99
+ "You can get an API key at https://hud.ai"
99
100
  ) from e
100
101
  # Generic 401 error
101
102
  raise RuntimeError(
@@ -110,7 +111,7 @@ class FastMCPHUDClient(BaseHUDClient):
110
111
  hasattr(self._client, "_session_state")
111
112
  and self._client._session_state.session is not None
112
113
  ):
113
- self._client._session_state.session._validate_structured_outputs = (
114
+ self._client._session_state.session._validate_structured_outputs = ( # type: ignore[attr-defined]
114
115
  self._strict_validation
115
116
  )
116
117
  except ImportError:
@@ -124,6 +125,12 @@ class FastMCPHUDClient(BaseHUDClient):
124
125
  raise ValueError("Client is not connected, call initialize() first")
125
126
  return await self._client.list_tools()
126
127
 
128
+ async def _list_prompts_impl(self) -> list[types.Prompt]:
129
+ """List all available prompts (FastMCP supports this)."""
130
+ if self._client is None:
131
+ raise ValueError("Client is not connected, call initialize() first")
132
+ return await self._client.list_prompts()
133
+
127
134
  async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
128
135
  """Execute a tool by name."""
129
136
  if self._client is None:
@@ -143,8 +150,8 @@ class FastMCPHUDClient(BaseHUDClient):
143
150
  structuredContent=result.structured_content,
144
151
  )
145
152
 
146
- async def list_resources(self) -> list[types.Resource]:
147
- """List all available resources."""
153
+ async def _list_resources_impl(self) -> list[types.Resource]:
154
+ """Implementation of resource listing for FastMCP client."""
148
155
  if self._client is None:
149
156
  raise ValueError("Client is not connected, call initialize() first")
150
157
  return await self._client.list_resources()
hud/clients/mcp_use.py CHANGED
@@ -9,9 +9,8 @@ from urllib.parse import urlparse
9
9
 
10
10
  from mcp import Implementation, types
11
11
  from mcp.shared.exceptions import McpError
12
- from mcp_use.client import MCPClient as MCPUseClient
13
- from mcp_use.session import MCPSession as MCPUseSession
14
- from mcp_use.types.http import HttpOptions
12
+ from mcp_use.client.client import MCPClient as MCPUseClient
13
+ from mcp_use.client.session import MCPSession as MCPUseSession
15
14
  from pydantic import AnyUrl
16
15
 
17
16
  from hud.settings import settings
@@ -20,7 +19,6 @@ from hud.utils.hud_console import HUDConsole
20
19
  from hud.version import __version__ as hud_version
21
20
 
22
21
  from .base import BaseHUDClient
23
- from .utils.retry_transport import create_retry_httpx_client
24
22
 
25
23
  logger = logging.getLogger(__name__)
26
24
  hud_console = HUDConsole(logger=logger)
@@ -58,12 +56,6 @@ class MCPUseHUDClient(BaseHUDClient):
58
56
  str, tuple[str, types.Tool, types.Tool]
59
57
  ] = {} # server_name, original_tool, prefixed_tool
60
58
  self._client: Any | None = None # Will be MCPUseClient when available
61
- # Transport options for MCP-use (disable_sse_fallback, httpx_client_factory, etc.)
62
- # Default to retry-enabled HTTPX client if factory not provided
63
- self._http_options: HttpOptions = HttpOptions(
64
- httpx_client_factory=create_retry_httpx_client,
65
- disable_sse_fallback=True,
66
- )
67
59
 
68
60
  async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
69
61
  """Create all sessions for MCP-use client."""
@@ -71,6 +63,11 @@ class MCPUseHUDClient(BaseHUDClient):
71
63
  logger.warning("Client is already connected, cannot connect again")
72
64
  return
73
65
 
66
+ # Use configurable timeout for SSE read operations to support long-running tool calls.
67
+ for server_cfg in mcp_config.values():
68
+ if "sse_read_timeout" not in server_cfg:
69
+ server_cfg["sse_read_timeout"] = settings.client_timeout
70
+
74
71
  # If a server target matches HUD's MCP host and no auth is provided,
75
72
  # inject the HUD API key as a Bearer token to avoid OAuth browser flow.
76
73
  try:
@@ -88,11 +85,13 @@ class MCPUseHUDClient(BaseHUDClient):
88
85
  config = {"mcpServers": mcp_config}
89
86
  if MCPUseClient is None:
90
87
  raise ImportError("MCPUseClient is not available")
91
- self._client = MCPUseClient.from_dict(config, http_options=self._http_options)
88
+ self._client = MCPUseClient.from_dict(config)
92
89
  try:
93
- assert self._client is not None # noqa: S101
90
+ assert self._client is not None
94
91
  self._sessions = await self._client.create_all_sessions()
95
- hud_console.info(f"Created {len(self._sessions)} MCP sessions")
92
+ session_count = len(self._sessions)
93
+ session_text = "session" if session_count == 1 else "sessions"
94
+ hud_console.info(f"Created {session_count} MCP {session_text}")
96
95
 
97
96
  # Configure validation for all sessions based on client setting
98
97
  try:
@@ -241,8 +240,8 @@ class MCPUseHUDClient(BaseHUDClient):
241
240
  structuredContent=result.structuredContent,
242
241
  )
243
242
 
244
- async def list_resources(self) -> list[types.Resource]:
245
- """List all available resources."""
243
+ async def _list_resources_impl(self) -> list[types.Resource]:
244
+ """Implementation of resource listing for MCP-use client."""
246
245
  if self._client is None or not self._sessions:
247
246
  raise ValueError("Client is not connected, call initialize() first")
248
247
 
@@ -268,6 +267,32 @@ class MCPUseHUDClient(BaseHUDClient):
268
267
  continue
269
268
  return []
270
269
 
270
+ async def _list_prompts_impl(self) -> list[types.Prompt]:
271
+ """Implementation of prompt listing for MCP-use client (best-effort)."""
272
+ if self._client is None or not self._sessions:
273
+ raise ValueError("Client is not connected, call initialize() first")
274
+
275
+ all_prompts: list[types.Prompt] = []
276
+ for server_name, session in self._sessions.items():
277
+ try:
278
+ if not hasattr(session, "connector") or not hasattr(
279
+ session.connector, "client_session"
280
+ ):
281
+ continue
282
+ if session.connector.client_session is None:
283
+ continue
284
+
285
+ if not hasattr(session.connector.client_session, "list_prompts"):
286
+ continue
287
+
288
+ prompts_result = await session.connector.client_session.list_prompts()
289
+ all_prompts.extend(prompts_result.prompts)
290
+ except Exception as e:
291
+ if self.verbose:
292
+ hud_console.debug(f"Could not list prompts from server '{server_name}': {e}")
293
+ continue
294
+ return all_prompts
295
+
271
296
  async def read_resource(self, uri: str | AnyUrl) -> types.ReadResourceResult | None:
272
297
  """Read a resource by URI from any server that provides it."""
273
298
  if self._client is None or not self._sessions:
@@ -0,0 +1,206 @@
1
+ """Tests for scenario discovery via prompts/resources in analyze_environment()."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ import pytest
8
+ from mcp import types
9
+ from pydantic import AnyUrl
10
+
11
+ from hud.clients.base import BaseHUDClient
12
+
13
+ if TYPE_CHECKING:
14
+ from hud.types import MCPToolCall, MCPToolResult
15
+
16
+
17
class _MockClient(BaseHUDClient):
    """Bare-bones BaseHUDClient that serves canned prompts and resources.

    Used to exercise scenario derivation in ``analyze_environment()``.
    """

    def __init__(
        self,
        *,
        prompts: list[types.Prompt],
        resources: list[types.Resource],
    ) -> None:
        super().__init__(mcp_config={"test": {"url": "mock://test"}}, verbose=True)
        self._mock_resources = resources
        self._mock_prompts = prompts
        # Mark the client ready without calling initialize() (which fetches
        # telemetry); only analyze_environment() is needed here.
        self._initialized = True

    async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:  # pragma: no cover
        return

    async def list_tools(self) -> list[types.Tool]:
        """Report no tools."""
        return []

    async def _list_resources_impl(self) -> list[types.Resource]:
        """Return the canned resources passed to the constructor."""
        return self._mock_resources

    async def _list_prompts_impl(self) -> list[types.Prompt]:
        """Return the canned prompts passed to the constructor."""
        return self._mock_prompts

    async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:  # pragma: no cover
        raise NotImplementedError

    async def read_resource(self, uri: str) -> types.ReadResourceResult | None:  # pragma: no cover
        return None

    async def _disconnect(self) -> None:  # pragma: no cover
        return
52
+
53
+
54
@pytest.mark.asyncio
async def test_analyze_environment_derives_scenarios_from_scenario_prompt_and_resource() -> None:
    """A setup prompt and an evaluate resource with the same id form one scenario."""
    setup_prompt = types.Prompt(
        name="my-env:checkout",
        description="[Setup] Checkout flow",
        arguments=[],
    )
    evaluate_resource = types.Resource(
        uri=AnyUrl("my-env:checkout"),
        name="checkout",
        description="[Evaluate] Checkout flow",
    )

    mock = _MockClient(prompts=[setup_prompt], resources=[evaluate_resource])
    report = await mock.analyze_environment()

    assert "scenarios" in report
    assert len(report["scenarios"]) == 1
    found = report["scenarios"][0]
    assert found["id"] == "my-env:checkout"
    assert found["env"] == "my-env"
    assert found["name"] == "checkout"
    assert found["has_setup_prompt"] is True
    assert found["has_evaluate_resource"] is True
82
+
83
+
84
@pytest.mark.asyncio
async def test_analyze_environment_scenario_from_setup_only() -> None:
    """A lone ``[Setup]`` prompt yields a scenario flagged as lacking an evaluate resource."""
    setup_prompt = types.Prompt(
        name="env-x:only_setup",
        description="[Setup] Setup only scenario",
        arguments=[],
    )
    no_resources: list[types.Resource] = []

    mock = _MockClient(prompts=[setup_prompt], resources=no_resources)
    report = await mock.analyze_environment()

    assert len(report["scenarios"]) == 1
    found = report["scenarios"][0]
    assert found["id"] == "env-x:only_setup"
    assert found["has_setup_prompt"] is True
    assert found["has_evaluate_resource"] is False
103
+
104
+
105
@pytest.mark.asyncio
async def test_analyze_environment_scenario_from_evaluate_only() -> None:
    """A lone ``[Evaluate]`` resource yields a scenario flagged as lacking a setup prompt."""
    no_prompts: list[types.Prompt] = []
    evaluate_resource = types.Resource(
        uri=AnyUrl("env-y:only_eval"),
        name="only_eval",
        description="[Evaluate] Evaluate only scenario",
    )

    mock = _MockClient(prompts=no_prompts, resources=[evaluate_resource])
    report = await mock.analyze_environment()

    assert len(report["scenarios"]) == 1
    found = report["scenarios"][0]
    assert found["id"] == "env-y:only_eval"
    assert found["has_setup_prompt"] is False
    assert found["has_evaluate_resource"] is True
124
+
125
+
126
@pytest.mark.asyncio
async def test_analyze_environment_extracts_scenario_code_from_meta() -> None:
    """Scenario source code carried in the ``_meta`` field is surfaced on the scenario."""
    # NOTE(review): the diff rendering strips leading whitespace, so the inner
    # indentation of this snippet is not visible here - confirm against the release.
    scenario_code = """@env.scenario()
async def checkout(product_id: str):
await env.call_tool("navigate", url="/checkout")
yield "Complete the checkout"
result = await env.call_tool("check_order")
yield 1.0 if result else 0.0
"""
    # Validate from raw dicts so the "_meta" alias (Pydantic alias for the
    # meta field) can be supplied.
    prompt_payload = {
        "name": "my-env:checkout",
        "description": "[Setup] Checkout flow",
        "arguments": [{"name": "product_id", "required": True}],
        "_meta": {"code": scenario_code},
    }
    resource_payload = {
        "uri": "my-env:checkout",
        "name": "checkout",
        "description": "[Evaluate] Checkout flow",
        "_meta": {"code": scenario_code},
    }

    mock = _MockClient(
        prompts=[types.Prompt.model_validate(prompt_payload)],
        resources=[types.Resource.model_validate(resource_payload)],
    )
    report = await mock.analyze_environment()

    assert len(report["scenarios"]) == 1
    found = report["scenarios"][0]
    assert found["id"] == "my-env:checkout"
    assert "code" in found
    assert found["code"] == scenario_code
    assert "async def checkout" in found["code"]
167
+
168
+
169
@pytest.mark.asyncio
async def test_analyze_environment_extracts_meta_on_prompts_and_resources() -> None:
    """The ``meta`` payload is carried through on analyzed prompts and resources."""
    meta_data = {"code": "test code", "extra": "value"}
    # Validate from raw dicts so the "_meta" alias (Pydantic alias for the
    # meta field) can be supplied.
    prompt_payload = {
        "name": "test-prompt",
        "description": "A test prompt",
        "arguments": [],
        "_meta": meta_data,
    }
    resource_payload = {
        "uri": "file:///test",
        "name": "test-resource",
        "description": "A test resource",
        "_meta": meta_data,
    }

    mock = _MockClient(
        prompts=[types.Prompt.model_validate(prompt_payload)],
        resources=[types.Resource.model_validate(resource_payload)],
    )
    report = await mock.analyze_environment()

    # Prompts expose the meta payload
    analyzed_prompts = report["prompts"]
    assert len(analyzed_prompts) == 1
    assert "meta" in analyzed_prompts[0]
    assert analyzed_prompts[0]["meta"] == meta_data

    # Resources expose the meta payload
    analyzed_resources = report["resources"]
    assert len(analyzed_resources) == 1
    assert "meta" in analyzed_resources[0]
    assert analyzed_resources[0]["meta"] == meta_data
@@ -35,9 +35,15 @@ class MockClient(BaseHUDClient):
35
35
  raise RuntimeError("Not connected")
36
36
  return self._mock_tools
37
37
 
38
- async def list_resources(self) -> list[types.Resource]:
39
- """Minimal list_resources for protocol satisfaction in tests."""
40
- return []
38
async def _list_resources_impl(self) -> list[types.Resource]:
    """Return a single canned ``telemetry://live`` resource for tests."""
    from pydantic import AnyUrl

    telemetry_resource = types.Resource(
        uri=AnyUrl("telemetry://live"), name="telemetry", description="Live telemetry data"
    )
    return [telemetry_resource]
41
47
 
42
48
  async def _call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
43
49
  if tool_call.name == "test_tool":
hud/datasets/__init__.py CHANGED
@@ -1,33 +1,36 @@
1
1
  """HUD datasets module.
2
2
 
3
- Provides data models, utilities, and execution functions for working with HUD datasets.
3
+ Provides unified task loading, saving, and execution for HUD evaluations.
4
+
5
+ Key functions:
6
+ - load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API
7
+ - save_tasks(): Save tasks to the HUD API
8
+ - run_dataset(): Run an agent on a dataset of tasks
9
+ - submit_rollouts(): Submit tasks for remote execution
10
+
11
+ Supports both v4 (LegacyTask) and v5 (Task) formats with automatic conversion.
4
12
  """
5
13
 
6
- # Data models
7
- # Execution functions
8
14
  from __future__ import annotations
9
15
 
10
- from hud.types import Task
16
+ from hud.eval.display import display_results
11
17
 
12
- from .parallel import (
13
- calculate_optimal_workers,
14
- run_dataset_parallel,
15
- run_dataset_parallel_manual,
18
+ from .loader import load_dataset, load_tasks, save_tasks
19
+ from .runner import run_dataset, run_single_task
20
+ from .utils import (
21
+ BatchRequest,
22
+ SingleTaskRequest,
23
+ submit_rollouts,
16
24
  )
17
- from .runner import run_dataset
18
-
19
- # Utilities
20
- from .utils import fetch_system_prompt_from_dataset, save_tasks
21
25
 
22
26
  __all__ = [
23
- # Core data model
24
- "Task",
25
- "calculate_optimal_workers",
26
- # Utilities
27
- "fetch_system_prompt_from_dataset",
28
- # Execution
27
+ "BatchRequest",
28
+ "SingleTaskRequest",
29
+ "display_results",
30
+ "load_dataset", # Deprecated alias
31
+ "load_tasks",
29
32
  "run_dataset",
30
- "run_dataset_parallel",
31
- "run_dataset_parallel_manual",
33
+ "run_single_task",
32
34
  "save_tasks",
35
+ "submit_rollouts",
33
36
  ]