hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,694 @@
1
+ """Environment class - unified MCP server and client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from collections.abc import Awaitable, Callable
8
+ from typing import TYPE_CHECKING, Any, Literal, Self
9
+
10
+ import mcp.types as mcp_types
11
+
12
+ from hud.environment.connectors import ConnectorsMixin
13
+ from hud.environment.integrations import IntegrationsMixin
14
+ from hud.environment.mock import MockMixin
15
+ from hud.environment.router import ConflictResolution, ToolRouter
16
+ from hud.environment.scenarios import ScenarioMixin
17
+ from hud.server.server import MCPServer
18
+ from hud.types import MCPToolResult
19
+
20
+ if TYPE_CHECKING:
21
+ import types
22
+
23
+ from hud.environment.connection import Connector
24
+ from hud.eval.task import Task
25
+
26
+ __all__ = ["Environment"]
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # Suppress verbose fastmcp logging
31
+ logging.getLogger("fastmcp.server.server").setLevel(logging.WARNING)
32
+ logging.getLogger("fastmcp.server.openapi").setLevel(logging.WARNING)
33
+
34
+ # Type alias for async callables (no-arg functions that return awaitable)
35
+ AsyncCallable = Callable[[], Awaitable[Any]]
36
+
37
+
38
+ class Environment(
39
+ ConnectorsMixin,
40
+ IntegrationsMixin,
41
+ MockMixin,
42
+ ScenarioMixin,
43
+ MCPServer,
44
+ ):
45
+ """Unified MCP environment that acts as both server and client.
46
+
47
+ Features:
48
+ - Define local tools with @env.tool decorator
49
+ - Connect to HUD Hub, URLs, or mcp_config dicts
50
+ - Automatic tool routing (local vs remote)
51
+ - Format tools for any LLM provider
52
+ - Integrate with popular agent frameworks
53
+ - Mock mode for testing without real connections
54
+
55
+ Connector methods (connect to sources):
56
+ connect_hub(name) - HUD Hub environment
57
+ connect_url(url) - MCP server via URL
58
+ connect_mcp(config) - Single mcp_config server
59
+ connect_mcp_config(mcp_config) - Multiple mcp_config servers
60
+ connect_image(image) - Docker image via stdio
61
+ connect_fastapi(app) - Mount FastAPI app as MCP server
62
+ connect_openapi(spec) - Mount OpenAPI spec as MCP server
63
+ connect_server(server) - Mount MCPServer/FastMCP directly
64
+
65
+ Mock methods (for testing):
66
+ mock() - Enable mock mode, all tools return mock values
67
+ unmock() - Disable mock mode
68
+ mock_tool(name, output) - Set specific mock output for a tool
69
+ is_mock - Check if mock mode is enabled
70
+
71
+ OpenAI integrations:
72
+ as_openai_chat_tools() - Chat Completions format
73
+ as_openai_responses_tools() - Responses API format
74
+ as_openai_agent_tools() - Agents SDK (requires openai-agents)
75
+
76
+ Anthropic/Claude integrations:
77
+ as_claude_tools() - Claude API format
78
+ as_claude_programmatic_tools() - Programmatic tool use
79
+ as_anthropic_runner() - Tool runner (requires anthropic)
80
+
81
+ Google/Gemini integrations:
82
+ as_gemini_tools() - Gemini format
83
+ as_gemini_tool_config() - Tool execution config
84
+
85
+ LangChain integrations:
86
+ as_langchain_tools() - StructuredTools (requires langchain-core)
87
+
88
+ Example:
89
+ ```python
90
+ env = Environment("my-env")
91
+
92
+
93
+ @env.tool
94
+ def greet(name: str) -> str:
95
+ return f"Hello, {name}!"
96
+
97
+
98
+ env.connect_hub("browser", prefix="browser")
99
+
100
+ async with env:
101
+ # Get tools in any format
102
+ openai_tools = env.as_openai_chat_tools()
103
+ claude_tools = env.as_claude_tools()
104
+
105
+ # Call tools - automatically routed
106
+ result = await env.call_tool("greet", name="World")
107
+
108
+ # Or pass provider-specific format - auto-detected
109
+ result = await env.call_tool(response.choices[0].message.tool_calls[0])
110
+
111
+ # Mock mode for testing
112
+ env.mock()
113
+ env.mock_tool("browser_navigate", "Navigation successful")
114
+ async with env:
115
+ result = await env.call_tool("browser_navigate", url="https://example.com")
116
+ # Returns mock value instead of actually navigating
117
+ ```
118
+ """
119
+
120
+ MAX_CONCURRENT_CONNECTIONS = 10
121
+
122
def __init__(
    self,
    name: str = "environment",
    instructions: str | None = None,
    conflict_resolution: ConflictResolution = ConflictResolution.PREFIX,
    **fastmcp_kwargs: Any,
) -> None:
    """Create an environment with no connections and empty lifecycle queues.

    Args:
        name: Forwarded to the parent initializer as ``name``.
        instructions: Forwarded to the parent initializer as ``instructions``.
        conflict_resolution: Strategy handed to the ``ToolRouter``.
        **fastmcp_kwargs: Extra keyword arguments forwarded to the parent
            initializer unchanged.
    """
    super().__init__(name=name, instructions=instructions, **fastmcp_kwargs)

    # Connection registry and the router built from it on context entry.
    self._connections: dict[str, Connector] = {}
    self._router = ToolRouter(conflict_resolution=conflict_resolution)
    self._in_context = False

    # Queued (tool name, arguments) pairs: setup calls run once connections
    # are established, evaluate calls run on context exit.
    self._setup_calls: list[tuple[str, dict[str, Any]]] = []
    self._evaluate_calls: list[tuple[str, dict[str, Any]]] = []

    # Default prompt (EvalContext has a per-run prompt).
    self.prompt: str | None = None

    # Serialization support:
    #   _hub_config - set by connect_hub() for v5 format {"name": "hub", "include": [...]}
    #   _mcp_config - set by connect_mcp_config() for v4 format {"server_name": {...}}
    self._hub_config: dict[str, Any] | None = None
    self._mcp_config: dict[str, dict[str, Any]] | None = None

    # Agent-level tool filtering, applied in as_tools() rather than at the
    # connection level, so the Environment can still call every tool while
    # the agent only sees a subset.
    self._agent_include: list[str] | None = None
    self._agent_exclude: list[str] | None = None

    # Mixin state: mock first, then scenarios.
    self._init_mock()
    self._init_scenarios()
157
+
158
+ # =========================================================================
159
+ # Core Methods
160
+ # =========================================================================
161
+
162
def as_tools(self) -> list[mcp_types.Tool]:
    """Return the router's tools in MCP format, honouring agent-level filters.

    When neither ``_agent_include`` nor ``_agent_exclude`` is set, the
    router's own list is returned as-is. Otherwise a new list is built that
    keeps a tool only if its name is in the include list (when one is set)
    and not in the exclude list (when one is set).
    """
    routed = self._router.tools
    allowed = self._agent_include
    blocked = self._agent_exclude

    # No agent-level filter configured (v4 allowed_tools/disallowed_tools).
    if allowed is None and blocked is None:
        return routed

    return [
        tool
        for tool in routed
        if (allowed is None or tool.name in allowed)
        and (blocked is None or tool.name not in blocked)
    ]
183
+
184
async def call_tool(self, call: Any, /, **kwargs: Any) -> Any:
    """Run one tool call given in any supported format and mirror that format back.

    Accepted inputs:
        - String with kwargs: call_tool("navigate", url="...")
        - Tuple: call_tool(("navigate", {"url": "..."}))
        - MCPToolCall: call_tool(MCPToolCall(name="navigate", ...))
        - OpenAI: call_tool(response.choices[0].message.tool_calls[0])
        - Claude: call_tool(response.content[0])  # tool_use block
        - Gemini: call_tool(response.candidates[0].content.parts[0])

    Returns:
        The result formatted to match the input format
        (OpenAI -> OpenAI tool message, etc.).
    """
    from hud.environment.utils import format_result, parse_tool_call

    # parse_tool_call normalises the input and reports which format it saw;
    # kwargs are merged when ``call`` is a plain string.
    parsed_call, detected_format = parse_tool_call(call, **kwargs)
    call_arguments = parsed_call.arguments or {}
    raw_result = await self._execute_tool(parsed_call.name, call_arguments)
    return format_result(raw_result, parsed_call, detected_format)
204
+
205
def _connections_with_tool(self, tool_name: str) -> set[str]:
    """Return the names of connections whose cached tools include ``tool_name``.

    Availability is judged from each connector's ``cached_tools``.
    """
    return {
        conn_name
        for conn_name, connector in self._connections.items()
        if any(tool.name == tool_name for tool in connector.cached_tools)
    }
216
+
217
async def _broadcast_tool(
    self,
    tool_name: str,
    **kwargs: Any,
) -> dict[str, Any]:
    """Broadcast a tool call to every connection that has the tool.

    Only connections whose ``cached_tools`` contain ``tool_name`` are
    targeted (see ``_connections_with_tool``). Targets with no connector
    entry or no client are skipped and do not appear in the result.

    Args:
        tool_name: Name of the tool to call.
        **kwargs: Passed through to each client's ``call_tool``.

    Returns:
        Dict mapping connection name to that connection's result, or to the
        exception it raised.
    """
    # Uses the module-level ``asyncio`` import; no function-local re-import.
    targets = self._connections_with_tool(tool_name)
    if not targets:
        return {}

    results: dict[str, Any] = {}

    async def call_one(name: str) -> None:
        connector = self._connections.get(name)
        if not connector or not connector.client:
            return
        try:
            results[name] = await connector.client.call_tool(tool_name, **kwargs)
            logger.debug("Broadcast '%s' to '%s' succeeded", tool_name, name)
        except Exception as e:
            # Record the failure per connection instead of aborting the broadcast.
            results[name] = e
            logger.debug("Broadcast '%s' to '%s' failed: %s", tool_name, name, e)

    await asyncio.gather(*[call_one(n) for n in targets], return_exceptions=True)
    return results
256
+
257
async def call_tools(self, calls: Any) -> list[Any]:
    """Run several tool calls and return their results in matching formats.

    ``None`` yields an empty list and a non-list value is treated as a single
    call. For a list, entries whose ``type`` (dict key or attribute) is
    missing, ``"tool_use"`` or ``"function"`` are executed concurrently;
    everything else (text blocks, etc.) is skipped.
    """
    if calls is None:
        return []
    if not isinstance(calls, list):
        return [await self.call_tool(calls)]

    def entry_type(entry: Any) -> Any:
        if isinstance(entry, dict):
            return entry.get("type")
        return getattr(entry, "type", None)

    runnable = []
    for entry in calls:
        kind = entry_type(entry)
        if kind is None or kind in ("tool_use", "function"):
            runnable.append(entry)

    return await asyncio.gather(*(self.call_tool(entry) for entry in runnable))
272
+
273
+ # =========================================================================
274
+ # Lifecycle Configuration
275
+ # =========================================================================
276
+
277
def setup_tool(self, call: Any, /, **kwargs: Any) -> Environment:
    """Queue a tool call to execute after connections are established.

    A string name given together with kwargs is queued directly; any other
    input is normalised through ``parse_tool_call`` first. Returns ``self``
    so calls can be chained.
    """
    from hud.environment.utils import parse_tool_call

    if not (isinstance(call, str) and kwargs):
        parsed_call, _fmt = parse_tool_call(call)
        queued = (parsed_call.name, parsed_call.arguments or {})
    else:
        queued = (call, kwargs)

    self._setup_calls.append(queued)
    return self
287
+
288
def evaluate_tool(self, call: Any, /, **kwargs: Any) -> Environment:
    """Queue a tool call to execute before disconnecting.

    A string name given together with kwargs is queued directly; any other
    input is normalised through ``parse_tool_call`` first. Returns ``self``
    so calls can be chained.
    """
    from hud.environment.utils import parse_tool_call

    if not (isinstance(call, str) and kwargs):
        parsed_call, _fmt = parse_tool_call(call)
        queued = (parsed_call.name, parsed_call.arguments or {})
    else:
        queued = (call, kwargs)

    self._evaluate_calls.append(queued)
    return self
298
+
299
+ # =========================================================================
300
+ # Context Manager
301
+ # =========================================================================
302
+
303
async def __aenter__(self) -> Self:
    """Connect all connectors, build routing, then run queued setup tools.

    Connections are opened concurrently, at most
    ``MAX_CONCURRENT_CONNECTIONS`` at a time, and each one lists its tools
    right after connecting.

    If entry fails at any stage (connecting, building routing, or running a
    setup tool), every connection that did connect is disconnected, the
    router is cleared and the environment is marked as not in context before
    the error propagates. ``__aexit__`` is not invoked when ``__aenter__``
    raises, so this is the only chance to release those connections.

    Raises:
        ConnectionError: If any connector fails to connect or list tools;
            chained from the first recorded failure.
    """
    self._in_context = True

    async def rollback() -> None:
        # Best-effort teardown: a failing disconnect must not mask the error
        # that triggered the rollback.
        for conn in self._connections.values():
            if not conn.is_connected:
                continue
            try:
                await conn.disconnect()
            except Exception as exc:
                logger.warning("Disconnect during failed startup raised: %s", exc)
        self._router.clear()
        self._in_context = False

    # Connect to all servers (on_connect callbacks run first within connect())
    sem = asyncio.Semaphore(self.MAX_CONCURRENT_CONNECTIONS)
    errors: list[tuple[str, Exception]] = []

    async def connect_one(name: str, conn: Connector) -> None:
        async with sem:
            try:
                await conn.connect()
                await conn.list_tools()
            except Exception as e:
                errors.append((name, e))

    if self._connections:
        await asyncio.gather(*[connect_one(n, c) for n, c in self._connections.items()])
        if errors:
            await rollback()
            name, err = errors[0]
            str_err = str(err).replace("Client failed to connect: ", "")  # Strip from FastMCP
            raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err

    try:
        await self._build_routing()

        # Setup tool calls (after connections)
        for name, args in self._setup_calls:
            await self._execute_tool(name, args)
    except BaseException:
        # Includes cancellation: release connections, then re-raise unchanged.
        await rollback()
        raise

    return self
336
+
337
async def __aexit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: types.TracebackType | None,
) -> None:
    """Run queued evaluate tools, store their average reward, then disconnect.

    Each evaluate tool runs in order; a failing one is logged and skipped.
    The mean of the rewards extracted by ``find_reward`` is stored in
    ``_evaluate_reward`` (``None`` when no reward was collected).

    Teardown runs in ``finally`` so connections are released even if the
    evaluate phase is interrupted (for example by cancellation). All
    disconnects are awaited to completion before the first disconnect error,
    if any, is re-raised, and the router is cleared in every case.
    """
    from hud.agents.base import find_reward

    try:
        # Evaluate tool calls and collect rewards
        rewards: list[float] = []
        for name, args in self._evaluate_calls:
            try:
                result = await self._execute_tool(name, args)
                rewards.append(find_reward(result))
            except Exception as e:
                logger.warning("Evaluate tool %s failed: %s", name, e)

        # Store average reward from evaluate tools
        self._evaluate_reward: float | None = None
        if rewards:
            self._evaluate_reward = sum(rewards) / len(rewards)
    finally:
        self._in_context = False
        try:
            if self._connections:
                # return_exceptions=True lets every disconnect finish before
                # any failure is reported.
                outcomes = await asyncio.gather(
                    *[c.disconnect() for c in self._connections.values()],
                    return_exceptions=True,
                )
                failure = next((o for o in outcomes if isinstance(o, BaseException)), None)
                if failure is not None:
                    raise failure
        finally:
            self._router.clear()
364
+
365
async def _build_routing(self) -> None:
    """Rebuild the tool router from local tools and the current connections.

    Local tools come from ``_tool_manager.get_tools()`` rather than
    ``list_tools()``: it includes mounted servers without requiring MCP
    server communication (via_server=False). Mock schemas are repopulated
    afterwards so auto-generated mock values match the routed tools.
    """
    registered = await self._tool_manager.get_tools()
    mcp_tools = [tool.to_mcp_tool() for tool in registered.values()]

    self._router.build(
        local_tools=mcp_tools,
        connections=self._connections,
        connection_order=list(self._connections),
    )
    self._populate_mock_schemas()
378
+
379
+ # =========================================================================
380
+ # Tool Operations
381
+ # =========================================================================
382
+
383
async def list_tools(self) -> list[mcp_types.Tool]:
    """Re-list tools on every connection, rebuild routing, and return the routed tools."""
    refreshes = [conn.list_tools() for conn in self._connections.values()]
    if refreshes:
        await asyncio.gather(*refreshes)

    await self._build_routing()
    return self._router.tools
389
+
390
async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
    """Execute a tool by name, routing to the local or remote handler.

    In mock mode the mock result is returned and nothing is executed.
    Otherwise a tool the router marks as local goes through the tool manager,
    and any other tool goes to the connection the router maps it to.

    Raises:
        ValueError: If the router knows no local tool or connection for ``name``.
    """
    # Mock mode wins over any routing.
    if self._mock_mode:
        logger.debug("Mock mode: returning mock result for tool %s", name)
        return self._get_mock_result(name, arguments)

    if self._router.is_local(name):
        # Call tool manager directly to avoid FastMCP context requirement
        local_result = await self._tool_manager.call_tool(name, arguments)
        return MCPToolResult(
            content=local_result.content,
            structuredContent=local_result.structured_content,
        )

    target = self._router.get_connection(name)
    if not target:
        raise ValueError(f"Tool not found: {name}")

    remote_result = await self._connections[target].call_tool(name, arguments)
    return MCPToolResult(
        content=remote_result.content,
        isError=remote_result.isError,
        structuredContent=remote_result.structuredContent,
    )
419
+
420
+ # =========================================================================
421
+ # Resource Operations
422
+ # =========================================================================
423
+
424
async def list_resources(self) -> list[mcp_types.Resource]:
    """List all resources: local ones first, then those of each connection.

    Remote listings run concurrently. A connection whose listing raises (or
    returns something other than a list) contributes nothing.
    """
    local_map = await self._resource_manager.get_resources()
    collected: list[mcp_types.Resource] = [
        res.to_mcp_resource() for res in local_map.values()
    ]
    if not self._connections:
        return collected

    remote_batches = await asyncio.gather(
        *(conn.list_resources() for conn in self._connections.values()),
        return_exceptions=True,
    )
    for batch in remote_batches:
        if isinstance(batch, list):
            collected.extend(batch)
    return collected
438
+
439
async def read_resource(
    self, uri: str
) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
    """Read a resource by URI, trying the local manager first, then each connection.

    A local ``str`` result is wrapped as text contents; any other local
    result is base64-encoded and wrapped as blob contents. Failures at each
    stage are logged at debug level and the next source is tried.

    Raises:
        ValueError: If neither the local manager nor any connection can
            provide the resource.
    """
    import base64

    from pydantic import AnyUrl

    try:
        payload = await self._resource_manager.read_resource(uri)
        parsed_uri = AnyUrl(uri)
        if isinstance(payload, str):
            contents = mcp_types.TextResourceContents(uri=parsed_uri, text=payload)
        else:
            encoded = base64.b64encode(payload).decode()
            contents = mcp_types.BlobResourceContents(uri=parsed_uri, blob=encoded)
        return [contents]
    except Exception as local_err:
        logger.debug("Local resource read failed for %s: %s", uri, local_err)

    # Fall back to the connections, in registration order.
    for conn in self._connections.values():
        try:
            return await conn.read_resource(uri)
        except Exception as remote_err:
            logger.debug("Remote resource read failed for %s: %s", uri, remote_err)

    raise ValueError(f"Resource not found: {uri}")
468
+
469
+ # =========================================================================
470
+ # Prompt Operations
471
+ # =========================================================================
472
+
473
async def list_prompts(self) -> list[mcp_types.Prompt]:
    """List all prompts: local ones first, then those of each connection.

    Remote listings run concurrently. A connection whose listing raises (or
    returns something other than a list) contributes nothing.
    """
    local_map = await self._prompt_manager.get_prompts()
    collected: list[mcp_types.Prompt] = [
        prompt.to_mcp_prompt() for prompt in local_map.values()
    ]
    if not self._connections:
        return collected

    remote_batches = await asyncio.gather(
        *(conn.list_prompts() for conn in self._connections.values()),
        return_exceptions=True,
    )
    for batch in remote_batches:
        if isinstance(batch, list):
            collected.extend(batch)
    return collected
487
+
488
async def get_prompt(
    self, name: str, arguments: dict[str, Any] | None = None
) -> mcp_types.GetPromptResult:
    """Get a prompt by name, trying the local manager first, then each connection.

    The local manager receives ``{}`` when ``arguments`` is falsy, while
    connections receive ``arguments`` as given. Failures at each stage are
    logged at debug level and the next source is tried.

    Raises:
        ValueError: If neither the local manager nor any connection can
            provide the prompt.
    """
    local_arguments = arguments or {}
    try:
        return await self._prompt_manager.render_prompt(name, local_arguments)
    except Exception as local_err:
        logger.debug("Local prompt render failed for %s: %s", name, local_err)

    # Fall back to the connections, in registration order.
    for conn in self._connections.values():
        try:
            return await conn.get_prompt(name, arguments)
        except Exception as remote_err:
            logger.debug("Remote prompt get failed for %s: %s", name, remote_err)

    raise ValueError(f"Prompt not found: {name}")
505
+
506
+ # =========================================================================
507
+ # Server Methods
508
+ # =========================================================================
509
+
510
def serve(
    self,
    transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
    host: str = "0.0.0.0",  # noqa: S104
    port: int = 8000,
    **kwargs: Any,
) -> None:
    """Start serving as an MCP server by delegating to ``self.run``.

    Args:
        transport: Transport name passed to ``run``.
        host: Host passed to ``run``.
        port: Port passed to ``run``.
        **kwargs: Additional keyword arguments passed to ``run`` unchanged.
    """
    run_options: dict[str, Any] = {"transport": transport, "host": host, "port": port}
    self.run(**run_options, **kwargs)
519
+
520
+ # =========================================================================
521
+ # Properties
522
+ # =========================================================================
523
+
524
@property
def connections(self) -> dict[str, Connector]:
    """The live name-to-connector mapping (the internal dict itself, not a copy)."""
    registry = self._connections
    return registry
527
+
528
@property
def is_connected(self) -> bool:
    """Value of the ``_in_context`` flag maintained by the async context manager."""
    inside_context = self._in_context
    return inside_context
531
+
532
@property
def is_parallelizable(self) -> bool:
    """True if all connections are remote (can spawn multiple instances).

    With no connections at all (local tools only) this is also True, since
    ``all()`` over an empty iterable is True.
    """
    return all(conn.is_remote for conn in self._connections.values())
538
+
539
@property
def local_connections(self) -> list[str]:
    """Names of local (non-parallelizable) connections.

    Names appear in the same order as in the connection mapping.
    """
    local_names: list[str] = []
    for conn_name, connector in self._connections.items():
        if connector.is_local:
            local_names.append(conn_name)
    return local_names
543
+
544
+ # =========================================================================
545
+ # Serialization
546
+ # =========================================================================
547
+
548
@property
def is_serializable(self) -> bool:
    """True if environment can be serialized (no local tools/scenarios).

    For v5 format: requires hub config from connect_hub()
    For v4 format: requires mcp_config, prompt, AND evaluate_tool
    """
    # Anything registered locally (@env.tool / @env.scenario) cannot be
    # serialized; `_scenarios` may be absent, hence the getattr default.
    if self._router._local_names or getattr(self, "_scenarios", {}):
        return False

    # v5 hub format: the hub config alone is enough.
    if self._hub_config is not None:
        return True

    # Without an mcp_config there is nothing left to serialize.
    if self._mcp_config is None:
        return False

    # v4 format requires mcp_config + prompt + evaluate_tool
    return bool(self.prompt and self._evaluate_calls)
568
+
569
def to_config(self) -> dict[str, Any]:
    """Serialize environment config for remote submission.

    Returns the config in either v5 format (hub-based) or v4 format (legacy).
    For v4 format, automatically includes prompt, setup_tool, and evaluate_tool
    from the Environment's state.

    The returned dict is a deep copy: mutating it (including nested lists
    such as ``include``) never alters the Environment's own state.

    Returns:
        dict: Serializable config

    Raises:
        ValueError: If environment has local tools/scenarios that can't be
            serialized, if a v4 environment lacks a prompt or evaluate_tool,
            or if no hub/mcp config has been set at all.

    Example:
        ```python
        # v5 hub-based
        env = Environment("my").connect_hub("browser", include=["navigate"])
        env.to_config()  # {"name": "browser", "include": ["navigate"]}

        # v4 legacy (from Task.from_v4())
        task = Task.from_v4(legacy_task)
        task.env.to_config()  # {"prompt": "...", "mcp_config": {...}, ...}
        ```
    """
    # Local import keeps this method self-contained.
    import copy

    if self._router._local_names:
        raise ValueError(
            f"Cannot serialize Environment with local tools: "
            f"{list(self._router._local_names)}. "
            "Local tools require local execution. For remote submission, "
            "use dict config or connect to a remote hub."
        )
    # `_scenarios` may be absent, hence the getattr default.
    if getattr(self, "_scenarios", {}):
        raise ValueError(
            f"Cannot serialize Environment with local scenarios: "
            f"{list(self._scenarios.keys())}. "
            "Local scenarios require local execution. For remote submission, "
            "define scenarios on the remote environment."
        )

    # v5 hub-based format
    if self._hub_config is not None:
        # Deep copy: a shallow copy would still share nested lists/dicts.
        return copy.deepcopy(self._hub_config)

    # v4 legacy format - requires mcp_config, prompt, AND evaluate_tool
    if self._mcp_config is not None:
        # Validate required fields for v4 format
        if not self.prompt:
            raise ValueError(
                "Cannot serialize v4 Environment without prompt. "
                "Set env.prompt before serializing."
            )
        if not self._evaluate_calls:
            raise ValueError(
                "Cannot serialize v4 Environment without evaluate_tool. "
                "Use env.evaluate_tool() to define evaluation criteria."
            )

        config: dict[str, Any] = {
            "prompt": self.prompt,
            "mcp_config": copy.deepcopy(self._mcp_config),
            "evaluate_tool": [
                {"name": name, "arguments": copy.deepcopy(args)}
                for name, args in self._evaluate_calls
            ],
        }
        # setup_tool is optional and only emitted when setup calls exist.
        if self._setup_calls:
            config["setup_tool"] = [
                {"name": name, "arguments": copy.deepcopy(args)}
                for name, args in self._setup_calls
            ]
        return config

    raise ValueError(
        "Cannot serialize Environment without config. "
        "Use connect_hub() for v5 tasks or connect_mcp_config() for legacy tasks."
    )
643
+
644
def __repr__(self) -> str:
    """Return a debug string with the environment name and connection names."""
    connection_names = list(self._connections.keys())
    return f"Environment({self.name!r}, connections={connection_names})"
646
+
647
+ # =========================================================================
648
+ # Task Creation
649
+ # =========================================================================
650
+
651
def __call__(
    self,
    scenario: str | None = None,
    **args: Any,
) -> Task:
    """Build a Task bound to this environment.

    The resulting Task can be handed to hud.eval() for orchestration.

    Args:
        scenario: Name of the scenario to run (from @env.scenario).
            Optional for v4 legacy.
        **args: Keyword arguments for the scenario, stored on the Task as
            ``args``.

    Returns:
        Task: A runnable evaluation unit

    Example:
        ```python
        env = Environment("my-env").connect_hub("browser")


        @env.scenario()
        async def checkout(user_id: str):
            yield "Complete checkout"
            yield 1.0


        # Single task via hud.eval
        async with hud.eval(env("checkout", user_id="alice")) as ctx:
            await agent.run(ctx.prompt)

        # Multiple tasks with variants
        tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
        async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
            ...
        ```
    """
    # Imported at call time. NOTE(review): presumably to avoid a circular
    # import between the environment and eval modules; confirm.
    from hud.eval.task import Task

    return Task(env=self, scenario=scenario, args=args)