hud_python-0.4.45-py3-none-any.whl → hud_python-0.5.13-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/environment/scenarios.py
@@ -0,0 +1,620 @@
+"""Scenario decorator for Environment - defines setup/evaluate phases."""
+
+from __future__ import annotations
+
+import inspect
+import json
+import logging
+from typing import TYPE_CHECKING, Any, get_type_hints
+
+from pydantic import BaseModel, ConfigDict
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Callable
+
+    from fastmcp.prompts import PromptManager
+    from fastmcp.resources import ResourceManager
+    from fastmcp.tools import ToolManager
+
+__all__ = ["ScenarioMixin", "ScenarioSession"]
+
+logger = logging.getLogger(__name__)
+
+
+class ScenarioSession(BaseModel):
+    """Tracks an active scenario from setup through evaluate.
+
+    Created during run_scenario_setup(), used by submit() and run_scenario_evaluate().
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    local_name: str  # Canonical short name (e.g., "investigate")
+    full_name: str  # Full name as called (e.g., "sentry-agent:investigate")
+    is_local: bool  # True if running locally (generator exists)
+    connection_name: str | None  # Which connection served it (if remote)
+    resource_uri: str  # Full URI for reading evaluation result
+    generator: Any | None = None  # AsyncGenerator (if local) - Any to avoid validation issues
+    answer: str | None = None  # Submitted answer
+
+
+class ScenarioMixin:
+    """Mixin providing @env.scenario decorator for setup/evaluate phases.
+
+    Scenarios are async generators that yield twice:
+    - First yield: prompt string (setup phase)
+    - Second yield: reward float (evaluate phase)
+
+    The scenario can receive the agent's answer via yield:
+        answer = yield "Do the task"
+        yield 1.0 if "success" in answer else 0.0
+
+    The answer is passed via the hud_submit tool or ctx.submit().
+
+    The decorator registers both an MCP prompt and resource with the same
+    identifier ({env_name}:{scenario_name}), linked by session state.
+
+    Example:
+        @env.scenario()
+        async def search_cats(url: str):
+            await env.call_tool("navigate", url=url)
+            answer = yield "Find all cat images on the page"
+            result = await env.call_tool("count_cats")
+            yield float(result > 0 or "found" in answer.lower())
+    """
+
+    # These come from Environment/MCPServer (type hints for mixin)
+    name: str
+    _prompt_manager: PromptManager
+    _resource_manager: ResourceManager
+    _tool_manager: ToolManager
+
+    # Scenario function registry
+    _scenarios: dict[str, Callable[..., AsyncGenerator[Any, Any]]]
+
+    # Single active scenario session - used for BOTH:
+    # - Client-side: when we run scenarios (local or remote)
+    # - Server-side: when external clients call our scenarios via MCP
+    # Only one scenario can be active at a time.
+    _active_session: ScenarioSession | None
+
+    def _init_scenarios(self) -> None:
+        """Initialize scenario state. Called from Environment.__init__."""
+        self._scenarios = {}
+        self._active_session = None
+
+        # Register _hud_submit tool (underscore = hidden from agent)
+        self._register_hud_submit_tool()
+
+    async def submit(self, scenario: str, answer: str) -> None:
+        """Submit the agent's answer for a scenario's evaluate phase.
+
+        Uses _active_session to route to the correct connection (if remote)
+        or store locally (if local scenario).
+
+        Args:
+            scenario: Name of the scenario (may include env prefix like "env:name")
+            answer: The agent's answer/result to submit
+        """
+        local_name = scenario.split(":")[-1] if ":" in scenario else scenario
+
+        if not self._active_session:
+            raise ValueError(
+                "No active scenario session. Call run_scenario_setup() before submit()."
+            )
+
+        if self._active_session.local_name != local_name:
+            raise ValueError(
+                f"Scenario mismatch: active session is '{self._active_session.local_name}', "
+                f"but submit() called with '{local_name}'"
+            )
+
+        self._active_session.answer = answer
+        logger.debug("Stored answer in session for scenario '%s'", local_name)
+
+        if not self._active_session.is_local:
+            # Remote scenario - send to specific connection
+            conn_name = self._active_session.connection_name
+            if not conn_name:
+                raise ValueError(f"Remote scenario '{local_name}' has no connection")
+
+            conn = self._connections.get(conn_name)  # type: ignore[attr-defined]
+            if not conn or not conn.client:
+                raise ValueError(f"Connection '{conn_name}' not available")
+
+            await conn.call_tool("_hud_submit", {"scenario": local_name, "answer": answer})
+            logger.debug("Sent answer to connection '%s' for scenario '%s'", conn_name, local_name)
+
+    def _register_hud_submit_tool(self) -> None:
+        """Register the _hud_submit tool for receiving agent answers.
+
+        Named with underscore prefix to hide from agent tool listings.
+        """
+        from fastmcp.tools import Tool
+
+        scenario_self = self
+
+        async def _hud_submit(scenario: str, answer: str) -> str:
+            """Receive an agent's answer from an external client.
+
+            Called when an external client's Environment.submit() sends an answer
+            to us via MCP. Stores in _active_session for resource_handler to use.
+
+            Args:
+                scenario: Name of the scenario (may include env prefix like "env:name")
+                answer: The agent's answer/result to submit
+            """
+            local_name = scenario.split(":")[-1] if ":" in scenario else scenario
+
+            if not scenario_self._active_session:
+                raise ValueError(f"No active scenario session for '{local_name}'")
+
+            if scenario_self._active_session.local_name != local_name:
+                raise ValueError(
+                    f"Scenario mismatch: active is '{scenario_self._active_session.local_name}', "
+                    f"but received answer for '{local_name}'"
+                )
+
+            scenario_self._active_session.answer = answer
+            logger.debug(
+                "_hud_submit stored answer for scenario '%s': %s...",
+                local_name,
+                answer[:50] if len(answer) > 50 else answer,
+            )
+            return f"Answer submitted for scenario '{local_name}'"
+
+        # Register the tool with underscore name
+        tool = Tool.from_function(_hud_submit)
+        self._tool_manager.add_tool(tool)
+        logger.debug("Registered _hud_submit tool")
+
+    async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) -> str | None:
+        """Run a scenario's setup phase and return the prompt.
+
+        Handles both local scenarios (registered via @env.scenario) and remote
+        scenarios (via MCP prompt). Creates _active_session for use by submit/evaluate.
+
+        Args:
+            scenario_name: Name of the scenario to run (may include "env:" prefix)
+            args: Arguments to pass to the scenario
+
+        Returns:
+            The prompt string from the scenario's setup phase, or None if failed
+        """
+        # Determine if this should be local or remote:
+        # - No prefix ("greet") → check local first
+        # - Prefix matches our env name ("my-env:greet" when self.name="my-env") → local
+        # - Prefix is different ("other-env:greet") → remote only
+        local_name: str | None = None
+        is_explicitly_remote = False
+        if ":" in scenario_name:
+            prefix, short_name = scenario_name.rsplit(":", 1)
+            # self.name is already normalized (underscores → hyphens) in Environment.__init__
+            if prefix == self.name:
+                # Prefix matches our env - check local
+                local_name = short_name
+            else:
+                # Different prefix - explicitly remote
+                local_name = short_name
+                is_explicitly_remote = True
+        else:
+            # No prefix - check local
+            local_name = scenario_name
+
+        # Check if scenario is registered locally (unless explicitly remote)
+        if not is_explicitly_remote and local_name in self._scenarios:
+            # Local scenario - run setup via generator
+            scenario_fn = self._scenarios[local_name]
+            gen = scenario_fn(**args)
+
+            # Run setup phase (code before first yield)
+            prompt = await gen.__anext__()
+
+            # Create session for local scenario
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=True,
+                connection_name=None,
+                resource_uri=f"{self.name}:{local_name}",
+                generator=gen,
+            )
+
+            logger.debug(
+                "Local scenario setup: %s (session=%s)",
+                local_name,
+                self._active_session,
+            )
+            return str(prompt)
+        else:
+            # Remote scenario - call via MCP prompt
+            # If scenario_name already contains ":", it's already namespaced - use directly
+            # Otherwise, prefix with env name: {env_name}:{scenario_name}
+            if ":" in scenario_name:
+                prompt_id = scenario_name
+            else:
+                # Use _source_env_name (from EvalContext) or self.name - both are normalized
+                env_name = getattr(self, "_source_env_name", None) or self.name
+                prompt_id = f"{env_name}:{scenario_name}"
+
+            # Serialize args for MCP prompt (only supports string values)
+            serialized_args: dict[str, str] = {}
+            for key, value in args.items():
+                serialized_args[key] = value if isinstance(value, str) else json.dumps(value)
+
+            try:
+                result = await self.get_prompt(prompt_id, serialized_args)  # type: ignore[attr-defined]
+                # Get connection AFTER get_prompt succeeds (routing is now guaranteed built)
+                conn_name = self._router.get_prompt_connection(prompt_id)  # type: ignore[attr-defined]
+                logger.debug(
+                    "Remote scenario: prompt_id=%s, connection=%s",
+                    prompt_id,
+                    conn_name or "(not found in router)",
+                )
+            except Exception as e:
+                # Fetch available scenarios for error context
+                try:
+                    prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                    scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                    available = "\n  ".join(scenario_prompts) if scenario_prompts else "(none)"
+                except Exception:
+                    available = "(could not fetch)"
+                    scenario_prompts = []
+
+                original_error = str(e)
+                if prompt_id in scenario_prompts:
+                    raise ValueError(
+                        f"⚠️ ERROR: Scenario '{prompt_id}' exists but failed to execute.\n\n"
+                        f"The scenario was found but encountered an error during setup:\n"
+                        f"  {original_error}\n\n"
+                        f"This could be caused by:\n"
+                        f"  - Missing or invalid scenario arguments\n"
+                        f"  - An error in the scenario's setup function\n"
+                        f"  - Connection or serialization issues\n\n"
+                        f"Check the scenario definition and required arguments."
+                    ) from e
+
+                raise ValueError(
+                    f"⚠️ ERROR: Scenario not found.\n\n"
+                    f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                    f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                    f"as the prefix.\n"
+                    f"This won't work if the HUD environment was declared with a different name."
+                    f"\n\n"
+                    f"  You requested: {scenario_name}\n"
+                    f"  SDK looked for: {prompt_id}\n\n"
+                    f"Available scenarios:\n  {available}\n\n"
+                    f"Fix: Use one of the scenario IDs above in your task JSON."
+                ) from e
+
+            # Extract prompt text from response
+            prompt_text: str | None = None
+            if result.messages:
+                first_msg = result.messages[0]
+                content = first_msg.content
+                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                    prompt_text = content.text  # type: ignore[union-attr]
+                elif isinstance(content, str):
+                    prompt_text = content
+
+            if not prompt_text:
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                    f"The scenario's setup function was called but returned no messages.\n"
+                    f"Check that the scenario returns a valid prompt string."
+                )
+
+            # Create session for remote scenario - use router's connection info
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=False,
+                connection_name=conn_name,
+                resource_uri=prompt_id,  # Resource has same URI as prompt
+                generator=None,
+            )
+
+            logger.debug(
+                "Remote scenario setup: %s (connection=%s)",
+                prompt_id,
+                conn_name,
+            )
+            return prompt_text
+
+    async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
+        """Run a scenario's evaluate phase and return the reward.
+
+        Uses _active_session created by run_scenario_setup():
+        - Local: use stored generator with submitted answer
+        - Remote: read resource from the connection that served setup
+
+        Args:
+            scenario_name: Name of the scenario to evaluate
+
+        Returns:
+            The reward from the scenario's evaluate phase, or None if failed
+        """
+        if not self._active_session:
+            logger.warning("No active session for scenario '%s'", scenario_name)
+            return None
+
+        session = self._active_session
+        self._active_session = None  # Clear after use
+
+        if session.is_local:
+            # Local scenario - use generator
+            if not session.generator:
+                logger.warning("Local scenario '%s' has no generator", session.local_name)
+                return None
+
+            answer = session.answer
+            try:
+                reward = await session.generator.asend(answer)
+                logger.debug(
+                    "Local scenario %s evaluate: answer=%s, reward=%s",
+                    session.local_name,
+                    answer[:50] if answer and len(answer) > 50 else answer,
+                    reward,
+                )
+                return float(reward)
+            except StopAsyncIteration:
+                return 1.0
+        else:
+            # Remote scenario - read resource via router
+            try:
+                contents = await self.read_resource(session.resource_uri)  # type: ignore[attr-defined]
+                if contents:
+                    first = contents[0]
+                    if hasattr(first, "text") and isinstance(first.text, str):  # type: ignore[union-attr]
+                        data = json.loads(first.text)  # type: ignore[union-attr]
+                        if "reward" in data:
+                            logger.debug(
+                                "Remote scenario %s evaluate: reward=%s",
+                                session.local_name,
+                                data["reward"],
+                            )
+                            return float(data["reward"])
+            except Exception as e:
+                logger.warning("Failed to get scenario reward from %s: %s", session.resource_uri, e)
+            return None
+
+    def scenario(
+        self,
+        name: str | None = None,
+        description: str | None = None,
+        required_env_vars: list[str] | None = None,
+    ) -> Callable[
+        [Callable[..., AsyncGenerator[Any, None]]],
+        Callable[..., AsyncGenerator[Any, None]],
+    ]:
+        """Decorator to register a scenario with setup and evaluate phases.
+
+        Creates both a prompt and resource with identifier scenario:{name}.
+        The scenario function should yield twice:
+        - First yield: the prompt string (returned from prompt)
+        - Second yield: the reward float (returned from resource)
+
+        Args:
+            name: Optional name for the scenario (defaults to function name)
+            description: Optional description of what the scenario does
+            required_env_vars: Optional list of environment variable names this scenario requires.
+                These are used by the HUD platform to check if users have configured the
+                necessary API keys/credentials before running this specific scenario.
+
+        Example:
+            @env.scenario(required_env_vars=["OPENAI_API_KEY"])
+            async def chat(query: str):
+                yield f"Answer this question: {query}"
+                # ... evaluate
+                yield 1.0
+
+            # MCP client usage:
+            # 1. get_prompt("{env_name}:chat", {query: "..."}) -> prompt messages
+            # 2. agent runs...
+            # 3. read_resource("{env_name}:chat") -> {"reward": 0.95}
+        """
+
+        def decorator(
+            fn: Callable[..., AsyncGenerator[Any, None]],
+        ) -> Callable[..., AsyncGenerator[Any, None]]:
+            scenario_name = name or fn.__name__
+
+            # Validate scenario name - colons are reserved as env:scenario separator
+            if ":" in scenario_name:
+                raise ValueError(
+                    f"Scenario name '{scenario_name}' cannot contain ':' "
+                    "(reserved as separator between environment and scenario names)"
+                )
+
+            # self.name is already normalized (lowercase, hyphens) by Environment.__init__
+            scenario_id = f"{self.name}:{scenario_name}"
+            scenario_desc = description or fn.__doc__ or f"Scenario: {scenario_name}"
+
+            # Capture source code for reproducibility
+            try:
+                source_code = inspect.getsource(fn)
+            except (OSError, TypeError) as e:
+                logger.warning(
+                    "Could not capture source code for scenario '%s': %s",
+                    scenario_name,
+                    e,
+                )
+                source_code = None
+
+            # Store the generator function
+            self._scenarios[scenario_name] = fn
+
+            # Get function signature for prompt arguments with type info
+            sig = inspect.signature(fn)
+            prompt_args: list[dict[str, Any]] = []
+            for p in sig.parameters.values():
+                is_required = p.default is inspect.Parameter.empty
+                arg_info: dict[str, Any] = {"name": p.name, "required": is_required}
+
+                # Include default value if present
+                if not is_required:
+                    # Only include JSON-serializable defaults
+                    default_val = p.default
+                    if default_val is None or isinstance(
+                        default_val, (str | int | float | bool | list | dict)
+                    ):
+                        arg_info["default"] = default_val
+
+                # Extract type annotation
+                if p.annotation is not inspect.Parameter.empty:
+                    try:
+                        # Use pydantic to convert annotation to JSON schema
+                        from pydantic import TypeAdapter
+
+                        adapter = TypeAdapter(p.annotation)
+                        param_schema = adapter.json_schema()
+                        # Extract type from schema (could be "string", "integer", etc.)
+                        if "type" in param_schema:
+                            arg_info["type"] = param_schema["type"]
+                        elif "$ref" in param_schema or "anyOf" in param_schema:
+                            # Complex type - store the full schema
+                            arg_info["inputSchema"] = param_schema
+                    except Exception:
+                        arg_info["type"] = "string"
+                else:
+                    arg_info["type"] = "string"
+
+                prompt_args.append(arg_info)
+
+            # Register PROMPT - runs setup, returns prompt messages
+            # We need a reference to self and the outer variables
+            scenario_self = self
+            scenario_name_ref = scenario_name
+
+            # Resolve parameter type hints for deserialization
+            # Use get_type_hints() to handle `from __future__ import annotations`
+            # which makes annotations lazy strings (PEP 563)
+            # MCP prompts only support string arguments, so we JSON-serialize complex types
+            # and use Pydantic TypeAdapter to properly deserialize them
+            try:
+                param_annotations = get_type_hints(fn)
+            except Exception:
+                # Fall back to raw annotations if get_type_hints fails
+                param_annotations = {
+                    p.name: p.annotation
+                    for p in sig.parameters.values()
+                    if p.annotation is not inspect.Parameter.empty
+                }
+
+            async def prompt_handler(**handler_args: Any) -> list[str]:
+                from pydantic import TypeAdapter
+
+                # Deserialize JSON-encoded arguments using Pydantic TypeAdapter
+                # MCP prompts only support string arguments, so complex types are
+                # JSON-serialized on the sending side and deserialized here
+                deserialized_args: dict[str, Any] = {}
+                for arg_name, arg_value in handler_args.items():
+                    annotation = param_annotations.get(arg_name)
+
+                    # Only attempt deserialization on string values
+                    if not isinstance(arg_value, str):
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If annotation is explicitly str, keep as string
+                    if annotation is str:
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If we have a non-str type annotation, use TypeAdapter
+                    if annotation is not None:
+                        try:
+                            adapter = TypeAdapter(annotation)
+                            deserialized_args[arg_name] = adapter.validate_json(arg_value)
+                            continue
+                        except Exception:  # noqa: S110
+                            pass  # Fall through to generic JSON decode
+
+                    # Try JSON decode for strings that look like JSON
+                    stripped = arg_value.strip()
+                    if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"):
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+
+                    # Try to decode if it looks like a number
+                    if stripped.lstrip("-").replace(".", "", 1).isdigit():
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+
+                    # Keep as string
+                    deserialized_args[arg_name] = arg_value
+
+                # Delegate to run_scenario_setup (consolidates client/server logic)
+                prompt_text = await scenario_self.run_scenario_setup(
+                    scenario_name_ref, deserialized_args
+                )
+
+                if prompt_text is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' setup returned no prompt")
+
+                # Return just the string - FastMCP wraps it in PromptMessage
+                return [str(prompt_text)]
+
+            # Register prompt using FastMCP - create FunctionPrompt directly
+            # to bypass the **kwargs validation in from_function()
+            from fastmcp.prompts.prompt import FunctionPrompt, PromptArgument
+
+            # Build meta with source code and full arguments info (with types/defaults)
+            scenario_meta: dict[str, Any] = {}
+            if source_code:
+                scenario_meta["code"] = source_code
+            if prompt_args:
+                scenario_meta["arguments"] = prompt_args
+            if required_env_vars:
+                scenario_meta["required_env_vars"] = required_env_vars
+
+            prompt = FunctionPrompt(
+                name=scenario_id,
+                description=f"[Setup] {scenario_desc}",
+                arguments=[
+                    PromptArgument(name=arg["name"], required=arg["required"])
+                    for arg in prompt_args
+                ],
+                fn=prompt_handler,
+                meta=scenario_meta if scenario_meta else None,
+            )
+            self._prompt_manager.add_prompt(prompt)
+
+            # Register RESOURCE - runs evaluate, returns reward
+            async def resource_handler() -> str:
+                # Delegate to run_scenario_evaluate (consolidates client/server logic)
+                reward = await scenario_self.run_scenario_evaluate(scenario_name_ref)
+
+                if reward is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' evaluation failed")
+
+                return json.dumps({"reward": float(reward)})
+
+            # Register as resource with same scenario: URI
+            from fastmcp.resources.resource import FunctionResource
+
+            resource = FunctionResource.from_function(
+                fn=resource_handler,
+                uri=scenario_id,
+                name=scenario_name,
+                description=f"[Evaluate] {scenario_desc}",
+                mime_type="application/json",
+                meta=scenario_meta,
+            )
+            self._resource_manager.add_resource(resource)
+
+            logger.debug(
+                "Registered scenario '%s' as prompt and resource: %s",
+                scenario_name,
+                scenario_id,
+            )
+
+            return fn
+
+        return decorator
hud/environment/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for hud.environment module."""