hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/environment/scenarios.py
@@ -0,0 +1,493 @@
+"""Scenario decorator for Environment - defines setup/evaluate phases."""
+
+from __future__ import annotations
+
+import inspect
+import json
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Callable
+
+    from fastmcp.prompts import PromptManager
+    from fastmcp.resources import ResourceManager
+    from fastmcp.tools import ToolManager
+
+__all__ = ["ScenarioMixin"]
+
+logger = logging.getLogger(__name__)
+
+
+class ScenarioMixin:
+    """Mixin providing @env.scenario decorator for setup/evaluate phases.
+
+    Scenarios are async generators that yield twice:
+    - First yield: prompt string (setup phase)
+    - Second yield: reward float (evaluate phase)
+
+    The scenario can receive the agent's answer via yield:
+        answer = yield "Do the task"
+        yield 1.0 if "success" in answer else 0.0
+
+    The answer is passed via the hud_submit tool or ctx.submit().
+
+    The decorator registers both an MCP prompt and resource with the same
+    identifier ({env_name}:{scenario_name}), linked by session state.
+
+    Example:
+        @env.scenario()
+        async def search_cats(url: str):
+            await env.call_tool("navigate", url=url)
+            answer = yield "Find all cat images on the page"
+            result = await env.call_tool("count_cats")
+            yield float(result > 0 or "found" in answer.lower())
+    """
+
+    # These come from Environment/MCPServer
+    name: str
+    _prompt_manager: PromptManager
+    _resource_manager: ResourceManager
+    _tool_manager: ToolManager
+
+    # Scenario state
+    _scenarios: dict[str, Callable[..., AsyncGenerator[Any, Any]]]
+    _scenario_sessions: dict[str, AsyncGenerator[Any, Any]]  # session_id -> generator
+    _scenario_latest: dict[str, str]  # scenario_name -> latest session_id
+    _scenario_answers: dict[str, str]  # scenario_name -> submitted answer
+
+    def _init_scenarios(self) -> None:
+        """Initialize scenario state. Called from Environment.__init__."""
+        self._scenarios = {}
+        self._scenario_sessions = {}
+        self._scenario_latest = {}
+        self._scenario_answers = {}
+
+        # Register _hud_submit tool (underscore = hidden from agent)
+        self._register_hud_submit_tool()
+
+    async def submit(self, scenario: str, answer: str) -> None:
+        """Submit the agent's answer for a scenario's evaluate phase.
+
+        This stores the answer locally and broadcasts to connected hubs
+        that have the _hud_submit tool (auto-detected by Environment).
+
+        Args:
+            scenario: Name of the scenario (without env prefix)
+            answer: The agent's answer/result to submit
+
+        Example:
+            # Direct call with scenario name
+            await env.submit("checkout", "Order completed successfully")
+
+            # Or via EvalContext (knows its own scenario)
+            await ctx.submit("Order completed successfully")
+        """
+        # Store locally for our scenarios
+        self._scenario_answers[scenario] = answer
+        logger.debug(
+            "Stored answer for scenario '%s': %s...",
+            scenario,
+            answer[:50] if len(answer) > 50 else answer,
+        )
+
+        # Broadcast to connections that have _hud_submit
+        # Environment._broadcast_tool auto-filters to connections with the tool
+        await self._broadcast_tool(  # type: ignore[attr-defined]
+            "_hud_submit",
+            scenario=scenario,
+            answer=answer,
+        )
+
+    def _register_hud_submit_tool(self) -> None:
+        """Register the _hud_submit tool for receiving agent answers.
+
+        Named with underscore prefix to hide from agent tool listings.
+        """
+        from fastmcp.tools import Tool
+
+        scenario_self = self
+
+        async def _hud_submit(scenario: str, answer: str) -> str:
+            """Submit the agent's answer for a scenario's evaluate phase.
+
+            Internal tool - called by Environment.submit() on connected hubs.
+
+            Args:
+                scenario: Name of the scenario (without env prefix)
+                answer: The agent's answer/result to submit
+            """
+            # Store locally (don't broadcast - we ARE the target)
+            scenario_self._scenario_answers[scenario] = answer
+            logger.debug(
+                "_hud_submit received answer for scenario '%s': %s...",
+                scenario,
+                answer[:50] if len(answer) > 50 else answer,
+            )
+            return f"Answer submitted for scenario '{scenario}'"
+
+        # Register the tool with underscore name
+        tool = Tool.from_function(_hud_submit)
+        self._tool_manager.add_tool(tool)
+        logger.debug("Registered _hud_submit tool")
+
+    async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) -> str | None:
+        """Run a scenario's setup phase and return the prompt.
+
+        Handles both local scenarios (registered via @env.scenario) and remote
+        scenarios (via MCP prompt).
+
+        Args:
+            scenario_name: Name of the scenario to run
+            args: Arguments to pass to the scenario
+
+        Returns:
+            The prompt string from the scenario's setup phase, or None if failed
+        """
+        # Check if scenario is registered locally
+        if scenario_name in self._scenarios:
+            # Local scenario - run setup via generator
+            scenario_fn = self._scenarios[scenario_name]
+            gen = scenario_fn(**args)
+
+            # Run setup phase (code before first yield)
+            prompt = await gen.__anext__()
+
+            # Store generator for evaluate phase
+            session_id = uuid.uuid4().hex[:8]
+            self._scenario_sessions[session_id] = gen
+            self._scenario_latest[scenario_name] = session_id
+
+            logger.debug(
+                "Scenario %s setup complete, session=%s",
+                scenario_name,
+                session_id,
+            )
+            return str(prompt)
+        else:
+            # Remote scenario - call via MCP prompt
+            # If scenario_name already contains ":", it's already namespaced - use directly
+            # Otherwise, prefix with env name: {env_name}:{scenario_name}
+            if ":" in scenario_name:
+                prompt_id = scenario_name
+                logger.debug("Remote scenario (already namespaced): prompt_id=%s", prompt_id)
+            else:
+                env_name = getattr(self, "_source_env_name", None) or self.name
+                safe_env_name = env_name.replace("_", "-")
+                prompt_id = f"{safe_env_name}:{scenario_name}"
+                logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
+            try:
+                result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
+            except Exception as e:
+                # Fetch available scenarios for error context
+                try:
+                    prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                    scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                    available = (
+                        "\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                    )
+                except Exception:
+                    available = "(could not fetch available scenarios)"
+
+                raise ValueError(
+                    f"Scenario not found.\n\n"
+                    f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                    f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                    f"as the prefix.\n"
+                    f"This won't work if the HUD environment was declared with a different name."
+                    f"\n\n"
+                    f" You requested: {scenario_name}\n"
+                    f" SDK looked for: {prompt_id}\n\n"
+                    f"Available scenarios:\n {available}\n\n"
+                    f"Fix: Use one of the scenario IDs above in your task JSON."
+                ) from e
+
+            # Validate the response (outside try/except so errors aren't wrapped)
+            if result.messages:
+                first_msg = result.messages[0]
+                content = first_msg.content
+                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                    return content.text  # type: ignore[union-attr]
+                elif isinstance(content, str):
+                    return content
+                else:
+                    # Content exists but is neither text object nor string
+                    raise ValueError(
+                        f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                        f"Expected: content with .text attribute (str) or content as str\n"
+                        f"Got: {type(content).__name__}\n\n"
+                        f"Check that the scenario's setup function returns a valid prompt."
+                    )
+            else:
+                # get_prompt succeeded but returned empty messages
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                    f"The scenario's setup function was called but returned no messages.\n"
+                    f"Check that the scenario returns a valid prompt string."
+                )
+
+    async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
+        """Run a scenario's evaluate phase and return the reward.
+
+        Uses the submitted answer (if any) via gen.asend().
+        Handles both local and remote scenarios.
+
+        Args:
+            scenario_name: Name of the scenario to evaluate
+
+        Returns:
+            The reward from the scenario's evaluate phase, or None if failed
+        """
+        # Check if we have a stored generator (local scenario)
+        session_id = self._scenario_latest.get(scenario_name)
+        if session_id:
+            gen = self._scenario_sessions.pop(session_id, None)
+            if gen:
+                # Get submitted answer (if any)
+                answer = self._scenario_answers.pop(scenario_name, None)
+
+                try:
+                    # Use asend to pass the answer to the scenario
+                    reward = await gen.asend(answer)
+                    logger.debug(
+                        "Scenario %s evaluate complete, answer=%s, reward=%s",
+                        scenario_name,
+                        answer[:50] if answer and len(answer) > 50 else answer,
+                        reward,
+                    )
+                    return float(reward)
+                except StopAsyncIteration:
+                    # Generator ended without second yield - assume success
+                    return 1.0
+                finally:
+                    # Clean up latest pointer
+                    if self._scenario_latest.get(scenario_name) == session_id:
+                        del self._scenario_latest[scenario_name]
+
+        # Remote scenario - read via MCP resource
+        # If scenario_name already contains ":", it's already namespaced - use directly
+        if ":" in scenario_name:
+            resource_id = scenario_name
+        else:
+            env_name = getattr(self, "_source_env_name", None) or self.name
+            safe_env_name = env_name.replace("_", "-")
+            resource_id = f"{safe_env_name}:{scenario_name}"
+        try:
+            contents = await self.read_resource(resource_id)  # type: ignore[attr-defined]
+            if contents:
+                first_content = contents[0]
+                if hasattr(first_content, "text") and isinstance(first_content.text, str):  # type: ignore[union-attr]
+                    data = json.loads(first_content.text)  # type: ignore[union-attr]
+                    if "reward" in data:
+                        return float(data["reward"])
+        except Exception as e:
+            logger.warning("Failed to get scenario reward: %s", e)
+        return None
+
+    def scenario(
+        self,
+        name: str | None = None,
+        description: str | None = None,
+    ) -> Callable[
+        [Callable[..., AsyncGenerator[Any, None]]],
+        Callable[..., AsyncGenerator[Any, None]],
+    ]:
+        """Decorator to register a scenario with setup and evaluate phases.
+
+        Creates both a prompt and resource with identifier scenario:{name}.
+        The scenario function should yield twice:
+        - First yield: the prompt string (returned from prompt)
+        - Second yield: the reward float (returned from resource)
+
+        Args:
+            name: Optional name for the scenario (defaults to function name)
+            description: Optional description of what the scenario does
+
+        Example:
+            @env.scenario()
+            async def search_cats(url: str):
+                await env.call_tool("navigate", url=url)
+                yield "Find cat images"
+                result = await env.call_tool("count_cats")
+                yield float(result > 0)
+
+            # MCP client usage:
+            # 1. get_prompt("{env_name}:search_cats", {url: "..."}) -> prompt messages
+            # 2. agent runs...
+            # 3. read_resource("{env_name}:search_cats") -> {"reward": 0.95}
+        """
+
+        def decorator(
+            fn: Callable[..., AsyncGenerator[Any, None]],
+        ) -> Callable[..., AsyncGenerator[Any, None]]:
+            scenario_name = name or fn.__name__
+            # Sanitize env name for URI scheme (no underscores allowed)
+            safe_env_name = self.name.replace("_", "-")
+            scenario_id = f"{safe_env_name}:{scenario_name}"
+            scenario_desc = description or fn.__doc__ or f"Scenario: {scenario_name}"
+
+            # Capture source code for reproducibility
+            try:
+                source_code = inspect.getsource(fn)
+            except (OSError, TypeError) as e:
+                logger.warning(
+                    "Could not capture source code for scenario '%s': %s",
+                    scenario_name,
+                    e,
+                )
+                source_code = None
+
+            # Store the generator function
+            self._scenarios[scenario_name] = fn
+
+            # Get function signature for prompt arguments with type info
+            sig = inspect.signature(fn)
+            prompt_args: list[dict[str, Any]] = []
+            for p in sig.parameters.values():
+                is_required = p.default is inspect.Parameter.empty
+                arg_info: dict[str, Any] = {"name": p.name, "required": is_required}
+
+                # Include default value if present
+                if not is_required:
+                    # Only include JSON-serializable defaults
+                    default_val = p.default
+                    if default_val is None or isinstance(
+                        default_val, (str, int, float, bool, list, dict)
+                    ):
+                        arg_info["default"] = default_val
+
+                # Extract type annotation
+                if p.annotation is not inspect.Parameter.empty:
+                    try:
+                        # Use pydantic to convert annotation to JSON schema
+                        from pydantic import TypeAdapter
+
+                        adapter = TypeAdapter(p.annotation)
+                        param_schema = adapter.json_schema()
+                        # Extract type from schema (could be "string", "integer", etc.)
+                        if "type" in param_schema:
+                            arg_info["type"] = param_schema["type"]
+                        elif "$ref" in param_schema or "anyOf" in param_schema:
+                            # Complex type - store the full schema
+                            arg_info["inputSchema"] = param_schema
+                    except Exception:
+                        arg_info["type"] = "string"
+                else:
+                    arg_info["type"] = "string"
+
+                prompt_args.append(arg_info)
+
+            # Register PROMPT - runs setup, returns prompt messages
+            # We need a reference to self and the outer variables
+            scenario_self = self
+            scenario_fn = fn
+            scenario_name_ref = scenario_name
+
+            async def prompt_handler(**handler_args: Any) -> list[str]:
+                # Create generator instance
+                gen = scenario_fn(**handler_args)
+
+                # Run setup phase (code before first yield)
+                prompt_text = await gen.__anext__()
+
+                # Store generator with session ID
+                session_id = uuid.uuid4().hex[:8]
+                scenario_self._scenario_sessions[session_id] = gen
+                scenario_self._scenario_latest[scenario_name_ref] = session_id
+
+                logger.debug(
+                    "Scenario %s setup complete, session=%s, prompt=%s",
+                    scenario_name_ref,
+                    session_id,
+                    prompt_text[:50] if isinstance(prompt_text, str) else prompt_text,
+                )
+
+                # Return just the string - FastMCP wraps it in PromptMessage
+                # Don't return dict or it gets JSON-serialized as text content
+                return [str(prompt_text)]
+
+            # Register prompt using FastMCP - create FunctionPrompt directly
+            # to bypass the **kwargs validation in from_function()
+            from fastmcp.prompts.prompt import FunctionPrompt, PromptArgument
+
+            # Build meta with source code and full arguments info (with types/defaults)
+            scenario_meta: dict[str, Any] = {}
+            if source_code:
+                scenario_meta["code"] = source_code
+            if prompt_args:
+                scenario_meta["arguments"] = prompt_args
+
+            prompt = FunctionPrompt(
+                name=scenario_id,
+                description=f"[Setup] {scenario_desc}",
+                arguments=[
+                    PromptArgument(name=arg["name"], required=arg["required"])
+                    for arg in prompt_args
+                ],
+                fn=prompt_handler,
+                meta=scenario_meta if scenario_meta else None,
+            )
+            self._prompt_manager.add_prompt(prompt)
+
+            # Register RESOURCE - runs evaluate, returns reward
+            async def resource_handler() -> str:
+                # Get latest session for this scenario
+                session_id = scenario_self._scenario_latest.get(scenario_name_ref)
+                if not session_id:
+                    raise ValueError(
+                        f"No active session for scenario '{scenario_name_ref}'. "
+                        "Call the prompt first to run setup."
+                    )
+
+                gen = scenario_self._scenario_sessions.pop(session_id, None)
+                if gen is None:
+                    raise ValueError(f"Session '{session_id}' not found or already evaluated.")
+
+                # Get submitted answer (if any)
+                answer = scenario_self._scenario_answers.pop(scenario_name_ref, None)
+
+                # Run evaluate phase (code after first yield)
+                # Use asend to pass the answer (or None if not submitted)
+                try:
+                    reward = await gen.asend(answer)
+                except StopAsyncIteration:
+                    # Generator ended without second yield - assume success
+                    reward = 1.0
+
+                logger.debug(
+                    "Scenario %s evaluate complete, session=%s, answer=%s, reward=%s",
+                    scenario_name_ref,
+                    session_id,
+                    answer[:50] if answer and len(answer) > 50 else answer,
+                    reward,
+                )
+
+                # Clean up latest pointer if it matches
+                if scenario_self._scenario_latest.get(scenario_name_ref) == session_id:
+                    del scenario_self._scenario_latest[scenario_name_ref]
+
+                return json.dumps({"reward": float(reward)})
+
+            # Register as resource with same scenario: URI
+            from fastmcp.resources.resource import FunctionResource
+
+            resource = FunctionResource.from_function(
+                fn=resource_handler,
+                uri=scenario_id,
+                name=scenario_name,
+                description=f"[Evaluate] {scenario_desc}",
+                mime_type="application/json",
+                meta=scenario_meta,
+            )
+            self._resource_manager.add_resource(resource)
+
+            logger.debug(
+                "Registered scenario '%s' as prompt and resource: %s",
+                scenario_name,
+                scenario_id,
+            )
+
+            return fn

+        return decorator
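For orientation, here is a minimal sketch of driving this mixin in-process, using only the methods added above. The `Environment` constructor and the `hud.environment` import are assumptions (this release adds hud/environment/environment.py and hud/environment/__init__.py, but their APIs are not shown in this hunk); `submit()` also broadcasts to connected hubs, which is assumed to be a no-op with no connections.

import asyncio

from hud.environment import Environment  # assumed export, not shown in this diff

env = Environment("my-env")  # hypothetical constructor

@env.scenario()
async def greet(name: str):
    # Setup phase: everything before the first yield.
    answer = yield f"Say hello to {name}"
    # Evaluate phase: resumed via asend() with the submitted answer (or None).
    yield 1.0 if answer and "hello" in answer.lower() else 0.0

async def main() -> None:
    prompt = await env.run_scenario_setup("greet", {"name": "Ada"})  # -> "Say hello to Ada"
    # ... an agent would act on `prompt` here ...
    await env.submit("greet", "Hello, Ada!")  # stores the answer for the evaluate phase
    reward = await env.run_scenario_evaluate("greet")  # resumes the generator -> 1.0
    print(prompt, reward)

asyncio.run(main())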
hud/environment/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for hud.environment module."""