hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
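The largest addition in 0.5.1 is the new hud/environment package; its scenarios.py (diffed below) introduces an @env.scenario() decorator in which a scenario is an async generator that yields a prompt for the setup phase and a reward for the evaluate phase. A minimal sketch of that shape, adapted from the docstring examples inside the file below; the env instance and the navigate/count_cats tools are illustrative assumptions, not part of the diff:

    # Sketch only: assumes `env` is an already-constructed hud Environment
    # exposing call_tool(), as the ScenarioMixin docstrings below describe.
    @env.scenario()
    async def search_cats(url: str):
        await env.call_tool("navigate", url=url)           # setup: runs before the agent starts
        answer = yield "Find all cat images on the page"   # prompt handed to the agent
        result = await env.call_tool("count_cats")         # evaluate: runs after the agent answers
        yield float(result > 0 or "found" in (answer or "").lower())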
hud/environment/scenarios.py
@@ -0,0 +1,493 @@
+"""Scenario decorator for Environment - defines setup/evaluate phases."""
+
+from __future__ import annotations
+
+import inspect
+import json
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Callable
+
+    from fastmcp.prompts import PromptManager
+    from fastmcp.resources import ResourceManager
+    from fastmcp.tools import ToolManager
+
+__all__ = ["ScenarioMixin"]
+
+logger = logging.getLogger(__name__)
+
+
+class ScenarioMixin:
+    """Mixin providing @env.scenario decorator for setup/evaluate phases.
+
+    Scenarios are async generators that yield twice:
+    - First yield: prompt string (setup phase)
+    - Second yield: reward float (evaluate phase)
+
+    The scenario can receive the agent's answer via yield:
+        answer = yield "Do the task"
+        yield 1.0 if "success" in answer else 0.0
+
+    The answer is passed via the hud_submit tool or ctx.submit().
+
+    The decorator registers both an MCP prompt and resource with the same
+    identifier ({env_name}:{scenario_name}), linked by session state.
+
+    Example:
+        @env.scenario()
+        async def search_cats(url: str):
+            await env.call_tool("navigate", url=url)
+            answer = yield "Find all cat images on the page"
+            result = await env.call_tool("count_cats")
+            yield float(result > 0 or "found" in answer.lower())
+    """
+
+    # These come from Environment/MCPServer
+    name: str
+    _prompt_manager: PromptManager
+    _resource_manager: ResourceManager
+    _tool_manager: ToolManager
+
+    # Scenario state
+    _scenarios: dict[str, Callable[..., AsyncGenerator[Any, Any]]]
+    _scenario_sessions: dict[str, AsyncGenerator[Any, Any]]  # session_id -> generator
+    _scenario_latest: dict[str, str]  # scenario_name -> latest session_id
+    _scenario_answers: dict[str, str]  # scenario_name -> submitted answer
+
+    def _init_scenarios(self) -> None:
+        """Initialize scenario state. Called from Environment.__init__."""
+        self._scenarios = {}
+        self._scenario_sessions = {}
+        self._scenario_latest = {}
+        self._scenario_answers = {}
+
+        # Register _hud_submit tool (underscore = hidden from agent)
+        self._register_hud_submit_tool()
+
+    async def submit(self, scenario: str, answer: str) -> None:
+        """Submit the agent's answer for a scenario's evaluate phase.
+
+        This stores the answer locally and broadcasts to connected hubs
+        that have the _hud_submit tool (auto-detected by Environment).
+
+        Args:
+            scenario: Name of the scenario (without env prefix)
+            answer: The agent's answer/result to submit
+
+        Example:
+            # Direct call with scenario name
+            await env.submit("checkout", "Order completed successfully")
+
+            # Or via EvalContext (knows its own scenario)
+            await ctx.submit("Order completed successfully")
+        """
+        # Store locally for our scenarios
+        self._scenario_answers[scenario] = answer
+        logger.debug(
+            "Stored answer for scenario '%s': %s...",
+            scenario,
+            answer[:50] if len(answer) > 50 else answer,
+        )
+
+        # Broadcast to connections that have _hud_submit
+        # Environment._broadcast_tool auto-filters to connections with the tool
+        await self._broadcast_tool(  # type: ignore[attr-defined]
+            "_hud_submit",
+            scenario=scenario,
+            answer=answer,
+        )
+
+    def _register_hud_submit_tool(self) -> None:
+        """Register the _hud_submit tool for receiving agent answers.
+
+        Named with underscore prefix to hide from agent tool listings.
+        """
+        from fastmcp.tools import Tool
+
+        scenario_self = self
+
+        async def _hud_submit(scenario: str, answer: str) -> str:
+            """Submit the agent's answer for a scenario's evaluate phase.
+
+            Internal tool - called by Environment.submit() on connected hubs.
+
+            Args:
+                scenario: Name of the scenario (without env prefix)
+                answer: The agent's answer/result to submit
+            """
+            # Store locally (don't broadcast - we ARE the target)
+            scenario_self._scenario_answers[scenario] = answer
+            logger.debug(
+                "_hud_submit received answer for scenario '%s': %s...",
+                scenario,
+                answer[:50] if len(answer) > 50 else answer,
+            )
+            return f"Answer submitted for scenario '{scenario}'"
+
+        # Register the tool with underscore name
+        tool = Tool.from_function(_hud_submit)
+        self._tool_manager.add_tool(tool)
+        logger.debug("Registered _hud_submit tool")
+
+    async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) -> str | None:
+        """Run a scenario's setup phase and return the prompt.
+
+        Handles both local scenarios (registered via @env.scenario) and remote
+        scenarios (via MCP prompt).
+
+        Args:
+            scenario_name: Name of the scenario to run
+            args: Arguments to pass to the scenario
+
+        Returns:
+            The prompt string from the scenario's setup phase, or None if failed
+        """
+        # Check if scenario is registered locally
+        if scenario_name in self._scenarios:
+            # Local scenario - run setup via generator
+            scenario_fn = self._scenarios[scenario_name]
+            gen = scenario_fn(**args)
+
+            # Run setup phase (code before first yield)
+            prompt = await gen.__anext__()
+
+            # Store generator for evaluate phase
+            session_id = uuid.uuid4().hex[:8]
+            self._scenario_sessions[session_id] = gen
+            self._scenario_latest[scenario_name] = session_id
+
+            logger.debug(
+                "Scenario %s setup complete, session=%s",
+                scenario_name,
+                session_id,
+            )
+            return str(prompt)
+        else:
+            # Remote scenario - call via MCP prompt
+            # If scenario_name already contains ":", it's already namespaced - use directly
+            # Otherwise, prefix with env name: {env_name}:{scenario_name}
+            if ":" in scenario_name:
+                prompt_id = scenario_name
+                logger.debug("Remote scenario (already namespaced): prompt_id=%s", prompt_id)
+            else:
+                env_name = getattr(self, "_source_env_name", None) or self.name
+                safe_env_name = env_name.replace("_", "-")
+                prompt_id = f"{safe_env_name}:{scenario_name}"
+                logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
+            try:
+                result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
+            except Exception as e:
+                # Fetch available scenarios for error context
+                try:
+                    prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                    scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                    available = (
+                        "\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                    )
+                except Exception:
+                    available = "(could not fetch available scenarios)"
+
+                raise ValueError(
+                    f"Scenario not found.\n\n"
+                    f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                    f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                    f"as the prefix.\n"
+                    f"This won't work if the HUD environment was declared with a different name."
+                    f"\n\n"
+                    f" You requested: {scenario_name}\n"
+                    f" SDK looked for: {prompt_id}\n\n"
+                    f"Available scenarios:\n {available}\n\n"
+                    f"Fix: Use one of the scenario IDs above in your task JSON."
+                ) from e
+
+            # Validate the response (outside try/except so errors aren't wrapped)
+            if result.messages:
+                first_msg = result.messages[0]
+                content = first_msg.content
+                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                    return content.text  # type: ignore[union-attr]
+                elif isinstance(content, str):
+                    return content
+                else:
+                    # Content exists but is neither text object nor string
+                    raise ValueError(
+                        f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                        f"Expected: content with .text attribute (str) or content as str\n"
+                        f"Got: {type(content).__name__}\n\n"
+                        f"Check that the scenario's setup function returns a valid prompt."
+                    )
+            else:
+                # get_prompt succeeded but returned empty messages
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                    f"The scenario's setup function was called but returned no messages.\n"
+                    f"Check that the scenario returns a valid prompt string."
+                )
+
+    async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
+        """Run a scenario's evaluate phase and return the reward.
+
+        Uses the submitted answer (if any) via gen.asend().
+        Handles both local and remote scenarios.
+
+        Args:
+            scenario_name: Name of the scenario to evaluate
+
+        Returns:
+            The reward from the scenario's evaluate phase, or None if failed
+        """
+        # Check if we have a stored generator (local scenario)
+        session_id = self._scenario_latest.get(scenario_name)
+        if session_id:
+            gen = self._scenario_sessions.pop(session_id, None)
+            if gen:
+                # Get submitted answer (if any)
+                answer = self._scenario_answers.pop(scenario_name, None)
+
+                try:
+                    # Use asend to pass the answer to the scenario
+                    reward = await gen.asend(answer)
+                    logger.debug(
+                        "Scenario %s evaluate complete, answer=%s, reward=%s",
+                        scenario_name,
+                        answer[:50] if answer and len(answer) > 50 else answer,
+                        reward,
+                    )
+                    return float(reward)
+                except StopAsyncIteration:
+                    # Generator ended without second yield - assume success
+                    return 1.0
+                finally:
+                    # Clean up latest pointer
+                    if self._scenario_latest.get(scenario_name) == session_id:
+                        del self._scenario_latest[scenario_name]
+
+        # Remote scenario - read via MCP resource
+        # If scenario_name already contains ":", it's already namespaced - use directly
+        if ":" in scenario_name:
+            resource_id = scenario_name
+        else:
+            env_name = getattr(self, "_source_env_name", None) or self.name
+            safe_env_name = env_name.replace("_", "-")
+            resource_id = f"{safe_env_name}:{scenario_name}"
+        try:
+            contents = await self.read_resource(resource_id)  # type: ignore[attr-defined]
+            if contents:
+                first_content = contents[0]
+                if hasattr(first_content, "text") and isinstance(first_content.text, str):  # type: ignore[union-attr]
+                    data = json.loads(first_content.text)  # type: ignore[union-attr]
+                    if "reward" in data:
+                        return float(data["reward"])
+        except Exception as e:
+            logger.warning("Failed to get scenario reward: %s", e)
+        return None
+
+    def scenario(
+        self,
+        name: str | None = None,
+        description: str | None = None,
+    ) -> Callable[
+        [Callable[..., AsyncGenerator[Any, None]]],
+        Callable[..., AsyncGenerator[Any, None]],
+    ]:
+        """Decorator to register a scenario with setup and evaluate phases.
+
+        Creates both a prompt and resource with identifier scenario:{name}.
+        The scenario function should yield twice:
+        - First yield: the prompt string (returned from prompt)
+        - Second yield: the reward float (returned from resource)
+
+        Args:
+            name: Optional name for the scenario (defaults to function name)
+            description: Optional description of what the scenario does
+
+        Example:
+            @env.scenario()
+            async def search_cats(url: str):
+                await env.call_tool("navigate", url=url)
+                yield "Find cat images"
+                result = await env.call_tool("count_cats")
+                yield float(result > 0)
+
+            # MCP client usage:
+            # 1. get_prompt("{env_name}:search_cats", {url: "..."}) -> prompt messages
+            # 2. agent runs...
+            # 3. read_resource("{env_name}:search_cats") -> {"reward": 0.95}
+        """
+
+        def decorator(
+            fn: Callable[..., AsyncGenerator[Any, None]],
+        ) -> Callable[..., AsyncGenerator[Any, None]]:
+            scenario_name = name or fn.__name__
+            # Sanitize env name for URI scheme (no underscores allowed)
+            safe_env_name = self.name.replace("_", "-")
+            scenario_id = f"{safe_env_name}:{scenario_name}"
+            scenario_desc = description or fn.__doc__ or f"Scenario: {scenario_name}"
+
+            # Capture source code for reproducibility
+            try:
+                source_code = inspect.getsource(fn)
+            except (OSError, TypeError) as e:
+                logger.warning(
+                    "Could not capture source code for scenario '%s': %s",
+                    scenario_name,
+                    e,
+                )
+                source_code = None
+
+            # Store the generator function
+            self._scenarios[scenario_name] = fn
+
+            # Get function signature for prompt arguments with type info
+            sig = inspect.signature(fn)
+            prompt_args: list[dict[str, Any]] = []
+            for p in sig.parameters.values():
+                is_required = p.default is inspect.Parameter.empty
+                arg_info: dict[str, Any] = {"name": p.name, "required": is_required}
+
+                # Include default value if present
+                if not is_required:
+                    # Only include JSON-serializable defaults
+                    default_val = p.default
+                    if default_val is None or isinstance(
+                        default_val, (str, int, float, bool, list, dict)
+                    ):
+                        arg_info["default"] = default_val
+
+                # Extract type annotation
+                if p.annotation is not inspect.Parameter.empty:
+                    try:
+                        # Use pydantic to convert annotation to JSON schema
+                        from pydantic import TypeAdapter
+
+                        adapter = TypeAdapter(p.annotation)
+                        param_schema = adapter.json_schema()
+                        # Extract type from schema (could be "string", "integer", etc.)
+                        if "type" in param_schema:
+                            arg_info["type"] = param_schema["type"]
+                        elif "$ref" in param_schema or "anyOf" in param_schema:
+                            # Complex type - store the full schema
+                            arg_info["inputSchema"] = param_schema
+                    except Exception:
+                        arg_info["type"] = "string"
+                else:
+                    arg_info["type"] = "string"
+
+                prompt_args.append(arg_info)
+
+            # Register PROMPT - runs setup, returns prompt messages
+            # We need a reference to self and the outer variables
+            scenario_self = self
+            scenario_fn = fn
+            scenario_name_ref = scenario_name
+
+            async def prompt_handler(**handler_args: Any) -> list[str]:
+                # Create generator instance
+                gen = scenario_fn(**handler_args)
+
+                # Run setup phase (code before first yield)
+                prompt_text = await gen.__anext__()
+
+                # Store generator with session ID
+                session_id = uuid.uuid4().hex[:8]
+                scenario_self._scenario_sessions[session_id] = gen
+                scenario_self._scenario_latest[scenario_name_ref] = session_id
+
+                logger.debug(
+                    "Scenario %s setup complete, session=%s, prompt=%s",
+                    scenario_name_ref,
+                    session_id,
+                    prompt_text[:50] if isinstance(prompt_text, str) else prompt_text,
+                )
+
+                # Return just the string - FastMCP wraps it in PromptMessage
+                # Don't return dict or it gets JSON-serialized as text content
+                return [str(prompt_text)]
+
+            # Register prompt using FastMCP - create FunctionPrompt directly
+            # to bypass the **kwargs validation in from_function()
+            from fastmcp.prompts.prompt import FunctionPrompt, PromptArgument
+
+            # Build meta with source code and full arguments info (with types/defaults)
+            scenario_meta: dict[str, Any] = {}
+            if source_code:
+                scenario_meta["code"] = source_code
+            if prompt_args:
+                scenario_meta["arguments"] = prompt_args
+
+            prompt = FunctionPrompt(
+                name=scenario_id,
+                description=f"[Setup] {scenario_desc}",
+                arguments=[
+                    PromptArgument(name=arg["name"], required=arg["required"])
+                    for arg in prompt_args
+                ],
+                fn=prompt_handler,
+                meta=scenario_meta if scenario_meta else None,
+            )
+            self._prompt_manager.add_prompt(prompt)
+
+            # Register RESOURCE - runs evaluate, returns reward
+            async def resource_handler() -> str:
+                # Get latest session for this scenario
+                session_id = scenario_self._scenario_latest.get(scenario_name_ref)
+                if not session_id:
+                    raise ValueError(
+                        f"No active session for scenario '{scenario_name_ref}'. "
+                        "Call the prompt first to run setup."
+                    )
+
+                gen = scenario_self._scenario_sessions.pop(session_id, None)
+                if gen is None:
+                    raise ValueError(f"Session '{session_id}' not found or already evaluated.")
+
+                # Get submitted answer (if any)
+                answer = scenario_self._scenario_answers.pop(scenario_name_ref, None)
+
+                # Run evaluate phase (code after first yield)
+                # Use asend to pass the answer (or None if not submitted)
+                try:
+                    reward = await gen.asend(answer)
+                except StopAsyncIteration:
+                    # Generator ended without second yield - assume success
+                    reward = 1.0
+
+                logger.debug(
+                    "Scenario %s evaluate complete, session=%s, answer=%s, reward=%s",
+                    scenario_name_ref,
+                    session_id,
+                    answer[:50] if answer and len(answer) > 50 else answer,
+                    reward,
+                )
+
+                # Clean up latest pointer if it matches
+                if scenario_self._scenario_latest.get(scenario_name_ref) == session_id:
+                    del scenario_self._scenario_latest[scenario_name_ref]
+
+                return json.dumps({"reward": float(reward)})
+
+            # Register as resource with same scenario: URI
+            from fastmcp.resources.resource import FunctionResource
+
+            resource = FunctionResource.from_function(
+                fn=resource_handler,
+                uri=scenario_id,
+                name=scenario_name,
+                description=f"[Evaluate] {scenario_desc}",
+                mime_type="application/json",
+                meta=scenario_meta,
+            )
+            self._resource_manager.add_resource(resource)
+
+            logger.debug(
+                "Registered scenario '%s' as prompt and resource: %s",
+                scenario_name,
+                scenario_id,
+            )
+
+            return fn
+
+        return decorator

hud/environment/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for hud.environment module."""
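For orientation, the methods added in the scenarios.py hunk above imply a driver flow of run_scenario_setup() -> agent work -> submit() -> run_scenario_evaluate(), with the submitted answer handed back to the generator via asend(). A rough sketch under the assumption that env is an already-connected Environment with a search_cats scenario registered; the URL and answer text are placeholders:

    async def run_once(env) -> float | None:
        # Setup phase: returns the prompt produced by the scenario's first yield.
        prompt = await env.run_scenario_setup("search_cats", {"url": "https://example.com"})
        # ... hand `prompt` to an agent and collect its final answer ...
        await env.submit("search_cats", "found 3 cat images")  # stored for the evaluate phase
        # Evaluate phase: resumes the generator with the answer and returns the reward.
        return await env.run_scenario_evaluate("search_cats")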
"""Tests for hud.environment module."""