hud-python 0.4.45-py3-none-any.whl → 0.5.13-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/environment/scenarios.py

@@ -0,0 +1,620 @@
+"""Scenario decorator for Environment - defines setup/evaluate phases."""
+
+from __future__ import annotations
+
+import inspect
+import json
+import logging
+from typing import TYPE_CHECKING, Any, get_type_hints
+
+from pydantic import BaseModel, ConfigDict
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Callable
+
+    from fastmcp.prompts import PromptManager
+    from fastmcp.resources import ResourceManager
+    from fastmcp.tools import ToolManager
+
+__all__ = ["ScenarioMixin", "ScenarioSession"]
+
+logger = logging.getLogger(__name__)
+
+
+class ScenarioSession(BaseModel):
+    """Tracks an active scenario from setup through evaluate.
+
+    Created during run_scenario_setup(), used by submit() and run_scenario_evaluate().
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    local_name: str  # Canonical short name (e.g., "investigate")
+    full_name: str  # Full name as called (e.g., "sentry-agent:investigate")
+    is_local: bool  # True if running locally (generator exists)
+    connection_name: str | None  # Which connection served it (if remote)
+    resource_uri: str  # Full URI for reading evaluation result
+    generator: Any | None = None  # AsyncGenerator (if local) - Any to avoid validation issues
+    answer: str | None = None  # Submitted answer
+
+
+class ScenarioMixin:
+    """Mixin providing @env.scenario decorator for setup/evaluate phases.
+
+    Scenarios are async generators that yield twice:
+    - First yield: prompt string (setup phase)
+    - Second yield: reward float (evaluate phase)
+
+    The scenario can receive the agent's answer via yield:
+        answer = yield "Do the task"
+        yield 1.0 if "success" in answer else 0.0
+
+    The answer is passed via the hud_submit tool or ctx.submit().
+
+    The decorator registers both an MCP prompt and resource with the same
+    identifier ({env_name}:{scenario_name}), linked by session state.
+
+    Example:
+        @env.scenario()
+        async def search_cats(url: str):
+            await env.call_tool("navigate", url=url)
+            answer = yield "Find all cat images on the page"
+            result = await env.call_tool("count_cats")
+            yield float(result > 0 or "found" in answer.lower())
+    """
+
+    # These come from Environment/MCPServer (type hints for mixin)
+    name: str
+    _prompt_manager: PromptManager
+    _resource_manager: ResourceManager
+    _tool_manager: ToolManager
+
+    # Scenario function registry
+    _scenarios: dict[str, Callable[..., AsyncGenerator[Any, Any]]]
+
+    # Single active scenario session - used for BOTH:
+    # - Client-side: when we run scenarios (local or remote)
+    # - Server-side: when external clients call our scenarios via MCP
+    # Only one scenario can be active at a time.
+    _active_session: ScenarioSession | None
+
+    def _init_scenarios(self) -> None:
+        """Initialize scenario state. Called from Environment.__init__."""
+        self._scenarios = {}
+        self._active_session = None
+
+        # Register _hud_submit tool (underscore = hidden from agent)
+        self._register_hud_submit_tool()
+
+    async def submit(self, scenario: str, answer: str) -> None:
+        """Submit the agent's answer for a scenario's evaluate phase.
+
+        Uses _active_session to route to the correct connection (if remote)
+        or store locally (if local scenario).
+
+        Args:
+            scenario: Name of the scenario (may include env prefix like "env:name")
+            answer: The agent's answer/result to submit
+        """
+        local_name = scenario.split(":")[-1] if ":" in scenario else scenario
+
+        if not self._active_session:
+            raise ValueError(
+                "No active scenario session. Call run_scenario_setup() before submit()."
+            )
+
+        if self._active_session.local_name != local_name:
+            raise ValueError(
+                f"Scenario mismatch: active session is '{self._active_session.local_name}', "
+                f"but submit() called with '{local_name}'"
+            )
+
+        self._active_session.answer = answer
+        logger.debug("Stored answer in session for scenario '%s'", local_name)
+
+        if not self._active_session.is_local:
+            # Remote scenario - send to specific connection
+            conn_name = self._active_session.connection_name
+            if not conn_name:
+                raise ValueError(f"Remote scenario '{local_name}' has no connection")
+
+            conn = self._connections.get(conn_name)  # type: ignore[attr-defined]
+            if not conn or not conn.client:
+                raise ValueError(f"Connection '{conn_name}' not available")
+
+            await conn.call_tool("_hud_submit", {"scenario": local_name, "answer": answer})
+            logger.debug("Sent answer to connection '%s' for scenario '%s'", conn_name, local_name)
+
+    def _register_hud_submit_tool(self) -> None:
+        """Register the _hud_submit tool for receiving agent answers.
+
+        Named with underscore prefix to hide from agent tool listings.
+        """
+        from fastmcp.tools import Tool
+
+        scenario_self = self
+
+        async def _hud_submit(scenario: str, answer: str) -> str:
+            """Receive an agent's answer from an external client.
+
+            Called when an external client's Environment.submit() sends an answer
+            to us via MCP. Stores in _active_session for resource_handler to use.
+
+            Args:
+                scenario: Name of the scenario (may include env prefix like "env:name")
+                answer: The agent's answer/result to submit
+            """
+            local_name = scenario.split(":")[-1] if ":" in scenario else scenario
+
+            if not scenario_self._active_session:
+                raise ValueError(f"No active scenario session for '{local_name}'")
+
+            if scenario_self._active_session.local_name != local_name:
+                raise ValueError(
+                    f"Scenario mismatch: active is '{scenario_self._active_session.local_name}', "
+                    f"but received answer for '{local_name}'"
+                )
+
+            scenario_self._active_session.answer = answer
+            logger.debug(
+                "_hud_submit stored answer for scenario '%s': %s...",
+                local_name,
+                answer[:50] if len(answer) > 50 else answer,
+            )
+            return f"Answer submitted for scenario '{local_name}'"
+
+        # Register the tool with underscore name
+        tool = Tool.from_function(_hud_submit)
+        self._tool_manager.add_tool(tool)
+        logger.debug("Registered _hud_submit tool")
+
+    async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) -> str | None:
+        """Run a scenario's setup phase and return the prompt.
+
+        Handles both local scenarios (registered via @env.scenario) and remote
+        scenarios (via MCP prompt). Creates _active_session for use by submit/evaluate.
+
+        Args:
+            scenario_name: Name of the scenario to run (may include "env:" prefix)
+            args: Arguments to pass to the scenario
+
+        Returns:
+            The prompt string from the scenario's setup phase, or None if failed
+        """
+        # Determine if this should be local or remote:
+        # - No prefix ("greet") → check local first
+        # - Prefix matches our env name ("my-env:greet" when self.name="my-env") → local
+        # - Prefix is different ("other-env:greet") → remote only
+        local_name: str | None = None
+        is_explicitly_remote = False
+        if ":" in scenario_name:
+            prefix, short_name = scenario_name.rsplit(":", 1)
+            # self.name is already normalized (underscores → hyphens) in Environment.__init__
+            if prefix == self.name:
+                # Prefix matches our env - check local
+                local_name = short_name
+            else:
+                # Different prefix - explicitly remote
+                local_name = short_name
+                is_explicitly_remote = True
+        else:
+            # No prefix - check local
+            local_name = scenario_name
+
+        # Check if scenario is registered locally (unless explicitly remote)
+        if not is_explicitly_remote and local_name in self._scenarios:
+            # Local scenario - run setup via generator
+            scenario_fn = self._scenarios[local_name]
+            gen = scenario_fn(**args)
+
+            # Run setup phase (code before first yield)
+            prompt = await gen.__anext__()
+
+            # Create session for local scenario
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=True,
+                connection_name=None,
+                resource_uri=f"{self.name}:{local_name}",
+                generator=gen,
+            )
+
+            logger.debug(
+                "Local scenario setup: %s (session=%s)",
+                local_name,
+                self._active_session,
+            )
+            return str(prompt)
+        else:
+            # Remote scenario - call via MCP prompt
+            # If scenario_name already contains ":", it's already namespaced - use directly
+            # Otherwise, prefix with env name: {env_name}:{scenario_name}
+            if ":" in scenario_name:
+                prompt_id = scenario_name
+            else:
+                # Use _source_env_name (from EvalContext) or self.name - both are normalized
+                env_name = getattr(self, "_source_env_name", None) or self.name
+                prompt_id = f"{env_name}:{scenario_name}"
+
+            # Serialize args for MCP prompt (only supports string values)
+            serialized_args: dict[str, str] = {}
+            for key, value in args.items():
+                serialized_args[key] = value if isinstance(value, str) else json.dumps(value)
+
+            try:
+                result = await self.get_prompt(prompt_id, serialized_args)  # type: ignore[attr-defined]
+                # Get connection AFTER get_prompt succeeds (routing is now guaranteed built)
+                conn_name = self._router.get_prompt_connection(prompt_id)  # type: ignore[attr-defined]
+                logger.debug(
+                    "Remote scenario: prompt_id=%s, connection=%s",
+                    prompt_id,
+                    conn_name or "(not found in router)",
+                )
+            except Exception as e:
+                # Fetch available scenarios for error context
+                try:
+                    prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                    scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                    available = "\n ".join(scenario_prompts) if scenario_prompts else "(none)"
+                except Exception:
+                    available = "(could not fetch)"
+                    scenario_prompts = []
+
+                original_error = str(e)
+                if prompt_id in scenario_prompts:
+                    raise ValueError(
+                        f"⚠️ ERROR: Scenario '{prompt_id}' exists but failed to execute.\n\n"
+                        f"The scenario was found but encountered an error during setup:\n"
+                        f" {original_error}\n\n"
+                        f"This could be caused by:\n"
+                        f" - Missing or invalid scenario arguments\n"
+                        f" - An error in the scenario's setup function\n"
+                        f" - Connection or serialization issues\n\n"
+                        f"Check the scenario definition and required arguments."
+                    ) from e
+
+                raise ValueError(
+                    f"⚠️ ERROR: Scenario not found.\n\n"
+                    f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                    f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                    f"as the prefix.\n"
+                    f"This won't work if the HUD environment was declared with a different name."
+                    f"\n\n"
+                    f" You requested: {scenario_name}\n"
+                    f" SDK looked for: {prompt_id}\n\n"
+                    f"Available scenarios:\n {available}\n\n"
+                    f"Fix: Use one of the scenario IDs above in your task JSON."
+                ) from e
+
+            # Extract prompt text from response
+            prompt_text: str | None = None
+            if result.messages:
+                first_msg = result.messages[0]
+                content = first_msg.content
+                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                    prompt_text = content.text  # type: ignore[union-attr]
+                elif isinstance(content, str):
+                    prompt_text = content
+
+            if not prompt_text:
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                    f"The scenario's setup function was called but returned no messages.\n"
+                    f"Check that the scenario returns a valid prompt string."
+                )
+
+            # Create session for remote scenario - use router's connection info
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=False,
+                connection_name=conn_name,
+                resource_uri=prompt_id,  # Resource has same URI as prompt
+                generator=None,
+            )
+
+            logger.debug(
+                "Remote scenario setup: %s (connection=%s)",
+                prompt_id,
+                conn_name,
+            )
+            return prompt_text
+
+    async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
+        """Run a scenario's evaluate phase and return the reward.
+
+        Uses _active_session created by run_scenario_setup():
+        - Local: use stored generator with submitted answer
+        - Remote: read resource from the connection that served setup
+
+        Args:
+            scenario_name: Name of the scenario to evaluate
+
+        Returns:
+            The reward from the scenario's evaluate phase, or None if failed
+        """
+        if not self._active_session:
+            logger.warning("No active session for scenario '%s'", scenario_name)
+            return None
+
+        session = self._active_session
+        self._active_session = None  # Clear after use
+
+        if session.is_local:
+            # Local scenario - use generator
+            if not session.generator:
+                logger.warning("Local scenario '%s' has no generator", session.local_name)
+                return None
+
+            answer = session.answer
+            try:
+                reward = await session.generator.asend(answer)
+                logger.debug(
+                    "Local scenario %s evaluate: answer=%s, reward=%s",
+                    session.local_name,
+                    answer[:50] if answer and len(answer) > 50 else answer,
+                    reward,
+                )
+                return float(reward)
+            except StopAsyncIteration:
+                return 1.0
+        else:
+            # Remote scenario - read resource via router
+            try:
+                contents = await self.read_resource(session.resource_uri)  # type: ignore[attr-defined]
+                if contents:
+                    first = contents[0]
+                    if hasattr(first, "text") and isinstance(first.text, str):  # type: ignore[union-attr]
+                        data = json.loads(first.text)  # type: ignore[union-attr]
+                        if "reward" in data:
+                            logger.debug(
+                                "Remote scenario %s evaluate: reward=%s",
+                                session.local_name,
+                                data["reward"],
+                            )
+                            return float(data["reward"])
+            except Exception as e:
+                logger.warning("Failed to get scenario reward from %s: %s", session.resource_uri, e)
+            return None
+
+    def scenario(
+        self,
+        name: str | None = None,
+        description: str | None = None,
+        required_env_vars: list[str] | None = None,
+    ) -> Callable[
+        [Callable[..., AsyncGenerator[Any, None]]],
+        Callable[..., AsyncGenerator[Any, None]],
+    ]:
+        """Decorator to register a scenario with setup and evaluate phases.
+
+        Creates both a prompt and resource with identifier scenario:{name}.
+        The scenario function should yield twice:
+        - First yield: the prompt string (returned from prompt)
+        - Second yield: the reward float (returned from resource)
+
+        Args:
+            name: Optional name for the scenario (defaults to function name)
+            description: Optional description of what the scenario does
+            required_env_vars: Optional list of environment variable names this scenario requires.
+                These are used by the HUD platform to check if users have configured the
+                necessary API keys/credentials before running this specific scenario.
+
+        Example:
+            @env.scenario(required_env_vars=["OPENAI_API_KEY"])
+            async def chat(query: str):
+                yield f"Answer this question: {query}"
+                # ... evaluate
+                yield 1.0
+
+            # MCP client usage:
+            # 1. get_prompt("{env_name}:chat", {query: "..."}) -> prompt messages
+            # 2. agent runs...
+            # 3. read_resource("{env_name}:chat") -> {"reward": 0.95}
+        """
+
+        def decorator(
+            fn: Callable[..., AsyncGenerator[Any, None]],
+        ) -> Callable[..., AsyncGenerator[Any, None]]:
+            scenario_name = name or fn.__name__
+
+            # Validate scenario name - colons are reserved as env:scenario separator
+            if ":" in scenario_name:
+                raise ValueError(
+                    f"Scenario name '{scenario_name}' cannot contain ':' "
+                    "(reserved as separator between environment and scenario names)"
+                )
+
+            # self.name is already normalized (lowercase, hyphens) by Environment.__init__
+            scenario_id = f"{self.name}:{scenario_name}"
+            scenario_desc = description or fn.__doc__ or f"Scenario: {scenario_name}"
+
+            # Capture source code for reproducibility
+            try:
+                source_code = inspect.getsource(fn)
+            except (OSError, TypeError) as e:
+                logger.warning(
+                    "Could not capture source code for scenario '%s': %s",
+                    scenario_name,
+                    e,
+                )
+                source_code = None
+
+            # Store the generator function
+            self._scenarios[scenario_name] = fn
+
+            # Get function signature for prompt arguments with type info
+            sig = inspect.signature(fn)
+            prompt_args: list[dict[str, Any]] = []
+            for p in sig.parameters.values():
+                is_required = p.default is inspect.Parameter.empty
+                arg_info: dict[str, Any] = {"name": p.name, "required": is_required}
+
+                # Include default value if present
+                if not is_required:
+                    # Only include JSON-serializable defaults
+                    default_val = p.default
+                    if default_val is None or isinstance(
+                        default_val, (str | int | float | bool | list | dict)
+                    ):
+                        arg_info["default"] = default_val
+
+                # Extract type annotation
+                if p.annotation is not inspect.Parameter.empty:
+                    try:
+                        # Use pydantic to convert annotation to JSON schema
+                        from pydantic import TypeAdapter
+
+                        adapter = TypeAdapter(p.annotation)
+                        param_schema = adapter.json_schema()
+                        # Extract type from schema (could be "string", "integer", etc.)
+                        if "type" in param_schema:
+                            arg_info["type"] = param_schema["type"]
+                        elif "$ref" in param_schema or "anyOf" in param_schema:
+                            # Complex type - store the full schema
+                            arg_info["inputSchema"] = param_schema
+                    except Exception:
+                        arg_info["type"] = "string"
+                else:
+                    arg_info["type"] = "string"
+
+                prompt_args.append(arg_info)
+
+            # Register PROMPT - runs setup, returns prompt messages
+            # We need a reference to self and the outer variables
+            scenario_self = self
+            scenario_name_ref = scenario_name
+
+            # Resolve parameter type hints for deserialization
+            # Use get_type_hints() to handle `from __future__ import annotations`
+            # which makes annotations lazy strings (PEP 563)
+            # MCP prompts only support string arguments, so we JSON-serialize complex types
+            # and use Pydantic TypeAdapter to properly deserialize them
+            try:
+                param_annotations = get_type_hints(fn)
+            except Exception:
+                # Fall back to raw annotations if get_type_hints fails
+                param_annotations = {
+                    p.name: p.annotation
+                    for p in sig.parameters.values()
+                    if p.annotation is not inspect.Parameter.empty
+                }
+
+            async def prompt_handler(**handler_args: Any) -> list[str]:
+                from pydantic import TypeAdapter
+
+                # Deserialize JSON-encoded arguments using Pydantic TypeAdapter
+                # MCP prompts only support string arguments, so complex types are
+                # JSON-serialized on the sending side and deserialized here
+                deserialized_args: dict[str, Any] = {}
+                for arg_name, arg_value in handler_args.items():
+                    annotation = param_annotations.get(arg_name)
+
+                    # Only attempt deserialization on string values
+                    if not isinstance(arg_value, str):
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If annotation is explicitly str, keep as string
+                    if annotation is str:
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If we have a non-str type annotation, use TypeAdapter
+                    if annotation is not None:
+                        try:
+                            adapter = TypeAdapter(annotation)
+                            deserialized_args[arg_name] = adapter.validate_json(arg_value)
+                            continue
+                        except Exception:  # noqa: S110
+                            pass  # Fall through to generic JSON decode
+
+                    # Try JSON decode for strings that look like JSON
+                    stripped = arg_value.strip()
+                    if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"):
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+
+                    # Try to decode if it looks like a number
+                    if stripped.lstrip("-").replace(".", "", 1).isdigit():
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+
+                    # Keep as string
+                    deserialized_args[arg_name] = arg_value
+
+                # Delegate to run_scenario_setup (consolidates client/server logic)
+                prompt_text = await scenario_self.run_scenario_setup(
+                    scenario_name_ref, deserialized_args
+                )
+
+                if prompt_text is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' setup returned no prompt")
+
+                # Return just the string - FastMCP wraps it in PromptMessage
+                return [str(prompt_text)]
+
+            # Register prompt using FastMCP - create FunctionPrompt directly
+            # to bypass the **kwargs validation in from_function()
+            from fastmcp.prompts.prompt import FunctionPrompt, PromptArgument
+
+            # Build meta with source code and full arguments info (with types/defaults)
+            scenario_meta: dict[str, Any] = {}
+            if source_code:
+                scenario_meta["code"] = source_code
+            if prompt_args:
+                scenario_meta["arguments"] = prompt_args
+            if required_env_vars:
+                scenario_meta["required_env_vars"] = required_env_vars
+
+            prompt = FunctionPrompt(
+                name=scenario_id,
+                description=f"[Setup] {scenario_desc}",
+                arguments=[
+                    PromptArgument(name=arg["name"], required=arg["required"])
+                    for arg in prompt_args
+                ],
+                fn=prompt_handler,
+                meta=scenario_meta if scenario_meta else None,
+            )
+            self._prompt_manager.add_prompt(prompt)
+
+            # Register RESOURCE - runs evaluate, returns reward
+            async def resource_handler() -> str:
+                # Delegate to run_scenario_evaluate (consolidates client/server logic)
+                reward = await scenario_self.run_scenario_evaluate(scenario_name_ref)
+
+                if reward is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' evaluation failed")
+
+                return json.dumps({"reward": float(reward)})
+
+            # Register as resource with same scenario: URI
+            from fastmcp.resources.resource import FunctionResource
+
+            resource = FunctionResource.from_function(
+                fn=resource_handler,
+                uri=scenario_id,
+                name=scenario_name,
+                description=f"[Evaluate] {scenario_desc}",
+                mime_type="application/json",
+                meta=scenario_meta,
+            )
+            self._resource_manager.add_resource(resource)
+
+            logger.debug(
+                "Registered scenario '%s' as prompt and resource: %s",
+                scenario_name,
+                scenario_id,
+            )
+
+            return fn
+
+        return decorator

hud/environment/tests/__init__.py

@@ -0,0 +1 @@
+"""Tests for hud.environment module."""