PyPI - hud-python - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

hud/__init__.py +1 -1
hud/agents/__init__.py +65 -6
hud/agents/base.py +33 -15
hud/agents/claude.py +60 -31
hud/agents/gateway.py +42 -0
hud/agents/gemini.py +15 -26
hud/agents/gemini_cua.py +6 -17
hud/agents/misc/response_agent.py +7 -0
hud/agents/openai.py +16 -29
hud/agents/openai_chat.py +3 -19
hud/agents/operator.py +5 -17
hud/agents/resolver.py +70 -0
hud/agents/tests/test_claude.py +2 -4
hud/agents/tests/test_openai.py +2 -1
hud/agents/tests/test_resolver.py +192 -0
hud/agents/types.py +148 -0
hud/cli/__init__.py +34 -3
hud/cli/build.py +37 -5
hud/cli/dev.py +11 -2
hud/cli/eval.py +51 -39
hud/cli/flows/init.py +1 -1
hud/cli/pull.py +1 -1
hud/cli/push.py +9 -2
hud/cli/tests/test_build.py +2 -2
hud/cli/tests/test_push.py +1 -1
hud/cli/utils/metadata.py +1 -1
hud/cli/utils/tests/test_metadata.py +1 -1
hud/clients/mcp_use.py +6 -1
hud/datasets/loader.py +17 -18
hud/datasets/runner.py +16 -10
hud/datasets/tests/test_loader.py +15 -15
hud/environment/__init__.py +5 -3
hud/environment/connection.py +58 -6
hud/environment/connectors/mcp_config.py +29 -1
hud/environment/environment.py +218 -77
hud/environment/router.py +175 -24
hud/environment/scenarios.py +313 -186
hud/environment/tests/test_connectors.py +10 -23
hud/environment/tests/test_environment.py +432 -0
hud/environment/tests/test_local_connectors.py +81 -40
hud/environment/tests/test_scenarios.py +820 -14
hud/eval/context.py +63 -10
hud/eval/instrument.py +4 -2
hud/eval/manager.py +79 -12
hud/eval/task.py +36 -4
hud/eval/tests/test_eval.py +1 -1
hud/eval/tests/test_task.py +147 -1
hud/eval/types.py +2 -0
hud/eval/utils.py +14 -3
hud/patches/mcp_patches.py +178 -21
hud/telemetry/instrument.py +8 -1
hud/telemetry/tests/test_eval_telemetry.py +8 -8
hud/tools/__init__.py +2 -0
hud/tools/agent.py +223 -0
hud/tools/computer/__init__.py +34 -5
hud/tools/shell.py +3 -3
hud/tools/tests/test_agent_tool.py +355 -0
hud/types.py +62 -34
hud/utils/hud_console.py +30 -17
hud/utils/strict_schema.py +1 -1
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
{hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
{hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
{hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0

hud/environment/scenarios.py CHANGED Viewed

@@ -5,8 +5,9 @@ from __future__ import annotations
 import inspect
 import json
 import logging
-import uuid
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, get_type_hints
+from pydantic import BaseModel, ConfigDict
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator, Callable
@@ -15,11 +16,28 @@ if TYPE_CHECKING:
     from fastmcp.resources import ResourceManager
     from fastmcp.tools import ToolManager
-__all__ = ["ScenarioMixin"]
+__all__ = ["ScenarioMixin", "ScenarioSession"]
 logger = logging.getLogger(__name__)
+class ScenarioSession(BaseModel):
+    """Tracks an active scenario from setup through evaluate.
+    Created during run_scenario_setup(), used by submit() and run_scenario_evaluate().
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    local_name: str  # Canonical short name (e.g., "investigate")
+    full_name: str  # Full name as called (e.g., "sentry-agent:investigate")
+    is_local: bool  # True if running locally (generator exists)
+    connection_name: str | None  # Which connection served it (if remote)
+    resource_uri: str  # Full URI for reading evaluation result
+    generator: Any | None = None  # AsyncGenerator (if local) - Any to avoid validation issues
+    answer: str | None = None  # Submitted answer
 class ScenarioMixin:
     """Mixin providing @env.scenario decorator for setup/evaluate phases.
@@ -45,24 +63,25 @@ class ScenarioMixin:
             yield float(result > 0 or "found" in answer.lower())
     """
-    # These come from Environment/MCPServer
+    # These come from Environment/MCPServer (type hints for mixin)
     name: str
     _prompt_manager: PromptManager
     _resource_manager: ResourceManager
     _tool_manager: ToolManager
-    # Scenario state
+    # Scenario function registry
     _scenarios: dict[str, Callable[..., AsyncGenerator[Any, Any]]]
-    _scenario_sessions: dict[str, AsyncGenerator[Any, Any]]  # session_id -> generator
-    _scenario_latest: dict[str, str]  # scenario_name -> latest session_id
-    _scenario_answers: dict[str, str]  # scenario_name -> submitted answer
+    # Single active scenario session - used for BOTH:
+    # - Client-side: when we run scenarios (local or remote)
+    # - Server-side: when external clients call our scenarios via MCP
+    # Only one scenario can be active at a time.
+    _active_session: ScenarioSession | None
     def _init_scenarios(self) -> None:
         """Initialize scenario state. Called from Environment.__init__."""
         self._scenarios = {}
-        self._scenario_sessions = {}
-        self._scenario_latest = {}
-        self._scenario_answers = {}
+        self._active_session = None
         # Register _hud_submit tool (underscore = hidden from agent)
         self._register_hud_submit_tool()
@@ -70,35 +89,41 @@ class ScenarioMixin:
     async def submit(self, scenario: str, answer: str) -> None:
         """Submit the agent's answer for a scenario's evaluate phase.
-        This stores the answer locally and broadcasts to connected hubs
-        that have the _hud_submit tool (auto-detected by Environment).
+        Uses _active_session to route to the correct connection (if remote)
+        or store locally (if local scenario).
         Args:
-            scenario: Name of the scenario (without env prefix)
+            scenario: Name of the scenario (may include env prefix like "env:name")
             answer: The agent's answer/result to submit
+        """
+        local_name = scenario.split(":")[-1] if ":" in scenario else scenario
-        Example:
-            # Direct call with scenario name
-            await env.submit("checkout", "Order completed successfully")
+        if not self._active_session:
+            raise ValueError(
+                "No active scenario session. Call run_scenario_setup() before submit()."
+            )
-            # Or via EvalContext (knows its own scenario)
-            await ctx.submit("Order completed successfully")
-        """
-        # Store locally for our scenarios
-        self._scenario_answers[scenario] = answer
-        logger.debug(
-            "Stored answer for scenario '%s': %s...",
-            scenario,
-            answer[:50] if len(answer) > 50 else answer,
-        )
-        # Broadcast to connections that have _hud_submit
-        # Environment._broadcast_tool auto-filters to connections with the tool
-        await self._broadcast_tool(  # type: ignore[attr-defined]
-            "_hud_submit",
-            scenario=scenario,
-            answer=answer,
-        )
+        if self._active_session.local_name != local_name:
+            raise ValueError(
+                f"Scenario mismatch: active session is '{self._active_session.local_name}', "
+                f"but submit() called with '{local_name}'"
+            )
+        self._active_session.answer = answer
+        logger.debug("Stored answer in session for scenario '%s'", local_name)
+        if not self._active_session.is_local:
+            # Remote scenario - send to specific connection
+            conn_name = self._active_session.connection_name
+            if not conn_name:
+                raise ValueError(f"Remote scenario '{local_name}' has no connection")
+            conn = self._connections.get(conn_name)  # type: ignore[attr-defined]
+            if not conn or not conn.client:
+                raise ValueError(f"Connection '{conn_name}' not available")
+            await conn.call_tool("_hud_submit", {"scenario": local_name, "answer": answer})
+            logger.debug("Sent answer to connection '%s' for scenario '%s'", conn_name, local_name)
     def _register_hud_submit_tool(self) -> None:
         """Register the _hud_submit tool for receiving agent answers.
@@ -110,22 +135,33 @@ class ScenarioMixin:
         scenario_self = self
         async def _hud_submit(scenario: str, answer: str) -> str:
-            """Submit the agent's answer for a scenario's evaluate phase.
+            """Receive an agent's answer from an external client.
-            Internal tool - called by Environment.submit() on connected hubs.
+            Called when an external client's Environment.submit() sends an answer
+            to us via MCP. Stores in _active_session for resource_handler to use.
             Args:
-                scenario: Name of the scenario (without env prefix)
+                scenario: Name of the scenario (may include env prefix like "env:name")
                 answer: The agent's answer/result to submit
             """
-            # Store locally (don't broadcast - we ARE the target)
-            scenario_self._scenario_answers[scenario] = answer
+            local_name = scenario.split(":")[-1] if ":" in scenario else scenario
+            if not scenario_self._active_session:
+                raise ValueError(f"No active scenario session for '{local_name}'")
+            if scenario_self._active_session.local_name != local_name:
+                raise ValueError(
+                    f"Scenario mismatch: active is '{scenario_self._active_session.local_name}', "
+                    f"but received answer for '{local_name}'"
+                )
+            scenario_self._active_session.answer = answer
             logger.debug(
-                "_hud_submit received answer for scenario '%s': %s...",
-                scenario,
+                "_hud_submit stored answer for scenario '%s': %s...",
+                local_name,
                 answer[:50] if len(answer) > 50 else answer,
             )
-            return f"Answer submitted for scenario '{scenario}'"
+            return f"Answer submitted for scenario '{local_name}'"
         # Register the tool with underscore name
         tool = Tool.from_function(_hud_submit)
@@ -136,33 +172,58 @@ class ScenarioMixin:
         """Run a scenario's setup phase and return the prompt.
         Handles both local scenarios (registered via @env.scenario) and remote
-        scenarios (via MCP prompt).
+        scenarios (via MCP prompt). Creates _active_session for use by submit/evaluate.
         Args:
-            scenario_name: Name of the scenario to run
+            scenario_name: Name of the scenario to run (may include "env:" prefix)
             args: Arguments to pass to the scenario
         Returns:
             The prompt string from the scenario's setup phase, or None if failed
         """
-        # Check if scenario is registered locally
-        if scenario_name in self._scenarios:
+        # Determine if this should be local or remote:
+        # - No prefix ("greet") → check local first
+        # - Prefix matches our env name ("my-env:greet" when self.name="my-env") → local
+        # - Prefix is different ("other-env:greet") → remote only
+        local_name: str | None = None
+        is_explicitly_remote = False
+        if ":" in scenario_name:
+            prefix, short_name = scenario_name.rsplit(":", 1)
+            # self.name is already normalized (underscores → hyphens) in Environment.__init__
+            if prefix == self.name:
+                # Prefix matches our env - check local
+                local_name = short_name
+            else:
+                # Different prefix - explicitly remote
+                local_name = short_name
+                is_explicitly_remote = True
+        else:
+            # No prefix - check local
+            local_name = scenario_name
+        # Check if scenario is registered locally (unless explicitly remote)
+        if not is_explicitly_remote and local_name in self._scenarios:
             # Local scenario - run setup via generator
-            scenario_fn = self._scenarios[scenario_name]
+            scenario_fn = self._scenarios[local_name]
             gen = scenario_fn(**args)
             # Run setup phase (code before first yield)
             prompt = await gen.__anext__()
-            # Store generator for evaluate phase
-            session_id = uuid.uuid4().hex[:8]
-            self._scenario_sessions[session_id] = gen
-            self._scenario_latest[scenario_name] = session_id
+            # Create session for local scenario
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=True,
+                connection_name=None,
+                resource_uri=f"{self.name}:{local_name}",
+                generator=gen,
+            )
             logger.debug(
-                "Scenario %s setup complete, session=%s",
-                scenario_name,
-                session_id,
+                "Local scenario setup: %s (session=%s)",
+                local_name,
+                self._active_session,
             )
             return str(prompt)
         else:
@@ -171,27 +232,50 @@ class ScenarioMixin:
             # Otherwise, prefix with env name: {env_name}:{scenario_name}
             if ":" in scenario_name:
                 prompt_id = scenario_name
-                logger.debug("Remote scenario (already namespaced): prompt_id=%s", prompt_id)
             else:
+                # Use _source_env_name (from EvalContext) or self.name - both are normalized
                 env_name = getattr(self, "_source_env_name", None) or self.name
-                safe_env_name = env_name.replace("_", "-")
-                prompt_id = f"{safe_env_name}:{scenario_name}"
-                logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
+                prompt_id = f"{env_name}:{scenario_name}"
+            # Serialize args for MCP prompt (only supports string values)
+            serialized_args: dict[str, str] = {}
+            for key, value in args.items():
+                serialized_args[key] = value if isinstance(value, str) else json.dumps(value)
             try:
-                result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
+                result = await self.get_prompt(prompt_id, serialized_args)  # type: ignore[attr-defined]
+                # Get connection AFTER get_prompt succeeds (routing is now guaranteed built)
+                conn_name = self._router.get_prompt_connection(prompt_id)  # type: ignore[attr-defined]
+                logger.debug(
+                    "Remote scenario: prompt_id=%s, connection=%s",
+                    prompt_id,
+                    conn_name or "(not found in router)",
+                )
             except Exception as e:
                 # Fetch available scenarios for error context
                 try:
                     prompts = await self.list_prompts()  # type: ignore[attr-defined]
                     scenario_prompts = [p.name for p in prompts if ":" in p.name]
-                    available = (
-                        "\n    ".join(scenario_prompts) if scenario_prompts else "(none found)"
-                    )
+                    available = "\n    ".join(scenario_prompts) if scenario_prompts else "(none)"
                 except Exception:
-                    available = "(could not fetch available scenarios)"
+                    available = "(could not fetch)"
+                    scenario_prompts = []
+                original_error = str(e)
+                if prompt_id in scenario_prompts:
+                    raise ValueError(
+                        f"⚠️ ERROR: Scenario '{prompt_id}' exists but failed to execute.\n\n"
+                        f"The scenario was found but encountered an error during setup:\n"
+                        f"  {original_error}\n\n"
+                        f"This could be caused by:\n"
+                        f"  - Missing or invalid scenario arguments\n"
+                        f"  - An error in the scenario's setup function\n"
+                        f"  - Connection or serialization issues\n\n"
+                        f"Check the scenario definition and required arguments."
+                    ) from e
                 raise ValueError(
-                    f"Scenario not found.\n\n"
+                    f"⚠️ ERROR: Scenario not found.\n\n"
                     f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
                     f"If you only specify 'scenario_name', the SDK uses your task's env name "
                     f"as the prefix.\n"
@@ -203,35 +287,46 @@ class ScenarioMixin:
                     f"Fix: Use one of the scenario IDs above in your task JSON."
                 ) from e
-            # Validate the response (outside try/except so errors aren't wrapped)
+            # Extract prompt text from response
+            prompt_text: str | None = None
             if result.messages:
                 first_msg = result.messages[0]
                 content = first_msg.content
                 if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
-                    return content.text  # type: ignore[union-attr]
+                    prompt_text = content.text  # type: ignore[union-attr]
                 elif isinstance(content, str):
-                    return content
-                else:
-                    # Content exists but is neither text object nor string
-                    raise ValueError(
-                        f"Scenario '{scenario_name}' returned malformed content.\n\n"
-                        f"Expected: content with .text attribute (str) or content as str\n"
-                        f"Got: {type(content).__name__}\n\n"
-                        f"Check that the scenario's setup function returns a valid prompt."
-                    )
-            else:
-                # get_prompt succeeded but returned empty messages
+                    prompt_text = content
+            if not prompt_text:
                 raise ValueError(
                     f"Scenario '{scenario_name}' returned an empty response.\n\n"
                     f"The scenario's setup function was called but returned no messages.\n"
                     f"Check that the scenario returns a valid prompt string."
                 )
+            # Create session for remote scenario - use router's connection info
+            self._active_session = ScenarioSession(
+                local_name=local_name,
+                full_name=scenario_name,
+                is_local=False,
+                connection_name=conn_name,
+                resource_uri=prompt_id,  # Resource has same URI as prompt
+                generator=None,
+            )
+            logger.debug(
+                "Remote scenario setup: %s (connection=%s)",
+                prompt_id,
+                conn_name,
+            )
+            return prompt_text
     async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
         """Run a scenario's evaluate phase and return the reward.
-        Uses the submitted answer (if any) via gen.asend().
-        Handles both local and remote scenarios.
+        Uses _active_session created by run_scenario_setup():
+        - Local: use stored generator with submitted answer
+        - Remote: read resource from the connection that served setup
         Args:
             scenario_name: Name of the scenario to evaluate
@@ -239,56 +334,55 @@ class ScenarioMixin:
         Returns:
             The reward from the scenario's evaluate phase, or None if failed
         """
-        # Check if we have a stored generator (local scenario)
-        session_id = self._scenario_latest.get(scenario_name)
-        if session_id:
-            gen = self._scenario_sessions.pop(session_id, None)
-            if gen:
-                # Get submitted answer (if any)
-                answer = self._scenario_answers.pop(scenario_name, None)
+        if not self._active_session:
+            logger.warning("No active session for scenario '%s'", scenario_name)
+            return None
-                try:
-                    # Use asend to pass the answer to the scenario
-                    reward = await gen.asend(answer)
-                    logger.debug(
-                        "Scenario %s evaluate complete, answer=%s, reward=%s",
-                        scenario_name,
-                        answer[:50] if answer and len(answer) > 50 else answer,
-                        reward,
-                    )
-                    return float(reward)
-                except StopAsyncIteration:
-                    # Generator ended without second yield - assume success
-                    return 1.0
-                finally:
-                    # Clean up latest pointer
-                    if self._scenario_latest.get(scenario_name) == session_id:
-                        del self._scenario_latest[scenario_name]
-        # Remote scenario - read via MCP resource
-        # If scenario_name already contains ":", it's already namespaced - use directly
-        if ":" in scenario_name:
-            resource_id = scenario_name
+        session = self._active_session
+        self._active_session = None  # Clear after use
+        if session.is_local:
+            # Local scenario - use generator
+            if not session.generator:
+                logger.warning("Local scenario '%s' has no generator", session.local_name)
+                return None
+            answer = session.answer
+            try:
+                reward = await session.generator.asend(answer)
+                logger.debug(
+                    "Local scenario %s evaluate: answer=%s, reward=%s",
+                    session.local_name,
+                    answer[:50] if answer and len(answer) > 50 else answer,
+                    reward,
+                )
+                return float(reward)
+            except StopAsyncIteration:
+                return 1.0
         else:
-            env_name = getattr(self, "_source_env_name", None) or self.name
-            safe_env_name = env_name.replace("_", "-")
-            resource_id = f"{safe_env_name}:{scenario_name}"
-        try:
-            contents = await self.read_resource(resource_id)  # type: ignore[attr-defined]
-            if contents:
-                first_content = contents[0]
-                if hasattr(first_content, "text") and isinstance(first_content.text, str):  # type: ignore[union-attr]
-                    data = json.loads(first_content.text)  # type: ignore[union-attr]
-                    if "reward" in data:
-                        return float(data["reward"])
-        except Exception as e:
-            logger.warning("Failed to get scenario reward: %s", e)
-        return None
+            # Remote scenario - read resource via router
+            try:
+                contents = await self.read_resource(session.resource_uri)  # type: ignore[attr-defined]
+                if contents:
+                    first = contents[0]
+                    if hasattr(first, "text") and isinstance(first.text, str):  # type: ignore[union-attr]
+                        data = json.loads(first.text)  # type: ignore[union-attr]
+                        if "reward" in data:
+                            logger.debug(
+                                "Remote scenario %s evaluate: reward=%s",
+                                session.local_name,
+                                data["reward"],
+                            )
+                            return float(data["reward"])
+            except Exception as e:
+                logger.warning("Failed to get scenario reward from %s: %s", session.resource_uri, e)
+            return None
     def scenario(
         self,
         name: str | None = None,
         description: str | None = None,
+        required_env_vars: list[str] | None = None,
     ) -> Callable[
         [Callable[..., AsyncGenerator[Any, None]]],
         Callable[..., AsyncGenerator[Any, None]],
@@ -303,28 +397,37 @@ class ScenarioMixin:
         Args:
             name: Optional name for the scenario (defaults to function name)
             description: Optional description of what the scenario does
+            required_env_vars: Optional list of environment variable names this scenario requires.
+                These are used by the HUD platform to check if users have configured the
+                necessary API keys/credentials before running this specific scenario.
         Example:
-            @env.scenario()
-            async def search_cats(url: str):
-                await env.call_tool("navigate", url=url)
-                yield "Find cat images"
-                result = await env.call_tool("count_cats")
-                yield float(result > 0)
+            @env.scenario(required_env_vars=["OPENAI_API_KEY"])
+            async def chat(query: str):
+                yield f"Answer this question: {query}"
+                # ... evaluate
+                yield 1.0
             # MCP client usage:
-            # 1. get_prompt("{env_name}:search_cats", {url: "..."}) -> prompt messages
+            # 1. get_prompt("{env_name}:chat", {query: "..."}) -> prompt messages
             # 2. agent runs...
-            # 3. read_resource("{env_name}:search_cats") -> {"reward": 0.95}
+            # 3. read_resource("{env_name}:chat") -> {"reward": 0.95}
         """
         def decorator(
             fn: Callable[..., AsyncGenerator[Any, None]],
         ) -> Callable[..., AsyncGenerator[Any, None]]:
             scenario_name = name or fn.__name__
-            # Sanitize env name for URI scheme (no underscores allowed)
-            safe_env_name = self.name.replace("_", "-")
-            scenario_id = f"{safe_env_name}:{scenario_name}"
+            # Validate scenario name - colons are reserved as env:scenario separator
+            if ":" in scenario_name:
+                raise ValueError(
+                    f"Scenario name '{scenario_name}' cannot contain ':' "
+                    "(reserved as separator between environment and scenario names)"
+                )
+            # self.name is already normalized (lowercase, hyphens) by Environment.__init__
+            scenario_id = f"{self.name}:{scenario_name}"
             scenario_desc = description or fn.__doc__ or f"Scenario: {scenario_name}"
             # Capture source code for reproducibility
@@ -353,7 +456,7 @@ class ScenarioMixin:
                     # Only include JSON-serializable defaults
                     default_val = p.default
                     if default_val is None or isinstance(
-                        default_val, (str, int, float, bool, list, dict)
+                        default_val, (str | int | float | bool | list | dict)
                     ):
                         arg_info["default"] = default_val
@@ -381,30 +484,81 @@ class ScenarioMixin:
             # Register PROMPT - runs setup, returns prompt messages
             # We need a reference to self and the outer variables
             scenario_self = self
-            scenario_fn = fn
             scenario_name_ref = scenario_name
-            async def prompt_handler(**handler_args: Any) -> list[str]:
-                # Create generator instance
-                gen = scenario_fn(**handler_args)
-                # Run setup phase (code before first yield)
-                prompt_text = await gen.__anext__()
-                # Store generator with session ID
-                session_id = uuid.uuid4().hex[:8]
-                scenario_self._scenario_sessions[session_id] = gen
-                scenario_self._scenario_latest[scenario_name_ref] = session_id
+            # Resolve parameter type hints for deserialization
+            # Use get_type_hints() to handle `from __future__ import annotations`
+            # which makes annotations lazy strings (PEP 563)
+            # MCP prompts only support string arguments, so we JSON-serialize complex types
+            # and use Pydantic TypeAdapter to properly deserialize them
+            try:
+                param_annotations = get_type_hints(fn)
+            except Exception:
+                # Fall back to raw annotations if get_type_hints fails
+                param_annotations = {
+                    p.name: p.annotation
+                    for p in sig.parameters.values()
+                    if p.annotation is not inspect.Parameter.empty
+                }
-                logger.debug(
-                    "Scenario %s setup complete, session=%s, prompt=%s",
-                    scenario_name_ref,
-                    session_id,
-                    prompt_text[:50] if isinstance(prompt_text, str) else prompt_text,
+            async def prompt_handler(**handler_args: Any) -> list[str]:
+                from pydantic import TypeAdapter
+                # Deserialize JSON-encoded arguments using Pydantic TypeAdapter
+                # MCP prompts only support string arguments, so complex types are
+                # JSON-serialized on the sending side and deserialized here
+                deserialized_args: dict[str, Any] = {}
+                for arg_name, arg_value in handler_args.items():
+                    annotation = param_annotations.get(arg_name)
+                    # Only attempt deserialization on string values
+                    if not isinstance(arg_value, str):
+                        deserialized_args[arg_name] = arg_value
+                        continue
+                    # If annotation is explicitly str, keep as string
+                    if annotation is str:
+                        deserialized_args[arg_name] = arg_value
+                        continue
+                    # If we have a non-str type annotation, use TypeAdapter
+                    if annotation is not None:
+                        try:
+                            adapter = TypeAdapter(annotation)
+                            deserialized_args[arg_name] = adapter.validate_json(arg_value)
+                            continue
+                        except Exception:  # noqa: S110
+                            pass  # Fall through to generic JSON decode
+                    # Try JSON decode for strings that look like JSON
+                    stripped = arg_value.strip()
+                    if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"):
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+                    # Try to decode if it looks like a number
+                    if stripped.lstrip("-").replace(".", "", 1).isdigit():
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+                    # Keep as string
+                    deserialized_args[arg_name] = arg_value
+                # Delegate to run_scenario_setup (consolidates client/server logic)
+                prompt_text = await scenario_self.run_scenario_setup(
+                    scenario_name_ref, deserialized_args
                 )
+                if prompt_text is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' setup returned no prompt")
                 # Return just the string - FastMCP wraps it in PromptMessage
-                # Don't return dict or it gets JSON-serialized as text content
                 return [str(prompt_text)]
             # Register prompt using FastMCP - create FunctionPrompt directly
@@ -417,6 +571,8 @@ class ScenarioMixin:
                 scenario_meta["code"] = source_code
             if prompt_args:
                 scenario_meta["arguments"] = prompt_args
+            if required_env_vars:
+                scenario_meta["required_env_vars"] = required_env_vars
             prompt = FunctionPrompt(
                 name=scenario_id,
@@ -432,40 +588,11 @@ class ScenarioMixin:
             # Register RESOURCE - runs evaluate, returns reward
             async def resource_handler() -> str:
-                # Get latest session for this scenario
-                session_id = scenario_self._scenario_latest.get(scenario_name_ref)
-                if not session_id:
-                    raise ValueError(
-                        f"No active session for scenario '{scenario_name_ref}'. "
-                        "Call the prompt first to run setup."
-                    )
-                gen = scenario_self._scenario_sessions.pop(session_id, None)
-                if gen is None:
-                    raise ValueError(f"Session '{session_id}' not found or already evaluated.")
-                # Get submitted answer (if any)
-                answer = scenario_self._scenario_answers.pop(scenario_name_ref, None)
-                # Run evaluate phase (code after first yield)
-                # Use asend to pass the answer (or None if not submitted)
-                try:
-                    reward = await gen.asend(answer)
-                except StopAsyncIteration:
-                    # Generator ended without second yield - assume success
-                    reward = 1.0
-                logger.debug(
-                    "Scenario %s evaluate complete, session=%s, answer=%s, reward=%s",
-                    scenario_name_ref,
-                    session_id,
-                    answer[:50] if answer and len(answer) > 50 else answer,
-                    reward,
-                )
+                # Delegate to run_scenario_evaluate (consolidates client/server logic)
+                reward = await scenario_self.run_scenario_evaluate(scenario_name_ref)
-                # Clean up latest pointer if it matches
-                if scenario_self._scenario_latest.get(scenario_name_ref) == session_id:
-                    del scenario_self._scenario_latest[scenario_name_ref]
+                if reward is None:
+                    raise ValueError(f"Scenario '{scenario_name_ref}' evaluation failed")
                 return json.dumps({"reward": float(reward)})

hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl