hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (67)
  1. hud/__init__.py +1 -1
  2. hud/agents/__init__.py +65 -6
  3. hud/agents/base.py +33 -15
  4. hud/agents/claude.py +60 -31
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +15 -26
  7. hud/agents/gemini_cua.py +6 -17
  8. hud/agents/misc/response_agent.py +7 -0
  9. hud/agents/openai.py +16 -29
  10. hud/agents/openai_chat.py +3 -19
  11. hud/agents/operator.py +5 -17
  12. hud/agents/resolver.py +70 -0
  13. hud/agents/tests/test_claude.py +2 -4
  14. hud/agents/tests/test_openai.py +2 -1
  15. hud/agents/tests/test_resolver.py +192 -0
  16. hud/agents/types.py +148 -0
  17. hud/cli/__init__.py +34 -3
  18. hud/cli/build.py +37 -5
  19. hud/cli/dev.py +11 -2
  20. hud/cli/eval.py +51 -39
  21. hud/cli/flows/init.py +1 -1
  22. hud/cli/pull.py +1 -1
  23. hud/cli/push.py +9 -2
  24. hud/cli/tests/test_build.py +2 -2
  25. hud/cli/tests/test_push.py +1 -1
  26. hud/cli/utils/metadata.py +1 -1
  27. hud/cli/utils/tests/test_metadata.py +1 -1
  28. hud/clients/mcp_use.py +6 -1
  29. hud/datasets/loader.py +17 -18
  30. hud/datasets/runner.py +16 -10
  31. hud/datasets/tests/test_loader.py +15 -15
  32. hud/environment/__init__.py +5 -3
  33. hud/environment/connection.py +58 -6
  34. hud/environment/connectors/mcp_config.py +29 -1
  35. hud/environment/environment.py +218 -77
  36. hud/environment/router.py +175 -24
  37. hud/environment/scenarios.py +313 -186
  38. hud/environment/tests/test_connectors.py +10 -23
  39. hud/environment/tests/test_environment.py +432 -0
  40. hud/environment/tests/test_local_connectors.py +81 -40
  41. hud/environment/tests/test_scenarios.py +820 -14
  42. hud/eval/context.py +63 -10
  43. hud/eval/instrument.py +4 -2
  44. hud/eval/manager.py +79 -12
  45. hud/eval/task.py +36 -4
  46. hud/eval/tests/test_eval.py +1 -1
  47. hud/eval/tests/test_task.py +147 -1
  48. hud/eval/types.py +2 -0
  49. hud/eval/utils.py +14 -3
  50. hud/patches/mcp_patches.py +178 -21
  51. hud/telemetry/instrument.py +8 -1
  52. hud/telemetry/tests/test_eval_telemetry.py +8 -8
  53. hud/tools/__init__.py +2 -0
  54. hud/tools/agent.py +223 -0
  55. hud/tools/computer/__init__.py +34 -5
  56. hud/tools/shell.py +3 -3
  57. hud/tools/tests/test_agent_tool.py +355 -0
  58. hud/types.py +62 -34
  59. hud/utils/hud_console.py +30 -17
  60. hud/utils/strict_schema.py +1 -1
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
  64. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
  65. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
  66. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  67. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
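
The diff body below is hud/environment/tests/test_scenarios.py (entry 41, +820 -14), which exercises the reworked scenario session API. As orientation, here is a minimal sketch of the lifecycle those tests walk through; it is inferred from the test code itself (Environment, @env.scenario, run_scenario_setup, submit, run_scenario_evaluate), not from package documentation, so treat it as illustrative rather than authoritative.

import asyncio

from hud.environment import Environment

env = Environment("demo-env")


@env.scenario("qa")
async def qa_scenario():
    # First yield: the prompt given to the agent.
    answer = yield "What is 2+2?"
    # Second yield: the reward computed from the submitted answer.
    yield 1.0 if answer == "4" else 0.0


async def main() -> None:
    # Setup creates the active session and returns the prompt.
    prompt = await env.run_scenario_setup("qa", {})
    print(prompt)

    # The agent's answer is stored on the active session.
    await env.submit("qa", "4")

    # Evaluate resumes the generator with the answer and returns the reward.
    reward = await env.run_scenario_evaluate("qa")
    print(reward)  # 1.0


asyncio.run(main())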
hud/environment/tests/test_scenarios.py
@@ -2,11 +2,55 @@

  from __future__ import annotations

+ from datetime import datetime
+ from enum import Enum
+ from typing import Any
+
  import pytest
+ from pydantic import BaseModel

  from hud.environment import Environment


+ # Module-level models for Pydantic/Enum/datetime deserialization tests
+ # (prefixed with underscore to avoid pytest collection warnings)
+ class _UserConfig(BaseModel):
+     """Pydantic model for testing."""
+
+     name: str
+     age: int
+     active: bool = True
+
+
+ class _Status(Enum):
+     """Enum for testing."""
+
+     PENDING = "pending"
+     ACTIVE = "active"
+     COMPLETED = "completed"
+
+
+ class _Address(BaseModel):
+     """Nested Pydantic model for testing."""
+
+     street: str
+     city: str
+
+
+ class _Person(BaseModel):
+     """Pydantic model with nested model for testing."""
+
+     name: str
+     address: _Address
+
+
+ class _Item(BaseModel):
+     """Pydantic model for list tests."""
+
+     id: int
+     name: str
+
+
  class TestScenarioDecorator:
      """Tests for @env.scenario decorator."""

@@ -110,8 +154,9 @@ class TestScenarioExecution:
          assert prompt is not None
          await prompt.render({})

-         # Check session was stored
-         assert "test" in env._scenario_latest
+         # Check session was stored in _active_session
+         assert env._active_session is not None
+         assert env._active_session.local_name == "test"

      @pytest.mark.asyncio
      async def test_scenario_full_flow(self) -> None:
@@ -170,7 +215,7 @@ class TestScenarioSubmit:

      @pytest.mark.asyncio
      async def test_submit_stores_answer(self) -> None:
-         """submit() stores answer for scenario."""
+         """submit() stores answer in active session."""
          env = Environment("test-env")

          @env.scenario("test")
@@ -178,15 +223,15 @@ class TestScenarioSubmit:
              yield "What is 2+2?"
              yield 1.0

-         # Run setup
-         prompt = env._prompt_manager._prompts.get("test-env:test")
-         assert prompt is not None
-         await prompt.render({})
+         # Run setup via proper API (creates _active_session)
+         await env.run_scenario_setup("test", {})

          # Submit answer
          await env.submit("test", "4")

-         assert env._scenario_answers.get("test") == "4"
+         # Answer is stored in active session (not _scenario_answers for client-side)
+         assert env._active_session is not None
+         assert env._active_session.answer == "4"

      @pytest.mark.asyncio
      async def test_scenario_receives_answer(self) -> None:
@@ -201,13 +246,14 @@ class TestScenarioSubmit:
              received_answer = answer
              yield 1.0 if answer == "4" else 0.0

-         # Run setup
+         # Run setup (creates _active_session)
          prompt = env._prompt_manager._prompts.get("test-env:qa")
          assert prompt is not None
          await prompt.render({})

-         # Submit answer
-         env._scenario_answers["qa"] = "4"
+         # Submit answer via _active_session
+         assert env._active_session is not None
+         env._active_session.answer = "4"

          # Run evaluate
          resource = env._resource_manager._resources.get("test-env:qa")
@@ -226,13 +272,14 @@ class TestScenarioSubmit:
              answer = yield "What is the capital of France?"
              yield 1.0 if "paris" in answer.lower() else 0.0

-         # Run setup
+         # Run setup (creates _active_session)
          prompt = env._prompt_manager._prompts.get("test-env:grading")
          assert prompt is not None
          await prompt.render({})

-         # Submit correct answer
-         env._scenario_answers["grading"] = "Paris"
+         # Submit correct answer via _active_session
+         assert env._active_session is not None
+         env._active_session.answer = "Paris"

          # Run evaluate
          resource = env._resource_manager._resources.get("test-env:grading")
@@ -244,6 +291,46 @@ class TestScenarioSubmit:
          data = json.loads(result)
          assert data["reward"] == 1.0

+     @pytest.mark.asyncio
+     async def test_hud_submit_normalizes_prefixed_scenario_name(self) -> None:
+         """_hud_submit with prefixed name stores answer in _active_session.
+
+         Regression test: answers submitted with "env:scenario" prefix must
+         match the active session's local_name for storage.
+         """
+         env = Environment("my-env")
+
+         @env.scenario("greet")
+         async def greet_scenario():
+             answer = yield "Say hello"
+             yield 1.0 if answer == "hello" else 0.0
+
+         # Run setup via prompt (creates _active_session)
+         prompt = env._prompt_manager._prompts.get("my-env:greet")
+         assert prompt is not None
+         await prompt.render({})
+
+         # Verify session exists before _hud_submit
+         assert env._active_session is not None
+         assert env._active_session.local_name == "greet"
+
+         # Simulate _hud_submit with PREFIXED scenario name (as happens in remote calls)
+         # This should normalize to "greet" and match the active session
+         await env.call_tool("_hud_submit", scenario="my-env:greet", answer="hello")
+
+         # Verify answer was stored in _active_session
+         assert env._active_session.answer == "hello"
+
+         # Verify evaluation works
+         resource = env._resource_manager._resources.get("my-env:greet")
+         assert resource is not None
+         result = await resource.read()
+
+         import json
+
+         data = json.loads(result)
+         assert data["reward"] == 1.0
+

  class TestScenarioMeta:
      """Tests for scenario _meta containing code."""
@@ -278,3 +365,722 @@ class TestScenarioMeta:
          assert resource.meta is not None
          assert "code" in resource.meta
          assert "async def example_scenario" in resource.meta["code"]
+
+
+ class TestScenarioJsonSerialization:
+     """Tests for JSON serialization of complex argument types.
+
+     MCP prompts only support string arguments (dict[str, str]).
+     Complex types like lists, dicts, and numbers are JSON-serialized
+     when sent and deserialized based on type annotations when received.
+     """
+
+     @pytest.mark.asyncio
+     async def test_list_argument_deserialization(self) -> None:
+         """List arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_items: list[str] = []
+
+         @env.scenario("process_items")
+         async def process_items_scenario(items: list[str]):
+             received_items.extend(items)
+             yield f"Processing {len(items)} items"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:process_items")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded list as string
+         await prompt.render({"items": '["apple", "banana", "cherry"]'})
+
+         assert received_items == ["apple", "banana", "cherry"]
+
+     @pytest.mark.asyncio
+     async def test_dict_argument_deserialization(self) -> None:
+         """Dict arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_config: dict[str, Any] = {}
+
+         @env.scenario("configure")
+         async def configure_scenario(config: dict[str, Any]):
+             received_config.update(config)
+             yield "Configuring..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:configure")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded dict as string
+         await prompt.render({"config": '{"timeout": 30, "retries": 3}'})
+
+         assert received_config == {"timeout": 30, "retries": 3}
+
+     @pytest.mark.asyncio
+     async def test_int_argument_deserialization(self) -> None:
+         """Integer arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_count = 0
+
+         @env.scenario("count")
+         async def count_scenario(count: int):
+             nonlocal received_count
+             received_count = count
+             yield f"Counting to {count}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:count")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded int as string
+         await prompt.render({"count": "42"})
+
+         assert received_count == 42
+         assert isinstance(received_count, int)
+
+     @pytest.mark.asyncio
+     async def test_float_argument_deserialization(self) -> None:
+         """Float arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_value = 0.0
+
+         @env.scenario("precision")
+         async def precision_scenario(value: float):
+             nonlocal received_value
+             received_value = value
+             yield f"Value is {value}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:precision")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded float as string
+         await prompt.render({"value": "3.14159"})
+
+         assert received_value == 3.14159
+         assert isinstance(received_value, float)
+
+     @pytest.mark.asyncio
+     async def test_bool_argument_deserialization(self) -> None:
+         """Boolean arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_flag = False
+
+         @env.scenario("toggle")
+         async def toggle_scenario(enabled: bool):
+             nonlocal received_flag
+             received_flag = enabled
+             yield f"Enabled: {enabled}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:toggle")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded bool as string
+         await prompt.render({"enabled": "true"})
+
+         assert received_flag is True
+         assert isinstance(received_flag, bool)
+
+     @pytest.mark.asyncio
+     async def test_string_argument_unchanged(self) -> None:
+         """String arguments are passed through unchanged."""
+         env = Environment("test-env")
+         received_name = ""
+
+         @env.scenario("greet")
+         async def greet_scenario(name: str):
+             nonlocal received_name
+             received_name = name
+             yield f"Hello, {name}!"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:greet")
+         assert prompt is not None
+
+         # String should pass through as-is (not double-encoded)
+         await prompt.render({"name": "Alice"})
+
+         assert received_name == "Alice"
+
+     @pytest.mark.asyncio
+     async def test_mixed_argument_types(self) -> None:
+         """Mixed argument types are handled correctly."""
+         env = Environment("test-env")
+         received_args: dict[str, Any] = {}
+
+         @env.scenario("mixed")
+         async def mixed_scenario(
+             name: str,
+             count: int,
+             items: list[str],
+             options: dict[str, bool],
+         ):
+             received_args["name"] = name
+             received_args["count"] = count
+             received_args["items"] = items
+             received_args["options"] = options
+             yield "Processing..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:mixed")
+         assert prompt is not None
+
+         await prompt.render(
+             {
+                 "name": "test",
+                 "count": "5",
+                 "items": '["a", "b", "c"]',
+                 "options": '{"verbose": true, "dry_run": false}',
+             }
+         )
+
+         assert received_args["name"] == "test"
+         assert received_args["count"] == 5
+         assert received_args["items"] == ["a", "b", "c"]
+         assert received_args["options"] == {"verbose": True, "dry_run": False}
+
+     @pytest.mark.asyncio
+     async def test_invalid_json_falls_back_to_string(self) -> None:
+         """Invalid JSON for non-string type falls back to string value."""
+         env = Environment("test-env")
+         received_items: list[str] = []
+
+         @env.scenario("fallback")
+         async def fallback_scenario(items: list[str]):
+             # This will receive the raw string if JSON parsing fails
+             received_items.append(str(items))
+             yield "Processing..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:fallback")
+         assert prompt is not None
+
+         # Invalid JSON - should fall back to string
+         await prompt.render({"items": "not valid json ["})
+
+         # Falls back to raw string
+         assert received_items == ["not valid json ["]
+
+     @pytest.mark.asyncio
+     async def test_nested_complex_types(self) -> None:
+         """Nested complex types are deserialized correctly."""
+         env = Environment("test-env")
+         received_data: dict[str, Any] = {}
+
+         @env.scenario("nested")
+         async def nested_scenario(data: dict[str, Any]):
+             received_data.update(data)
+             yield "Processing nested data..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:nested")
+         assert prompt is not None
+
+         nested_json = (
+             '{"users": [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}], '
+             '"metadata": {"version": 1}}'
+         )
+         await prompt.render({"data": nested_json})
+
+         assert received_data == {
+             "users": [
+                 {"name": "Alice", "age": 30},
+                 {"name": "Bob", "age": 25},
+             ],
+             "metadata": {"version": 1},
+         }
+
+     @pytest.mark.asyncio
+     async def test_optional_list_with_value(self) -> None:
+         """Optional[list[str]] receives list when provided."""
+         env = Environment("test-env")
+         received_items: list[str] | None = None
+
+         @env.scenario("optional_list")
+         async def optional_list_scenario(items: list[str] | None = None):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {items}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_list")
+         assert prompt is not None
+
+         await prompt.render({"items": '["x", "y", "z"]'})
+
+         assert received_items == ["x", "y", "z"]
+
+     @pytest.mark.asyncio
+     async def test_optional_list_with_null(self) -> None:
+         """Optional[list[str]] receives None when 'null' is passed."""
+         env = Environment("test-env")
+         received_items: list[str] | None = ["initial"]
+
+         @env.scenario("optional_list_null")
+         async def optional_list_null_scenario(items: list[str] | None = None):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {items}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_list_null")
+         assert prompt is not None
+
+         await prompt.render({"items": "null"})
+
+         assert received_items is None
+
+     @pytest.mark.asyncio
+     async def test_optional_str_with_value(self) -> None:
+         """Optional[str] receives string value correctly."""
+         env = Environment("test-env")
+         received_name: str | None = None
+
+         @env.scenario("optional_str")
+         async def optional_str_scenario(name: str | None = None):
+             nonlocal received_name
+             received_name = name
+             yield f"Got {name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_str")
+         assert prompt is not None
+
+         await prompt.render({"name": "Alice"})
+
+         assert received_name == "Alice"
+
+     @pytest.mark.asyncio
+     async def test_optional_str_with_null(self) -> None:
+         """Optional[str] receives None when 'null' is passed."""
+         env = Environment("test-env")
+         received_name: str | None = "initial"
+
+         @env.scenario("optional_str_null")
+         async def optional_str_null_scenario(name: str | None = None):
+             nonlocal received_name
+             received_name = name
+             yield f"Got {name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_str_null")
+         assert prompt is not None
+
+         await prompt.render({"name": "null"})
+
+         assert received_name is None
+
+     @pytest.mark.asyncio
+     async def test_pydantic_model_deserialization(self) -> None:
+         """Pydantic models are properly deserialized from JSON."""
+         env = Environment("test-env")
+         received_config: _UserConfig | None = None
+
+         @env.scenario("pydantic_model")
+         async def pydantic_model_scenario(config: _UserConfig):
+             nonlocal received_config
+             received_config = config
+             yield f"Got config for {config.name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:pydantic_model")
+         assert prompt is not None
+
+         await prompt.render({"config": '{"name": "Alice", "age": 30}'})
+
+         assert received_config is not None
+         assert isinstance(received_config, _UserConfig)
+         assert received_config.name == "Alice"
+         assert received_config.age == 30
+         assert received_config.active is True  # default value
+
+     @pytest.mark.asyncio
+     async def test_enum_deserialization(self) -> None:
+         """Enum values are properly deserialized from JSON strings."""
+         env = Environment("test-env")
+         received_status: _Status | None = None
+
+         @env.scenario("enum_status")
+         async def enum_scenario(status: _Status):
+             nonlocal received_status
+             received_status = status
+             yield f"Status is {status.value}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:enum_status")
+         assert prompt is not None
+
+         await prompt.render({"status": '"active"'})
+
+         assert received_status is not None
+         assert isinstance(received_status, _Status)
+         assert received_status == _Status.ACTIVE
+
+     @pytest.mark.asyncio
+     async def test_datetime_deserialization(self) -> None:
+         """Datetime values are properly deserialized from ISO strings."""
+         env = Environment("test-env")
+         received_dt: datetime | None = None
+
+         @env.scenario("datetime_scenario")
+         async def datetime_scenario(created_at: datetime):
+             nonlocal received_dt
+             received_dt = created_at
+             yield f"Created at {created_at}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:datetime_scenario")
+         assert prompt is not None
+
+         await prompt.render({"created_at": '"2024-06-15T10:30:00"'})
+
+         assert received_dt is not None
+         assert isinstance(received_dt, datetime)
+         assert received_dt.year == 2024
+         assert received_dt.month == 6
+         assert received_dt.day == 15
+         assert received_dt.hour == 10
+         assert received_dt.minute == 30
+
+     @pytest.mark.asyncio
+     async def test_nested_pydantic_model(self) -> None:
+         """Nested Pydantic models are properly deserialized."""
+         env = Environment("test-env")
+         received_person: _Person | None = None
+
+         @env.scenario("nested_pydantic")
+         async def nested_pydantic_scenario(person: _Person):
+             nonlocal received_person
+             received_person = person
+             yield f"Person {person.name} from {person.address.city}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:nested_pydantic")
+         assert prompt is not None
+
+         json_data = '{"name": "Bob", "address": {"street": "123 Main St", "city": "NYC"}}'
+         await prompt.render({"person": json_data})
+
+         assert received_person is not None
+         assert isinstance(received_person, _Person)
+         assert received_person.name == "Bob"
+         assert isinstance(received_person.address, _Address)
+         assert received_person.address.city == "NYC"
+
+     @pytest.mark.asyncio
+     async def test_list_of_pydantic_models(self) -> None:
+         """List of Pydantic models are properly deserialized."""
+         env = Environment("test-env")
+         received_items: list[_Item] = []
+
+         @env.scenario("list_pydantic")
+         async def list_pydantic_scenario(items: list[_Item]):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {len(items)} items"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:list_pydantic")
+         assert prompt is not None
+
+         json_data = '[{"id": 1, "name": "Apple"}, {"id": 2, "name": "Banana"}]'
+         await prompt.render({"items": json_data})
+
+         assert len(received_items) == 2
+         assert all(isinstance(item, _Item) for item in received_items)
+         assert received_items[0].name == "Apple"
+         assert received_items[1].name == "Banana"
+
+
+ class TestScenarioNameNormalization:
+     """Test edge cases for environment and scenario name handling."""
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_underscores_normalizes(self) -> None:
+         """Environment name with underscores normalizes to hyphens."""
+         env = Environment("my_test_env")
+         assert env.name == "my-test-env"
+
+         @env.scenario("greet")
+         async def greet():
+             yield "Hello"
+             yield 1.0
+
+         # Scenario should be registered with normalized name
+         assert "my-test-env:greet" in [p.name for p in env._prompt_manager._prompts.values()]
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_spaces_normalizes(self) -> None:
+         """Environment name with spaces normalizes to hyphens."""
+         env = Environment("my test env")
+         assert env.name == "my-test-env"
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_caps_normalizes(self) -> None:
+         """Environment name with capitals normalizes to lowercase."""
+         env = Environment("MyTestEnv")
+         assert env.name == "mytestenv"
+
+     @pytest.mark.asyncio
+     async def test_env_name_mixed_formatting(self) -> None:
+         """Environment name with mixed formatting normalizes correctly."""
+         env = Environment("My_Test Env")
+         assert env.name == "my-test-env"
+
+     @pytest.mark.asyncio
+     async def test_prefix_matches_normalized_name(self) -> None:
+         """Scenario prefix should match normalized env name."""
+         env = Environment("my_env")  # Normalizes to "my-env"
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Calling with normalized prefix should work as local
+         prompt = await env.run_scenario_setup("my-env:test", {})
+         assert prompt == "Prompt"
+         assert env._active_session is not None
+         assert env._active_session.is_local is True
+
+     @pytest.mark.asyncio
+     async def test_unnormalized_prefix_treated_as_remote(self) -> None:
+         """Calling with unnormalized prefix treats as remote (different env)."""
+         env = Environment("my_env")  # Normalizes to "my-env"
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Calling with "my_env:test" (underscore) won't match "my-env"
+         # So it's treated as remote - which will fail since no connection
+         with pytest.raises(ValueError, match="Scenario not found"):
+             await env.run_scenario_setup("my_env:test", {})
+
+
+ class TestScenarioMalformedNames:
+     """Test handling of malformed scenario names."""
+
+     @pytest.mark.asyncio
+     async def test_empty_scenario_name_rejected(self) -> None:
+         """Empty scenario name should be handled gracefully."""
+         env = Environment("test-env")
+
+         @env.scenario("valid")
+         async def valid_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Empty name - should fail since not registered
+         with pytest.raises((ValueError, KeyError)):
+             await env.run_scenario_setup("", {})
+
+     @pytest.mark.asyncio
+     async def test_only_colon_handled(self) -> None:
+         """Scenario name that is just ':' should be handled."""
+         env = Environment("test-env")
+
+         # ":" splits to prefix="" and short_name=""
+         with pytest.raises((ValueError, KeyError)):
+             await env.run_scenario_setup(":", {})
+
+     @pytest.mark.asyncio
+     async def test_colon_in_scenario_name_rejected_at_registration(self) -> None:
+         """Scenario names with colons are rejected at registration time."""
+         env = Environment("test-env")
+
+         # Colons are reserved as the separator between env and scenario names
+         with pytest.raises(ValueError, match="cannot contain ':'"):
+
+             @env.scenario("invalid:name")
+             async def scenario_with_colon():
+                 yield "Prompt"
+                 yield 1.0
+
+     @pytest.mark.asyncio
+     async def test_whitespace_in_scenario_name(self) -> None:
+         """Scenario names with whitespace should work (not normalized)."""
+         env = Environment("test-env")
+
+         @env.scenario("my scenario")
+         async def scenario_with_space():
+             yield "Prompt"
+             yield 1.0
+
+         # Scenario names are NOT normalized (only env names are)
+         prompt = await env.run_scenario_setup("my scenario", {})
+         assert prompt == "Prompt"
+
+
+ class TestScenarioRegistration:
+     """Test scenario registration edge cases."""
+
+     @pytest.mark.asyncio
+     async def test_duplicate_scenario_name_overwrites(self) -> None:
+         """Registering same scenario name twice should overwrite."""
+         env = Environment("test-env")
+
+         @env.scenario("greet")
+         async def greet_v1():
+             yield "Hello v1"
+             yield 1.0
+
+         @env.scenario("greet")
+         async def greet_v2():
+             yield "Hello v2"
+             yield 1.0
+
+         # Should use v2
+         prompt = await env.run_scenario_setup("greet", {})
+         assert prompt == "Hello v2"
+
+     @pytest.mark.asyncio
+     async def test_scenario_with_special_chars(self) -> None:
+         """Scenario names can contain special characters."""
+         env = Environment("test-env")
+
+         @env.scenario("test-scenario_v2.0")
+         async def special_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         prompt = await env.run_scenario_setup("test-scenario_v2.0", {})
+         assert prompt == "Prompt"
+
+     @pytest.mark.asyncio
+     async def test_scenario_that_yields_once(self) -> None:
+         """Scenario that yields only once (no evaluate) should handle gracefully."""
+         env = Environment("test-env")
+
+         @env.scenario("one-yield")
+         async def one_yield_scenario():
+             yield "Prompt"
+             # No second yield!
+
+         prompt = await env.run_scenario_setup("one-yield", {})
+         assert prompt == "Prompt"
+
+         assert env._active_session is not None
+         env._active_session.answer = "test"
+         # Evaluate should handle StopAsyncIteration and return 1.0
+         reward = await env.run_scenario_evaluate("one-yield")
+         assert reward == 1.0
+
+     @pytest.mark.asyncio
+     async def test_scenario_that_yields_three_times(self) -> None:
+         """Scenario that yields more than twice - third yield ignored."""
+         env = Environment("test-env")
+
+         @env.scenario("three-yields")
+         async def three_yield_scenario():
+             yield "Prompt"
+             yield 0.5
+             yield "This should be ignored"
+
+         prompt = await env.run_scenario_setup("three-yields", {})
+         assert prompt == "Prompt"
+
+         assert env._active_session is not None
+         env._active_session.answer = "test"
+         reward = await env.run_scenario_evaluate("three-yields")
+         assert reward == 0.5
+
+
+ class TestScenarioSessionState:
+     """Test session state management edge cases."""
+
+     @pytest.mark.asyncio
+     async def test_submit_before_setup_raises(self) -> None:
+         """Calling submit() before run_scenario_setup() should raise."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         with pytest.raises(ValueError, match="No active scenario session"):
+             await env.submit("test", "answer")
+
+     @pytest.mark.asyncio
+     async def test_evaluate_before_setup_returns_none(self) -> None:
+         """Calling evaluate() before setup() should return None."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         result = await env.run_scenario_evaluate("test")
+         assert result is None
+
+     @pytest.mark.asyncio
+     async def test_double_evaluate_returns_none(self) -> None:
+         """Calling evaluate() twice should return None on second call."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 0.75
+
+         await env.run_scenario_setup("test", {})
+         assert env._active_session is not None
+         env._active_session.answer = "answer"
+
+         reward1 = await env.run_scenario_evaluate("test")
+         assert reward1 == 0.75
+
+         # Second call - session cleared
+         reward2 = await env.run_scenario_evaluate("test")
+         assert reward2 is None
+
+     @pytest.mark.asyncio
+     async def test_submit_wrong_scenario_raises(self) -> None:
+         """Submitting answer for wrong scenario should raise."""
+         env = Environment("test-env")
+
+         @env.scenario("scenario-a")
+         async def scenario_a():
+             yield "Prompt A"
+             yield 1.0
+
+         @env.scenario("scenario-b")
+         async def scenario_b():
+             yield "Prompt B"
+             yield 1.0
+
+         await env.run_scenario_setup("scenario-a", {})
+
+         with pytest.raises(ValueError, match="Scenario mismatch"):
+             await env.submit("scenario-b", "answer")
+
+     @pytest.mark.asyncio
+     async def test_second_setup_overwrites_first(self) -> None:
+         """Starting a new scenario before evaluating previous one overwrites."""
+         env = Environment("test-env")
+
+         @env.scenario("first")
+         async def first_scenario():
+             yield "First"
+             yield 1.0
+
+         @env.scenario("second")
+         async def second_scenario():
+             yield "Second"
+             yield 0.5
+
+         await env.run_scenario_setup("first", {})
+         assert env._active_session is not None
+         assert env._active_session.local_name == "first"
+
+         # Start second without evaluating first
+         await env.run_scenario_setup("second", {})
+         assert env._active_session is not None
+         assert env._active_session.local_name == "second"
+
+         env._active_session.answer = "answer"
+         reward = await env.run_scenario_evaluate("second")
+         assert reward == 0.5