PyPI - langwatch-scenario - Versions diffs - 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

langwatch-scenario 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (238) hide show

scenario/scenario_executor.py CHANGED Viewed

@@ -12,37 +12,55 @@ from typing import (
     Callable,
     Dict,
     List,
-    Any,
     Optional,
     Set,
     Tuple,
     Union,
+    TypedDict,
 )
 import time
+import warnings
 import termcolor
 import asyncio
 import concurrent.futures
 from scenario.config import ScenarioConfig
-from scenario.utils import (
-    await_if_awaitable,
-    check_valid_return_type,
+from scenario._utils import (
     convert_agent_return_types_to_openai_messages,
+    check_valid_return_type,
     print_openai_messages,
     show_spinner,
+    await_if_awaitable,
+    get_or_create_batch_run_id,
+    generate_scenario_run_id,
 )
 from openai.types.chat import (
     ChatCompletionMessageParam,
     ChatCompletionUserMessageParam,
+    ChatCompletionAssistantMessageParam,
 )
 from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
-from .error_messages import agent_response_not_awaitable
+from ._error_messages import agent_response_not_awaitable
 from .cache import context_scenario
 from .agent_adapter import AgentAdapter
 from .script import proceed
 from pksuid import PKSUID
 from .scenario_state import ScenarioState
+from ._events import (
+    ScenarioEventBus,
+    ScenarioEvent,
+    ScenarioRunStartedEvent,
+    ScenarioMessageSnapshotEvent,
+    ScenarioRunFinishedEvent,
+    ScenarioRunStartedEventMetadata,
+    ScenarioRunFinishedEventResults,
+    ScenarioRunFinishedEventVerdict,
+    ScenarioRunFinishedEventStatus,
+    convert_messages_to_api_client_messages,
+)
+from rx.subject.subject import Subject
+from rx.core.observable.observable import Observable
 class ScenarioExecutor:
@@ -66,41 +84,8 @@ class ScenarioExecutor:
         agents: List of agent adapters participating in the scenario
         script: Optional list of script steps to control scenario flow
         config: Configuration settings for execution behavior
-    Example:
-        ```python
-        # Direct instantiation (less common)
-        executor = ScenarioExecutor(
-            name="weather query test",
-            description="User asks about weather, agent should provide helpful response",
-            agents=[
-                weather_agent,
-                scenario.UserSimulatorAgent(),
-                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
-            ],
-            max_turns=10,
-            verbose=True
-        )
-        result = await executor._run()
-        # Preferred high-level API
-        result = await scenario.run(
-            name="weather query test",
-            description="User asks about weather, agent should provide helpful response",
-            agents=[
-                weather_agent,
-                scenario.UserSimulatorAgent(),
-                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
-            ]
-        )
-        ```
-    Note:
-        - Scenarios run in isolated thread pools to support parallel execution
-        - All agent interactions are cached when cache_key is configured
-        - Debug mode allows step-by-step execution with user intervention
-        - Results include detailed timing information and conversation history
     """
     name: str
     description: str
     agents: List[AgentAdapter]
@@ -115,6 +100,11 @@ class ScenarioExecutor:
     _pending_roles_on_turn: List[AgentRole] = []
     _pending_agents_on_turn: Set[AgentAdapter] = set()
     _agent_times: Dict[int, float] = {}
+    _events: Subject
+    event_bus: ScenarioEventBus
+    batch_run_id: str
     def __init__(
         self,
@@ -127,6 +117,7 @@ class ScenarioExecutor:
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
         debug: Optional[bool] = None,
+        event_bus: Optional[ScenarioEventBus] = None,
     ):
         """
         Initialize a scenario executor.
@@ -147,26 +138,7 @@ class ScenarioExecutor:
                       Overrides global configuration for this scenario.
             debug: Whether to enable debug mode with step-by-step execution.
                   Overrides global configuration for this scenario.
-        Example:
-            ```python
-            executor = ScenarioExecutor(
-                name="customer service test",
-                description="Customer has a billing question and needs help",
-                agents=[
-                    customer_service_agent,
-                    scenario.UserSimulatorAgent(),
-                    scenario.JudgeAgent(criteria=[
-                        "Agent is polite and professional",
-                        "Agent addresses the billing question",
-                        "Agent provides clear next steps"
-                    ])
-                ],
-                max_turns=15,
-                verbose=True,
-                debug=False
-            )
-            ```
+            event_bus: Optional event bus that will subscribe to this executor's events
         """
         self.name = name
         self.description = description
@@ -183,115 +155,33 @@ class ScenarioExecutor:
         self.reset()
-    @classmethod
-    async def run(
-        cls,
-        name: str,
-        description: str,
-        agents: List[AgentAdapter] = [],
-        max_turns: Optional[int] = None,
-        verbose: Optional[Union[bool, int]] = None,
-        cache_key: Optional[str] = None,
-        debug: Optional[bool] = None,
-        script: Optional[List[ScriptStep]] = None,
-    ) -> ScenarioResult:
-        """
-        High-level interface for running a scenario test.
-        This is the main entry point for executing scenario tests. It creates a
-        ScenarioExecutor instance and runs it in an isolated thread pool to support
-        parallel execution and prevent blocking.
+        # Create executor's own event stream
+        self._events = Subject()
-        Args:
-            name: Human-readable name for the scenario
-            description: Detailed description of what the scenario tests
-            agents: List of agent adapters (agent under test, user simulator, judge)
-            max_turns: Maximum conversation turns before timeout (default: 10)
-            verbose: Show detailed output during execution
-            cache_key: Cache key for deterministic behavior
-            debug: Enable debug mode for step-by-step execution
-            script: Optional script steps to control scenario flow
+        # Create and configure event bus to subscribe to our events
+        self.event_bus = event_bus or ScenarioEventBus()
+        self.event_bus.subscribe_to_events(self._events)
-        Returns:
-            ScenarioResult containing the test outcome, conversation history,
-            success/failure status, and detailed reasoning
-        Example:
-            ```python
-            import scenario
+        self.batch_run_id = get_or_create_batch_run_id()
-            # Simple scenario with automatic flow
-            result = await scenario.run(
-                name="help request",
-                description="User asks for help with a technical problem",
-                agents=[
-                    my_agent,
-                    scenario.UserSimulatorAgent(),
-                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
-                ]
-            )
+    @property
+    def events(self) -> Observable:
+        """Expose event stream for subscribers like the event bus."""
+        return self._events
-            # Scripted scenario with custom evaluations
-            result = await scenario.run(
-                name="custom interaction",
-                description="Test specific conversation flow",
-                agents=[
-                    my_agent,
-                    scenario.UserSimulatorAgent(),
-                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
-                ],
-                script=[
-                    scenario.user("Hello"),
-                    scenario.agent(),
-                    custom_eval,
-                    scenario.succeed()
-                ]
-            )
+    def _emit_event(self, event: ScenarioEvent) -> None:
+        """
+        Emit a domain event to all subscribers.
-            # Results analysis
-            print(f"Test {'PASSED' if result.success else 'FAILED'}")
-            print(f"Reasoning: {result.reasoning}")
-            print(f"Conversation had {len(result.messages)} messages")
-            ```
+        This method publishes scenario events to the internal event stream,
+        which subscribers (like the event bus) can observe and react to.
+        The timestamp is automatically set to the current time.
-        Note:
-            - Runs in isolated thread pool to support parallel execution
-            - Blocks until scenario completes or times out
-            - All agent calls are automatically cached when cache_key is set
-            - Exception handling ensures clean resource cleanup
+        Args:
+            event: The scenario event to emit
         """
-        scenario = cls(
-            name=name,
-            description=description,
-            agents=agents,
-            max_turns=max_turns,
-            verbose=verbose,
-            cache_key=cache_key,
-            debug=debug,
-            script=script,
-        )
-        # We'll use a thread pool to run the execution logic, we
-        # require a separate thread because even though asyncio is
-        # being used throughout, any user code on the callback can
-        # be blocking, preventing them from running scenarios in parallel
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            def run_in_thread():
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-                try:
-                    return loop.run_until_complete(scenario._run())
-                finally:
-                    loop.close()
-            # Run the function in the thread pool and await its result
-            # This converts the thread's execution into a Future that the current
-            # event loop can await without blocking
-            loop = asyncio.get_event_loop()
-            result = await loop.run_in_executor(executor, run_in_thread)
-            return result
+        event.timestamp = int(time.time() * 1000)
+        self._events.on_next(event)
     def reset(self):
         """
@@ -300,18 +190,6 @@ class ScenarioExecutor:
         This method reinitializes all internal state for a fresh scenario run,
         including conversation history, turn counters, and agent timing information.
         Called automatically during initialization and can be used to rerun scenarios.
-        Example:
-            ```python
-            executor = ScenarioExecutor(...)
-            # Run first test
-            result1 = await executor._run()
-            # Reset and run again
-            executor.reset()
-            result2 = await executor._run()
-            ```
         """
         self._state = ScenarioState(
             description=self.description,
@@ -349,24 +227,24 @@ class ScenarioExecutor:
                            Used to avoid broadcasting the message back to its creator.
         Example:
-            ```python
+            ```
             def inject_system_message(state: ScenarioState) -> None:
-                state._executor.add_message({
+                state.add_message({
                     "role": "system",
                     "content": "The user is now in a hurry"
                 })
             # Use in script
             result = await scenario.run(
-                name="system message test",
-                agents=[agent, user_sim, judge],
-                script=[
-                    scenario.user("Hello"),
-                    scenario.agent(),
-                    inject_system_message,
-                    scenario.user(),  # Will see the system message
-                    scenario.succeed()
-                ]
+               name="system message test",
+               agents=[agent, user_sim, judge],
+               script=[
+                   scenario.user("Hello"),
+                   scenario.agent(),
+                   inject_system_message,
+                   scenario.user(),  # Will see the system message
+                   scenario.succeed()
+               ]
             )
             ```
         """
@@ -396,7 +274,7 @@ class ScenarioExecutor:
             from_agent_idx: Index of the agent that generated these messages
         Example:
-            ```python
+            ```
             # Agent returns multiple messages for a complex interaction
             messages = [
                 {"role": "assistant", "content": "Let me search for that..."},
@@ -476,7 +354,11 @@ class ScenarioExecutor:
         self, role: AgentRole
     ) -> Tuple[int, Optional[AgentAdapter]]:
         for idx, agent in enumerate(self.agents):
-            if role == agent.role and agent in self._pending_agents_on_turn:
+            if (
+                role == agent.role
+                and agent in self._pending_agents_on_turn
+                and agent.role in self._pending_roles_on_turn
+            ):
                 return idx, agent
         return -1, None
@@ -503,7 +385,7 @@ class ScenarioExecutor:
             agent_time=agent_time,
         )
-    async def _run(self) -> ScenarioResult:
+    async def run(self) -> ScenarioResult:
         """
         Run a scenario against the agent under test.
@@ -513,30 +395,63 @@ class ScenarioExecutor:
         Returns:
             ScenarioResult containing the test outcome
         """
+        scenario_run_id = generate_scenario_run_id()
-        if self.config.verbose:
-            print("")  # new line
-        self.reset()
-        for script_step in self.script:
-            callable = script_step(self._state)
-            if isinstance(callable, Awaitable):
-                result = await callable
-            else:
-                result = callable
+        try:
+            self._emit_run_started_event(scenario_run_id)
-            if isinstance(result, ScenarioResult):
-                return result
+            if self.config.verbose:
+                print("")  # new line
+            self.reset()
+            for script_step in self.script:
+                callable = script_step(self._state)
+                if isinstance(callable, Awaitable):
+                    result = await callable
+                else:
+                    result = callable
+                self._emit_message_snapshot_event(scenario_run_id)
+                if isinstance(result, ScenarioResult):
+                    status = (
+                        ScenarioRunFinishedEventStatus.SUCCESS
+                        if result.success
+                        else ScenarioRunFinishedEventStatus.FAILED
+                    )
+                    self._emit_run_finished_event(scenario_run_id, result, status)
+                    return result
-        return self._reached_max_turns(
-            """Reached end of script without conclusion, add one of the following to the end of the script:
+            result = self._reached_max_turns(
+                """Reached end of script without conclusion, add one of the following to the end of the script:
 - `scenario.proceed()` to let the simulation continue to play out
 - `scenario.judge()` to force criteria judgement
 - `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
-            """
-        )
+                """
+            )
+            status = (
+                ScenarioRunFinishedEventStatus.SUCCESS
+                if result.success
+                else ScenarioRunFinishedEventStatus.FAILED
+            )
+            self._emit_run_finished_event(scenario_run_id, result, status)
+            return result
+        except Exception as e:
+            # Publish failure event before propagating the error
+            error_result = ScenarioResult(
+                success=False,
+                messages=self._state.messages,
+                reasoning=f"Scenario failed with error: {str(e)}",
+                total_time=time.time() - self._total_start_time,
+                agent_time=0,
+            )
+            self._emit_run_finished_event(
+                scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR
+            )
+            raise  # Re-raise the exception after cleanup
     async def _call_agent(
         self, idx: int, role: AgentRole, request_judgment: bool = False
@@ -577,16 +492,19 @@ class ScenarioExecutor:
         ):
             start_time = time.time()
-            agent_response = agent.call(
-                AgentInput(
-                    # TODO: test thread_id
-                    thread_id=self._state.thread_id,
-                    messages=self._state.messages,
-                    new_messages=self._pending_messages.get(idx, []),
-                    judgment_request=request_judgment,
-                    scenario_state=self._state,
+            # Prevent pydantic validation warnings which should already be disabled
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                agent_response = agent.call(
+                    AgentInput(
+                        # TODO: test thread_id
+                        thread_id=self._state.thread_id,
+                        messages=self._state.messages,
+                        new_messages=self._pending_messages.get(idx, []),
+                        judgment_request=request_judgment,
+                        scenario_state=self._state,
+                    )
                 )
-            )
             if not isinstance(agent_response, Awaitable):
                 raise Exception(
                     agent_response_not_awaitable(agent.__class__.__name__),
@@ -708,15 +626,24 @@ class ScenarioExecutor:
             reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
         )
+    def _consume_until_role(self, role: AgentRole) -> None:
+        while len(self._pending_roles_on_turn) > 0:
+            next_role = self._pending_roles_on_turn[0]
+            if next_role == role:
+                break
+            self._pending_roles_on_turn.pop(0)
     async def _script_call_agent(
         self,
         role: AgentRole,
         content: Optional[Union[str, ChatCompletionMessageParam]] = None,
         request_judgment: bool = False,
     ) -> Optional[ScenarioResult]:
+        self._consume_until_role(role)
         idx, next_agent = self._next_agent_for_role(role)
         if not next_agent:
             self._new_turn()
+            self._consume_until_role(role)
             idx, next_agent = self._next_agent_for_role(role)
             if not next_agent:
@@ -738,11 +665,16 @@ class ScenarioExecutor:
                 )
         self._pending_agents_on_turn.remove(next_agent)
-        self._pending_roles_on_turn.remove(role)
         if content:
             if isinstance(content, str):
-                message = ChatCompletionUserMessageParam(role="user", content=content)
+                message = (
+                    ChatCompletionUserMessageParam(role="user", content=content)
+                    if role == AgentRole.USER
+                    else ChatCompletionAssistantMessageParam(
+                        role="assistant", content=content
+                    )
+                )
             else:
                 message = content
@@ -756,3 +688,228 @@ class ScenarioExecutor:
         )
         if isinstance(result, ScenarioResult):
             return result
+    # Event handling methods
+    class _CommonEventFields(TypedDict):
+        """
+        Common fields shared across all scenario events.
+        These fields provide consistent identification and timing information
+        for all events emitted during scenario execution.
+        Attributes:
+            batch_run_id: Unique identifier for the batch of scenario runs
+            scenario_run_id: Unique identifier for this specific scenario run
+            scenario_id: Human-readable name/identifier for the scenario
+            timestamp: Unix timestamp in milliseconds when the event occurred
+        """
+        batch_run_id: str
+        scenario_run_id: str
+        scenario_id: str
+        timestamp: int
+    def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
+        """
+        Create common fields used across all scenario events.
+        This method generates the standard fields that every scenario event
+        must include for proper identification and timing.
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+        Returns:
+            Dictionary containing common event fields with current timestamp
+        """
+        return {
+            "batch_run_id": self.batch_run_id,
+            "scenario_run_id": scenario_run_id,
+            "scenario_id": self.name,
+            "timestamp": int(time.time() * 1000),
+        }
+    def _emit_run_started_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a scenario run started event.
+        This event is published when a scenario begins execution. It includes
+        metadata about the scenario such as name and description, and is used
+        to track the start of scenario runs in monitoring systems.
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+        metadata = ScenarioRunStartedEventMetadata(
+            name=self.name,
+            description=self.description,
+        )
+        event = ScenarioRunStartedEvent(
+            **common_fields,
+            metadata=metadata,
+        )
+        self._emit_event(event)
+    def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a message snapshot event.
+        This event captures the current state of the conversation during
+        scenario execution. It's published whenever messages are added to
+        the conversation, allowing real-time tracking of scenario progress.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+        event = ScenarioMessageSnapshotEvent(
+            **common_fields,
+            messages=convert_messages_to_api_client_messages(self._state.messages),
+        )
+        self._emit_event(event)
+    def _emit_run_finished_event(
+        self,
+        scenario_run_id: str,
+        result: ScenarioResult,
+        status: ScenarioRunFinishedEventStatus,
+    ) -> None:
+        """
+        Emit a scenario run finished event.
+        This event is published when a scenario completes execution, whether
+        successfully or with an error. It includes the final results, verdict,
+        and reasoning for the scenario outcome.
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+            result: The final scenario result containing success/failure status
+            status: The execution status (SUCCESS, FAILED, or ERROR)
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+        results = ScenarioRunFinishedEventResults(
+            verdict=(
+                ScenarioRunFinishedEventVerdict.SUCCESS
+                if result.success
+                else ScenarioRunFinishedEventVerdict.FAILURE
+            ),
+            reasoning=result.reasoning or "",
+            met_criteria=result.passed_criteria,
+            unmet_criteria=result.failed_criteria,
+        )
+        event = ScenarioRunFinishedEvent(
+            **common_fields,
+            status=status,
+            results=results,
+        )
+        self._emit_event(event)
+        # Signal end of event stream
+        self._events.on_completed()
+async def run(
+    name: str,
+    description: str,
+    agents: List[AgentAdapter] = [],
+    max_turns: Optional[int] = None,
+    verbose: Optional[Union[bool, int]] = None,
+    cache_key: Optional[str] = None,
+    debug: Optional[bool] = None,
+    script: Optional[List[ScriptStep]] = None,
+) -> ScenarioResult:
+    """
+    High-level interface for running a scenario test.
+    This is the main entry point for executing scenario tests. It creates a
+    ScenarioExecutor instance and runs it in an isolated thread pool to support
+    parallel execution and prevent blocking.
+    Args:
+        name: Human-readable name for the scenario
+        description: Detailed description of what the scenario tests
+        agents: List of agent adapters (agent under test, user simulator, judge)
+        max_turns: Maximum conversation turns before timeout (default: 10)
+        verbose: Show detailed output during execution
+        cache_key: Cache key for deterministic behavior
+        debug: Enable debug mode for step-by-step execution
+        script: Optional script steps to control scenario flow
+    Returns:
+        ScenarioResult containing the test outcome, conversation history,
+        success/failure status, and detailed reasoning
+    Example:
+        ```
+        import scenario
+        # Simple scenario with automatic flow
+        result = await scenario.run(
+           name="help request",
+           description="User asks for help with a technical problem",
+           agents=[
+               my_agent,
+               scenario.UserSimulatorAgent(),
+               scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+           ]
+        )
+        # Scripted scenario with custom evaluations
+        result = await scenario.run(
+           name="custom interaction",
+           description="Test specific conversation flow",
+           agents=[
+               my_agent,
+               scenario.UserSimulatorAgent(),
+               scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+           ],
+           script=[
+               scenario.user("Hello"),
+               scenario.agent(),
+               custom_eval,
+               scenario.succeed()
+           ]
+        )
+        # Results analysis
+        print(f"Test {'PASSED' if result.success else 'FAILED'}")
+        print(f"Reasoning: {result.reasoning}")
+        print(f"Conversation had {len(result.messages)} messages")
+        ```
+    """
+    scenario = ScenarioExecutor(
+        name=name,
+        description=description,
+        agents=agents,
+        max_turns=max_turns,
+        verbose=verbose,
+        cache_key=cache_key,
+        debug=debug,
+        script=script,
+    )
+    # We'll use a thread pool to run the execution logic, we
+    # require a separate thread because even though asyncio is
+    # being used throughout, any user code on the callback can
+    # be blocking, preventing them from running scenarios in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        def run_in_thread():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                return loop.run_until_complete(scenario.run())
+            finally:
+                scenario.event_bus.drain()
+                loop.close()
+        # Run the function in the thread pool and await its result
+        # This converts the thread's execution into a Future that the current
+        # event loop can await without blocking
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(executor, run_in_thread)
+        return result

langwatch-scenario 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

langwatch-scenario 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl