langwatch-scenario 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/METADATA +93 -71
- langwatch_scenario-0.6.0.dist-info/RECORD +27 -0
- scenario/__init__.py +11 -114
- scenario/_utils/__init__.py +32 -0
- scenario/_utils/ids.py +58 -0
- scenario/_utils/message_conversion.py +103 -0
- scenario/{utils.py → _utils/utils.py} +21 -110
- scenario/agent_adapter.py +8 -4
- scenario/cache.py +4 -3
- scenario/config.py +7 -5
- scenario/events/__init__.py +66 -0
- scenario/events/event_bus.py +175 -0
- scenario/events/event_reporter.py +83 -0
- scenario/events/events.py +169 -0
- scenario/events/messages.py +84 -0
- scenario/events/utils.py +86 -0
- scenario/judge_agent.py +7 -28
- scenario/pytest_plugin.py +2 -47
- scenario/scenario_executor.py +268 -84
- scenario/scenario_state.py +6 -6
- scenario/script.py +9 -9
- scenario/types.py +10 -6
- scenario/user_simulator_agent.py +4 -11
- langwatch_scenario-0.4.0.dist-info/RECORD +0 -18
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/top_level.txt +0 -0
- /scenario/{error_messages.py → _error_messages.py} +0 -0
scenario/pytest_plugin.py
CHANGED
@@ -8,7 +8,7 @@ pytest-based testing workflows.
 """
 
 import pytest
-from typing import TypedDict
+from typing import TypedDict
 import functools
 from termcolor import colored
 
@@ -16,7 +16,6 @@ from scenario.config import ScenarioConfig
 from scenario.types import ScenarioResult
 
 from .scenario_executor import ScenarioExecutor
-import scenario
 
 
 class ScenarioReporterResults(TypedDict):
@@ -46,23 +45,6 @@ class ScenarioReporter:
 
     Attributes:
         results: List of all scenario test results collected during the session
-
-    Example:
-        The reporter is used automatically, but you can access it in tests:
-
-        ```python
-        def test_my_scenarios(scenario_reporter):
-            # Run your scenarios
-            result1 = await scenario.run(...)
-            result2 = await scenario.run(...)
-
-            # Check collected results
-            assert len(scenario_reporter.results) == 2
-
-            # Get summary statistics
-            summary = scenario_reporter.get_summary()
-            print(f"Success rate: {summary['success_rate']}%")
-        ```
     """
 
     def __init__(self):
@@ -80,21 +62,6 @@ class ScenarioReporter:
         Args:
             scenario: The ScenarioExecutor instance that ran the test
             result: The ScenarioResult containing test outcome and details
-
-        Example:
-            ```python
-            # This happens automatically when you run scenarios
-            result = await scenario.run(
-                name="my test",
-                description="Test description",
-                agents=[
-                    my_agent,
-                    scenario.UserSimulatorAgent(),
-                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
-                ]
-            )
-            # Result is automatically added to the global reporter
-            ```
         """
         self.results.append({"scenario": scenario, "result": result})
 
@@ -111,18 +78,6 @@ class ScenarioReporter:
                 - passed: Number of scenarios that passed
                 - failed: Number of scenarios that failed
                 - success_rate: Percentage of scenarios that passed (0-100)
-
-        Example:
-            ```python
-            def test_summary_check(scenario_reporter):
-                # Run some scenarios...
-                await scenario.run(...)
-                await scenario.run(...)
-
-                summary = scenario_reporter.get_summary()
-                assert summary['total'] == 2
-                assert summary['success_rate'] >= 80 # Require 80% success rate
-            ```
         """
         total = len(self.results)
         passed = sum(1 for r in self.results if r["result"].success)
@@ -347,7 +302,7 @@ def scenario_reporter(request):
         ScenarioReporter: The global reporter instance collecting all scenario results
 
     Example:
-        ```
+        ```
        @pytest.mark.agent_test
        def test_with_custom_reporting(scenario_reporter):
            # Run your scenarios
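
Note: the hunks above remove the usage examples from the `ScenarioReporter` docstrings without replacing them elsewhere in this file. For orientation, a minimal sketch of how the `scenario_reporter` fixture is typically exercised, pieced together from the removed examples (the `my_agent` adapter is a placeholder, and the async test marker is an assumption about the surrounding pytest setup):

```python
import pytest
import scenario


@pytest.mark.agent_test
@pytest.mark.asyncio  # assumption: an async runner such as pytest-asyncio is configured
async def test_my_scenarios(scenario_reporter):
    # Run a scenario; the plugin appends the result to the shared reporter
    result = await scenario.run(
        name="my test",
        description="Test description",
        agents=[
            my_agent,  # placeholder: your AgentAdapter implementation
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent provides helpful response"]),
        ],
    )
    assert result.success

    # Inspect the collected results and summary statistics
    assert len(scenario_reporter.results) == 1
    summary = scenario_reporter.get_summary()
    print(f"Success rate: {summary['success_rate']}%")
```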
scenario/scenario_executor.py
CHANGED
@@ -12,11 +12,11 @@ from typing import (
     Callable,
     Dict,
     List,
-    Any,
     Optional,
     Set,
     Tuple,
     Union,
+    TypedDict,
 )
 import time
 import termcolor
@@ -24,25 +24,39 @@ import asyncio
 import concurrent.futures
 
 from scenario.config import ScenarioConfig
-from scenario.utils import (
-    await_if_awaitable,
+from scenario._utils import (
     check_valid_return_type,
     convert_agent_return_types_to_openai_messages,
     print_openai_messages,
     show_spinner,
+    await_if_awaitable,
+    get_or_create_batch_run_id,
+    generate_scenario_run_id,
 )
 from openai.types.chat import (
     ChatCompletionMessageParam,
     ChatCompletionUserMessageParam,
+    ChatCompletionAssistantMessageParam,
 )
 
 from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
-from .error_messages import agent_response_not_awaitable
+from ._error_messages import agent_response_not_awaitable
 from .cache import context_scenario
 from .agent_adapter import AgentAdapter
 from .script import proceed
 from pksuid import PKSUID
 from .scenario_state import ScenarioState
+from .events import (
+    ScenarioEventBus,
+    ScenarioRunStartedEvent,
+    ScenarioMessageSnapshotEvent,
+    ScenarioRunFinishedEvent,
+    ScenarioRunStartedEventMetadata,
+    ScenarioRunFinishedEventResults,
+    ScenarioRunFinishedEventVerdict,
+    ScenarioRunFinishedEventStatus,
+    convert_messages_to_ag_ui_messages,
+)
 
 
 class ScenarioExecutor:
@@ -68,30 +82,30 @@ class ScenarioExecutor:
         config: Configuration settings for execution behavior
 
     Example:
-        ```
+        ```
        # Direct instantiation (less common)
        executor = ScenarioExecutor(
-
-
-
-
-
-
-
-
-
+            name="weather query test",
+            description="User asks about weather, agent should provide helpful response",
+            agents=[
+                weather_agent,
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+            ],
+            max_turns=10,
+            verbose=True
        )
        result = await executor._run()
 
        # Preferred high-level API
        result = await scenario.run(
-
-
-
-
-
-
-
+            name="weather query test",
+            description="User asks about weather, agent should provide helpful response",
+            agents=[
+                weather_agent,
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+            ]
        )
 
        # Results analysis
@@ -101,6 +115,7 @@ class ScenarioExecutor:
        - Debug mode allows step-by-step execution with user intervention
        - Results include detailed timing information and conversation history
     """
+
     name: str
     description: str
     agents: List[AgentAdapter]
@@ -116,6 +131,10 @@ class ScenarioExecutor:
     _pending_agents_on_turn: Set[AgentAdapter] = set()
     _agent_times: Dict[int, float] = {}
 
+    event_bus: ScenarioEventBus
+
+    batch_run_id: str
+
     def __init__(
         self,
         name: str,
@@ -127,6 +146,7 @@ class ScenarioExecutor:
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
         debug: Optional[bool] = None,
+        event_bus: Optional[ScenarioEventBus] = None,
     ):
         """
         Initialize a scenario executor.
@@ -147,6 +167,7 @@ class ScenarioExecutor:
                Overrides global configuration for this scenario.
            debug: Whether to enable debug mode with step-by-step execution.
                Overrides global configuration for this scenario.
+            event_reporter: Optional event reporter for the scenario
 
         Example:
             ```python
@@ -183,6 +204,10 @@ class ScenarioExecutor:
 
         self.reset()
 
+        self.event_bus = event_bus or ScenarioEventBus()
+
+        self.batch_run_id = get_or_create_batch_run_id()
+
     @classmethod
     async def run(
         cls,
@@ -217,35 +242,35 @@ class ScenarioExecutor:
            success/failure status, and detailed reasoning
 
         Example:
-            ```
+            ```
            import scenario
 
            # Simple scenario with automatic flow
            result = await scenario.run(
-
-
-
-
-
-
-
+                name="help request",
+                description="User asks for help with a technical problem",
+                agents=[
+                    my_agent,
+                    scenario.UserSimulatorAgent(),
+                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                ]
            )
 
            # Scripted scenario with custom evaluations
            result = await scenario.run(
-
-
-
-
-
-
-
-
-
-
-
-
-
+                name="custom interaction",
+                description="Test specific conversation flow",
+                agents=[
+                    my_agent,
+                    scenario.UserSimulatorAgent(),
+                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                ],
+                script=[
+                    scenario.user("Hello"),
+                    scenario.agent(),
+                    custom_eval,
+                    scenario.succeed()
+                ]
            )
 
            # Results analysis
@@ -284,6 +309,7 @@ class ScenarioExecutor:
             try:
                 return loop.run_until_complete(scenario._run())
             finally:
+                loop.run_until_complete(scenario.event_bus.drain())
                 loop.close()
 
         # Run the function in the thread pool and await its result
@@ -300,18 +326,6 @@ class ScenarioExecutor:
        This method reinitializes all internal state for a fresh scenario run,
        including conversation history, turn counters, and agent timing information.
        Called automatically during initialization and can be used to rerun scenarios.
-
-        Example:
-            ```python
-            executor = ScenarioExecutor(...)
-
-            # Run first test
-            result1 = await executor._run()
-
-            # Reset and run again
-            executor.reset()
-            result2 = await executor._run()
-            ```
        """
        self._state = ScenarioState(
            description=self.description,
@@ -349,24 +363,24 @@ class ScenarioExecutor:
            Used to avoid broadcasting the message back to its creator.
 
         Example:
-            ```
+            ```
            def inject_system_message(state: ScenarioState) -> None:
-                state.
+                state.add_message({
                    "role": "system",
                    "content": "The user is now in a hurry"
                })
 
            # Use in script
            result = await scenario.run(
-
-
-
-
-
-
-
-
-
+                name="system message test",
+                agents=[agent, user_sim, judge],
+                script=[
+                    scenario.user("Hello"),
+                    scenario.agent(),
+                    inject_system_message,
+                    scenario.user(), # Will see the system message
+                    scenario.succeed()
+                ]
            )
            ```
        """
@@ -380,6 +394,7 @@ class ScenarioExecutor:
            self._pending_messages[idx] = []
        self._pending_messages[idx].append(message)
 
+
    def add_messages(
        self,
        messages: List[ChatCompletionMessageParam],
@@ -396,7 +411,7 @@ class ScenarioExecutor:
            from_agent_idx: Index of the agent that generated these messages
 
        Example:
-            ```
+            ```
            # Agent returns multiple messages for a complex interaction
            messages = [
                {"role": "assistant", "content": "Let me search for that..."},
@@ -476,7 +491,11 @@ class ScenarioExecutor:
        self, role: AgentRole
    ) -> Tuple[int, Optional[AgentAdapter]]:
        for idx, agent in enumerate(self.agents):
-            if
+            if (
+                role == agent.role
+                and agent in self._pending_agents_on_turn
+                and agent.role in self._pending_roles_on_turn
+            ):
                return idx, agent
        return -1, None
 
@@ -513,30 +532,54 @@ class ScenarioExecutor:
        Returns:
            ScenarioResult containing the test outcome
        """
+        scenario_run_id = generate_scenario_run_id()
 
-
-
+        try:
+            await self.event_bus.listen()
+            self._emit_run_started_event(scenario_run_id)
 
-
+            if self.config.verbose:
+                print("") # new line
 
-
-            callable = script_step(self._state)
-            if isinstance(callable, Awaitable):
-                result = await callable
-            else:
-                result = callable
+            self.reset()
 
-
-
+            for script_step in self.script:
+                callable = script_step(self._state)
+                if isinstance(callable, Awaitable):
+                    result = await callable
+                else:
+                    result = callable
+                self._emit_message_snapshot_event(scenario_run_id)
 
-
-
+                if isinstance(result, ScenarioResult):
+                    status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
+                    self._emit_run_finished_event(scenario_run_id, result, status)
+                    return result
+
+            result = self._reached_max_turns(
+                """Reached end of script without conclusion, add one of the following to the end of the script:
 
 - `scenario.proceed()` to let the simulation continue to play out
 - `scenario.judge()` to force criteria judgement
 - `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
-
-
+"""
+            )
+
+            status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
+            self._emit_run_finished_event(scenario_run_id, result, status)
+            return result
+
+        except Exception as e:
+            # Publish failure event before propagating the error
+            error_result = ScenarioResult(
+                success=False,
+                messages=self._state.messages,
+                reasoning=f"Scenario failed with error: {str(e)}",
+                total_time=time.time() - self._total_start_time,
+                agent_time=0,
+            )
+            self._emit_run_finished_event(scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR)
+            raise # Re-raise the exception after cleanup
 
    async def _call_agent(
        self, idx: int, role: AgentRole, request_judgment: bool = False
@@ -708,15 +751,24 @@ class ScenarioExecutor:
            reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
        )
 
+    def _consume_until_role(self, role: AgentRole) -> None:
+        while len(self._pending_roles_on_turn) > 0:
+            next_role = self._pending_roles_on_turn[0]
+            if next_role == role:
+                break
+            self._pending_roles_on_turn.pop(0)
+
    async def _script_call_agent(
        self,
        role: AgentRole,
        content: Optional[Union[str, ChatCompletionMessageParam]] = None,
        request_judgment: bool = False,
    ) -> Optional[ScenarioResult]:
+        self._consume_until_role(role)
        idx, next_agent = self._next_agent_for_role(role)
        if not next_agent:
            self._new_turn()
+            self._consume_until_role(role)
            idx, next_agent = self._next_agent_for_role(role)
 
        if not next_agent:
@@ -738,11 +790,16 @@ class ScenarioExecutor:
        )
 
        self._pending_agents_on_turn.remove(next_agent)
-        self._pending_roles_on_turn.remove(role)
 
        if content:
            if isinstance(content, str):
-                message =
+                message = (
+                    ChatCompletionUserMessageParam(role="user", content=content)
+                    if role == AgentRole.USER
+                    else ChatCompletionAssistantMessageParam(
+                        role="assistant", content=content
+                    )
+                )
            else:
                message = content
 
@@ -756,3 +813,130 @@
        )
        if isinstance(result, ScenarioResult):
            return result
+
+    # Event handling methods
+
+    class _CommonEventFields(TypedDict):
+        """
+        Common fields shared across all scenario events.
+
+        These fields provide consistent identification and timing information
+        for all events emitted during scenario execution.
+
+        Attributes:
+            batch_run_id: Unique identifier for the batch of scenario runs
+            scenario_run_id: Unique identifier for this specific scenario run
+            scenario_id: Human-readable name/identifier for the scenario
+            timestamp: Unix timestamp in milliseconds when the event occurred
+        """
+        batch_run_id: str
+        scenario_run_id: str
+        scenario_id: str
+        timestamp: int
+
+    def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
+        """
+        Create common fields used across all scenario events.
+
+        This method generates the standard fields that every scenario event
+        must include for proper identification and timing.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+
+        Returns:
+            Dictionary containing common event fields with current timestamp
+        """
+        return {
+            "batch_run_id": self.batch_run_id,
+            "scenario_run_id": scenario_run_id,
+            "scenario_id": self.name,
+            "timestamp": int(time.time() * 1000),
+        }
+
+    def _emit_run_started_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a scenario run started event.
+
+        This event is published when a scenario begins execution. It includes
+        metadata about the scenario such as name and description, and is used
+        to track the start of scenario runs in monitoring systems.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+
+        Note:
+            This event is automatically published at the beginning of `_run()`
+            and signals the start of scenario execution to any event listeners.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+        metadata = ScenarioRunStartedEventMetadata(
+            name=self.name,
+            description=self.description,
+        )
+
+        event = ScenarioRunStartedEvent(
+            **common_fields,
+            metadata=metadata,
+        )
+        self.event_bus.publish(event)
+
+    def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a message snapshot event.
+
+        This event captures the current state of the conversation during
+        scenario execution. It's published whenever messages are added to
+        the conversation, allowing real-time tracking of scenario progress.
+
+        Note:
+            This event is automatically published by `add_message()` and
+            `add_messages()` to provide continuous visibility into scenario
+            execution state.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+
+        event = ScenarioMessageSnapshotEvent(
+            **common_fields,
+            messages=convert_messages_to_ag_ui_messages(self._state.messages),
+        )
+        self.event_bus.publish(event)
+
+    def _emit_run_finished_event(
+        self,
+        scenario_run_id: str,
+        result: ScenarioResult,
+        status: ScenarioRunFinishedEventStatus
+    ) -> None:
+        """
+        Emit a scenario run finished event.
+
+        This event is published when a scenario completes execution, whether
+        successfully or with an error. It includes the final results, verdict,
+        and reasoning for the scenario outcome.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+            result: The final scenario result containing success/failure status
+            status: The execution status (SUCCESS, FAILED, or ERROR)
+
+        Note:
+            This event is automatically published at the end of `_run()` and
+            signals the completion of scenario execution to any event listeners.
+            It includes detailed results for monitoring and analysis purposes.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+
+        results = ScenarioRunFinishedEventResults(
+            verdict=ScenarioRunFinishedEventVerdict.SUCCESS if result.success else ScenarioRunFinishedEventVerdict.FAILURE,
+            reasoning=result.reasoning or "",
+            met_criteria=result.passed_criteria,
+            unmet_criteria=result.failed_criteria,
+        )
+
+        event = ScenarioRunFinishedEvent(
+            **common_fields,
+            status=status,
+            results=results,
+        )
+        self.event_bus.publish(event)
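
Most of the new surface in this file is the event plumbing: each run now gets a `scenario_run_id` (plus a shared `batch_run_id`) and publishes run-started, message-snapshot, and run-finished events through a `ScenarioEventBus`, with `drain()` awaited before the event loop closes. A rough sketch of how a caller might hook into that bus, using only the names visible in this diff; subclassing `ScenarioEventBus` and overriding `publish()` is an assumption about its API, not something the diff guarantees:

```python
from scenario.events import ScenarioEventBus
from scenario.scenario_executor import ScenarioExecutor


class LoggingEventBus(ScenarioEventBus):
    # Assumption: publish() can be overridden and safely delegate back to the parent bus
    def publish(self, event) -> None:
        print(f"[scenario event] {type(event).__name__}")
        super().publish(event)


executor = ScenarioExecutor(
    name="weather query test",
    description="User asks about weather, agent should provide helpful response",
    agents=[],  # fill in your AgentAdapter, UserSimulatorAgent, and JudgeAgent
    event_bus=LoggingEventBus(),  # optional; defaults to ScenarioEventBus()
)
# await executor._run()  # emits run-started, per-step snapshots, then run-finished
```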
scenario/scenario_state.py
CHANGED
@@ -36,7 +36,7 @@ class ScenarioState(BaseModel):
        config: Configuration settings for this scenario execution
 
    Example:
-        ```
+        ```
        def check_agent_behavior(state: ScenarioState) -> None:
            # Check if the agent called a specific tool
            if state.has_tool_call("get_weather"):
@@ -87,7 +87,7 @@ class ScenarioState(BaseModel):
            message: OpenAI-compatible message to add to the conversation
 
        Example:
-            ```
+            ```
            def inject_system_message(state: ScenarioState) -> None:
                state.add_message({
                    "role": "system",
@@ -108,7 +108,7 @@ class ScenarioState(BaseModel):
            ValueError: If no messages exist in the conversation
 
        Example:
-            ```
+            ```
            def check_last_response(state: ScenarioState) -> None:
                last = state.last_message()
                if last["role"] == "assistant":
@@ -131,7 +131,7 @@ class ScenarioState(BaseModel):
            ValueError: If no user messages exist in the conversation
 
        Example:
-            ```
+            ```
            def analyze_user_intent(state: ScenarioState) -> None:
                user_msg = state.last_user_message()
                content = user_msg["content"]
@@ -162,7 +162,7 @@ class ScenarioState(BaseModel):
            The tool call object if found, None otherwise
 
        Example:
-            ```
+            ```
            def verify_weather_call(state: ScenarioState) -> None:
                weather_call = state.last_tool_call("get_current_weather")
                if weather_call:
@@ -192,7 +192,7 @@ class ScenarioState(BaseModel):
            True if the tool has been called, False otherwise
 
        Example:
-            ```
+            ```
            def ensure_tool_usage(state: ScenarioState) -> None:
                # Verify the agent used required tools
                assert state.has_tool_call("search_database")