PyPI - langwatch-scenario - Versions diffs - 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

langwatch_scenario-0.6.0.dist-info/METADATA +385 -0
langwatch_scenario-0.6.0.dist-info/RECORD +27 -0
scenario/__init__.py +128 -17
scenario/{error_messages.py → _error_messages.py} +8 -38
scenario/_utils/__init__.py +32 -0
scenario/_utils/ids.py +58 -0
scenario/_utils/message_conversion.py +103 -0
scenario/_utils/utils.py +425 -0
scenario/agent_adapter.py +115 -0
scenario/cache.py +134 -9
scenario/config.py +156 -10
scenario/events/__init__.py +66 -0
scenario/events/event_bus.py +175 -0
scenario/events/event_reporter.py +83 -0
scenario/events/events.py +169 -0
scenario/events/messages.py +84 -0
scenario/events/utils.py +86 -0
scenario/judge_agent.py +414 -0
scenario/pytest_plugin.py +177 -14
scenario/scenario_executor.py +630 -154
scenario/scenario_state.py +205 -0
scenario/script.py +361 -0
scenario/types.py +197 -20
scenario/user_simulator_agent.py +242 -0
langwatch_scenario-0.3.0.dist-info/METADATA +0 -302
langwatch_scenario-0.3.0.dist-info/RECORD +0 -16
scenario/scenario.py +0 -238
scenario/scenario_agent_adapter.py +0 -16
scenario/testing_agent.py +0 -279
scenario/utils.py +0 -264
{langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/WHEEL +0 -0
{langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/entry_points.txt +0 -0
{langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/top_level.txt +0 -0

scenario/types.py CHANGED Viewed

@@ -6,8 +6,6 @@ from typing import (
     Any,
     Awaitable,
     Callable,
-    Coroutine,
-    Dict,
     List,
     Optional,
     Union,
@@ -17,29 +15,85 @@ from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMess
 # Prevent circular imports + Pydantic breaking
 if TYPE_CHECKING:
-    from scenario.scenario_executor import ScenarioExecutor
+    from scenario.scenario_executor import ScenarioState
-    ScenarioExecutorType = ScenarioExecutor
+    ScenarioStateType = ScenarioState
 else:
-    ScenarioExecutorType = Any
+    ScenarioStateType = Any
-class ScenarioAgentRole(Enum):
+class AgentRole(Enum):
+    """
+    Defines the different roles that agents can play in a scenario.
+    This enum is used to identify the role of each agent during scenario execution,
+    enabling the framework to determine the order and interaction patterns between
+    different types of agents.
+    Attributes:
+        USER: Represents a user simulator agent that generates user inputs
+        AGENT: Represents the agent under test that responds to user inputs
+        JUDGE: Represents a judge agent that evaluates the conversation and determines success/failure
+    """
     USER = "User"
     AGENT = "Agent"
     JUDGE = "Judge"
 class AgentInput(BaseModel):
+    """
+    Input data structure passed to agent adapters during scenario execution.
+    This class encapsulates all the information an agent needs to generate its next response,
+    including conversation history, thread context, and scenario state. It provides convenient
+    methods to access the most recent user messages.
+    Attributes:
+        thread_id: Unique identifier for the conversation thread
+        messages: Complete conversation history as OpenAI-compatible messages
+        new_messages: Only the new messages since the agent's last call
+        judgment_request: Whether this call is requesting a judgment from a judge agent
+        scenario_state: Current state of the scenario execution
+    Example:
+        ```
+        class MyAgent(AgentAdapter):
+            async def call(self, input: AgentInput) -> str:
+                # Get the latest user message
+                user_msg = input.last_new_user_message_str()
+                # Process with your LLM/agent
+                response = await my_llm.complete(
+                    messages=input.messages,
+                    prompt=user_msg
+                )
+                return response
+        ```
+    """
     thread_id: str
     # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
     messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
     new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
-    context: Dict[str, Any]
-    requested_role: ScenarioAgentRole
-    scenario_state: ScenarioExecutorType = Field(exclude=True)
+    judgment_request: bool = False
+    scenario_state: ScenarioStateType
     def last_new_user_message(self) -> ChatCompletionUserMessageParam:
+        """
+        Get the most recent user message from the new messages.
+        Returns:
+            The last user message in OpenAI message format
+        Raises:
+            ValueError: If no new user messages are found
+        Example:
+            ```
+            user_message = input.last_new_user_message()
+            content = user_message["content"]
+            ```
+        """
         user_messages = [m for m in self.new_messages if m["role"] == "user"]
         if not user_messages:
             raise ValueError(
@@ -48,6 +102,24 @@ class AgentInput(BaseModel):
         return user_messages[-1]
     def last_new_user_message_str(self) -> str:
+        """
+        Get the content of the most recent user message as a string.
+        This is a convenience method for getting simple text content from user messages.
+        For multimodal messages or complex content, use last_new_user_message() instead.
+        Returns:
+            The text content of the last user message
+        Raises:
+            ValueError: If no new user messages found or if the message content is not a string
+        Example:
+            ```
+            user_text = input.last_new_user_message_str()
+            response = f"You said: {user_text}"
+            ```
+        """
         content = self.last_new_user_message()["content"]
         if type(content) != str:
             raise ValueError(
@@ -58,14 +130,41 @@ class AgentInput(BaseModel):
 class ScenarioResult(BaseModel):
     """
-    Represents the results of a scenario test run.
+    Represents the final result of a scenario test execution.
+    This class contains all the information about how a scenario performed,
+    including whether it succeeded, the conversation that took place, and
+    detailed reasoning about which criteria were met or failed.
     Attributes:
-        success: Whether the scenario passed
-        conversation: The conversation history
-        reasoning: Reasoning for the result
-        passed_criteria: List of criteria that were met
-        failed_criteria: List of criteria that were not met
+        success: Whether the scenario passed all criteria and completed successfully
+        messages: Complete conversation history that occurred during the scenario
+        reasoning: Detailed explanation of why the scenario succeeded or failed
+        passed_criteria: List of success criteria that were satisfied
+        failed_criteria: List of success criteria that were not satisfied
+        total_time: Total execution time in seconds (if measured)
+        agent_time: Time spent in agent calls in seconds (if measured)
+    Example:
+        ```
+        result = await scenario.run(
+            name="weather query",
+            description="User asks about weather",
+            agents=[
+                weather_agent,
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Agent provides helpful weather information"])
+            ]
+        )
+        print(f"Test {'PASSED' if result.success else 'FAILED'}")
+        print(f"Reasoning: {result.reasoning}")
+        if not result.success:
+            print("Failed criteria:")
+            for criteria in result.failed_criteria:
+                print(f"  - {criteria}")
+        ```
     """
     success: bool
@@ -77,7 +176,12 @@ class ScenarioResult(BaseModel):
     agent_time: Optional[float] = None
     def __repr__(self) -> str:
-        """Provide a concise representation for debugging."""
+        """
+        Provide a concise representation for debugging and logging.
+        Returns:
+            A string representation showing success status and reasoning
+        """
         status = "PASSED" if self.success else "FAILED"
         return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
@@ -85,12 +189,85 @@ class ScenarioResult(BaseModel):
 AgentReturnTypes = Union[
     str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], ScenarioResult
 ]
+"""
+Union type representing all valid return types for agent adapter call methods.
+Agent adapters can return any of these types:
+- str: Simple text response
+- ChatCompletionMessageParam: Single OpenAI-compatible message
+- List[ChatCompletionMessageParam]: Multiple OpenAI-compatible messages (for multi-step responses)
+- ScenarioResult: Direct test result (typically used by judge agents to end scenarios)
+Example:
+    ```
+    class MyAgent(AgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            # Return exactly ONE of the following alternatives. A simple string:
+            return "Hello, how can I help you?"
+            # Or a structured message
+            return {"role": "assistant", "content": "Hello!"}
+            # Or multiple messages for complex interactions
+            return [
+                {"role": "assistant", "content": "Let me search for that..."},
+                {"role": "assistant", "content": "Here's what I found: ..."}
+            ]
+    ```
+"""
 # TODO: remove the optional ScenarioResult return type from here, use events instead
 ScriptStep = Union[
-    Callable[["ScenarioExecutor"], None],
-    Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
+    Callable[["ScenarioState"], None],
+    Callable[["ScenarioState"], Optional[ScenarioResult]],
     # Async as well
-    Callable[["ScenarioExecutor"], Awaitable[None]],
-    Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
+    Callable[["ScenarioState"], Awaitable[None]],
+    Callable[["ScenarioState"], Awaitable[Optional[ScenarioResult]]],
 ]
+"""
+Union type for script step functions used in scenario scripts.
+Script steps are functions that can be called during scenario execution to control
+the flow, add custom assertions, or perform evaluations. They receive the current
+scenario state and can optionally return a result to end the scenario.
+The functions can be either synchronous or asynchronous.
+Example:
+    ```
+    def check_tool_call(state: ScenarioState) -> None:
+        assert state.has_tool_call("get_weather")
+    async def custom_evaluation(state: ScenarioState) -> Optional[ScenarioResult]:
+        eval_result = await some_external_evaluator(state.messages)
+        if not eval_result.passed:
+            return ScenarioResult(
+                success=False,
+                messages=state.messages,
+                reasoning="Custom evaluation failed"
+            )
+        return None  # Continue scenario
+    # Use in script
+    result = await scenario.run(
+        name="test",
+        description="Test scenario",
+        agents=[
+            MyAgent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+        ],
+        script=[
+            scenario.user("What's the weather?"),
+            scenario.agent(),
+            check_tool_call,
+            custom_evaluation,
+            scenario.succeed()
+        ]
+    )
+    ```
+"""

scenario/user_simulator_agent.py ADDED Viewed

@@ -0,0 +1,242 @@
+"""
+User simulator agent module for generating realistic user interactions.
+This module provides the UserSimulatorAgent class, which simulates human user
+behavior in conversations with agents under test. The simulator generates
+contextually appropriate user messages based on the scenario description and
+conversation history.
+"""
+import logging
+from typing import Optional, cast
+from litellm import Choices, completion
+from litellm.files.main import ModelResponse
+from scenario.cache import scenario_cache
+from scenario.agent_adapter import AgentAdapter
+from scenario._utils.utils import reverse_roles
+from scenario.config import ModelConfig, ScenarioConfig
+from ._error_messages import agent_not_configured_error_message
+from .types import AgentInput, AgentReturnTypes, AgentRole
+logger = logging.getLogger("scenario")
+class UserSimulatorAgent(AgentAdapter):
+    """
+    Agent that simulates realistic user behavior in scenario conversations.
+    This agent generates user messages that are appropriate for the given scenario
+    context, simulating how a real human user would interact with the agent under test.
+    It uses an LLM to generate natural, contextually relevant user inputs that help
+    drive the conversation forward according to the scenario description.
+    Attributes:
+        role: Always AgentRole.USER for user simulator agents
+        model: LLM model identifier to use for generating user messages
+        api_key: Optional API key for the model provider
+        temperature: Sampling temperature for response generation
+        max_tokens: Maximum tokens to generate in user messages
+        system_prompt: Custom system prompt to override default user simulation behavior
+    Example:
+        ```
+        import scenario
+        # Basic user simulator with default behavior
+        user_sim = scenario.UserSimulatorAgent(
+            model="openai/gpt-4.1-mini"
+        )
+        # Customized user simulator
+        custom_user_sim = scenario.UserSimulatorAgent(
+            model="openai/gpt-4.1-mini",
+            temperature=0.3,
+            system_prompt="You are a technical user who asks detailed questions"
+        )
+        # Use in scenario
+        result = await scenario.run(
+            name="user interaction test",
+            description="User seeks help with Python programming",
+            agents=[
+                my_programming_agent,
+                user_sim,
+                scenario.JudgeAgent(criteria=["Provides helpful code examples"])
+            ]
+        )
+        ```
+    Note:
+        - The user simulator automatically generates short, natural user messages
+        - It follows the scenario description to stay on topic
+        - Messages are generated in a casual, human-like style (lowercase, brief, etc.)
+        - The simulator will not act as an assistant - it only generates user inputs
+    """
+    role = AgentRole.USER
+    model: str
+    api_key: Optional[str]
+    temperature: float
+    max_tokens: Optional[int]
+    system_prompt: Optional[str]
+    def __init__(
+        self,
+        *,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        """
+        Initialize a user simulator agent.
+        Args:
+            model: LLM model identifier (e.g., "openai/gpt-4.1-mini").
+                   If not provided, uses the default model from global configuration.
+            api_key: API key for the model provider. If not provided,
+                     uses the key from global configuration or environment.
+            temperature: Sampling temperature for message generation (0.0-1.0).
+                        Lower values make responses more deterministic.
+            max_tokens: Maximum number of tokens to generate in user messages.
+                       If not provided, uses model defaults.
+            system_prompt: Custom system prompt to override default user simulation behavior.
+                          Use this to create specialized user personas or behaviors.
+        Raises:
+            Exception: If no model is configured either in parameters or global config
+        Example:
+            ```
+            # Basic user simulator
+            user_sim = UserSimulatorAgent(model="openai/gpt-4.1-mini")
+            # User simulator with custom persona
+            expert_user = UserSimulatorAgent(
+                model="openai/gpt-4.1-mini",
+                temperature=0.2,
+                system_prompt='''
+                You are an expert software developer testing an AI coding assistant.
+                Ask challenging, technical questions and be demanding about code quality.
+                '''
+            )
+            ```
+        """
+        # Override the default system prompt for the user simulator agent
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.system_prompt = system_prompt
+        if model:
+            self.model = model
+        if ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, str
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model
+        elif ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, ModelConfig
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model.model
+            self.api_key = (
+                api_key or ScenarioConfig.default_config.default_model.api_key
+            )
+            self.temperature = (
+                temperature or ScenarioConfig.default_config.default_model.temperature
+            )
+            self.max_tokens = (
+                max_tokens or ScenarioConfig.default_config.default_model.max_tokens
+            )
+        if not hasattr(self, "model"):
+            raise Exception(agent_not_configured_error_message("TestingAgent"))
+    @scenario_cache()
+    async def call(
+        self,
+        input: AgentInput,
+    ) -> AgentReturnTypes:
+        """
+        Generate the next user message in the conversation.
+        This method analyzes the current conversation state and scenario context
+        to generate an appropriate user message that moves the conversation forward
+        in a realistic, human-like manner.
+        Args:
+            input: AgentInput containing conversation history and scenario context
+        Returns:
+            AgentReturnTypes: A user message in OpenAI format that continues the conversation
+        Note:
+            - Messages are generated in a casual, human-like style
+            - The simulator follows the scenario description to stay contextually relevant
+            - Uses role reversal internally to work around LLM biases toward assistant roles
+            - Results are cached when cache_key is configured for deterministic testing
+        """
+        scenario = input.scenario_state
+        messages = [
+            {
+                "role": "system",
+                "content": self.system_prompt
+                or f"""
+<role>
+You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
+Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
+</role>
+<goal>
+Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
+</goal>
+<scenario>
+{scenario.description}
+</scenario>
+<rules>
+- DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
+</rules>
+""",
+            },
+            {"role": "assistant", "content": "Hello, how can I help you today?"},
+            *input.messages,
+        ]
+        # User to assistant role reversal
+        # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
+        # super confused, and Claude 3.7 even starts throwing exceptions.
+        messages = reverse_roles(messages)
+        response = cast(
+            ModelResponse,
+            completion(
+                model=self.model,
+                messages=messages,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                tools=[],
+            ),
+        )
+        # Extract the content from the response
+        if hasattr(response, "choices") and len(response.choices) > 0:
+            message = cast(Choices, response.choices[0]).message
+            message_content = message.content
+            if message_content is None:
+                raise Exception(f"No response from LLM: {response.__repr__()}")
+            return {"role": "user", "content": message_content}
+        else:
+            raise Exception(
+                f"Unexpected response format from LLM: {response.__repr__()}"
+            )

langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl