PyPI - langwatch-scenario - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/METADATA +60 -12
langwatch_scenario-0.3.0.dist-info/RECORD +16 -0
scenario/__init__.py +13 -3
scenario/config.py +18 -7
scenario/error_messages.py +81 -23
scenario/pytest_plugin.py +1 -1
scenario/scenario.py +135 -20
scenario/scenario_agent_adapter.py +16 -0
scenario/scenario_executor.py +405 -143
scenario/testing_agent.py +75 -58
scenario/types.py +96 -0
scenario/utils.py +148 -5
langwatch_scenario-0.2.0.dist-info/RECORD +0 -15
scenario/result.py +0 -74
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/WHEEL +0 -0
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/entry_points.txt +0 -0
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/top_level.txt +0 -0

{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.2.0
+Version: 0.3.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -25,11 +25,13 @@ Requires-Dist: joblib>=1.4.2
 Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
 Requires-Dist: rich<15.0.0,>=13.3.3
+Requires-Dist: pksuid>=1.1.2
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
-Requires-Dist: mypy; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: commitizen; extra == "dev"
 ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)
@@ -39,9 +41,9 @@ Requires-Dist: pytest-cov; extra == "dev"
 # Scenario: Use an Agent to test your Agent
-Scenario is a library for testing agents end-to-end as a human would, but without having to manually do it. The automated testing agent covers every single scenario for you.
+Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.
-You define the scenarios, and the testing agent will simulate your users as it follows them, it will keep chatting and evaluating your agent until it reaches the desired goal or detects an unexpected behavior.
+You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
@@ -63,20 +65,23 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
 ```python
 import pytest
-from scenario import Scenario, TestingAgent, scenario_cache
+from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
+# Create an adapter to call your agent
+class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+    def __init__(self, input: AgentInput):
+        self.agent = VegetarianRecipeAgent()
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        return self.agent.run(input.last_new_user_message_str())
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-    agent = VegetarianRecipeAgent()
-    def vegetarian_recipe_agent(message, context):
-        # Call your agent here
-        return agent.run(message)
     # Define the simulated scenario
     scenario = Scenario(
         name="dinner idea",
@@ -133,7 +138,7 @@ class VegetarianRecipeAgent:
         message = response.choices[0].message  # type: ignore
         self.history.append(message)
-        return {"messages": [message]}
+        return [message]
 ```
@@ -186,6 +191,49 @@ result = await scenario.run()
 You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+## Specify a script for guiding the scenario
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_ai_assistant_agent():
+    scenario = Scenario(
+        name="false assumptions",
+        description="""
+            The agent makes false assumption about being an ATM bank, and user corrects it
+        """,
+        agent=AiAssistantAgentAdapter,
+        criteria=[
+            "user should get good recommendations on river crossing",
+            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+        ],
+        max_turns=5,
+    )
+    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+        assert state.has_tool_call("web_search")
+    result = await scenario.script(
+        [
+            # Define existing history of messages
+            scenario.user("how do I safely approach a bank?"),
+            # Or let it be generate automatically
+            scenario.agent(),
+            # Add custom assertions, for example making sure a tool was called
+            check_if_tool_was_called,
+            scenario.user(),
+            # Let the simulation proceed for 2 more turns
+            scenario.proceed(turns=2),
+            # Time to make a judgment call
+            scenario.judge(),
+        ]
+    ).run()
+    assert result.success
+```
 ## Debug mode
 You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.

langwatch_scenario-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+scenario/__init__.py,sha256=0OavO4hoZMFL6frlplNkR7BSHfGSOhuVtmKmTrOMFEs,844
+scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+scenario/config.py,sha256=NiCCmr8flds-VDzvF8ps4SChVTARtcWfEoHhK0UkDMQ,1076
+scenario/error_messages.py,sha256=8_pa3HIaqkw08qOqeiRKDCNykr9jtofpNJoEV03aRWc,4690
+scenario/pytest_plugin.py,sha256=oJtEPVPi5x50Z-UawVyVPNd6buvh_4msSZ-3hLFpw_Y,5770
+scenario/scenario.py,sha256=K4Snu4-pJaoprEFyly7ZQT8qNlAamxt-eXibCJ0EIJU,7332
+scenario/scenario_agent_adapter.py,sha256=Y2dP3z-2jLYCssQ20oHOphwwrRPQNo2HmLD2KBcJRu0,427
+scenario/scenario_executor.py,sha256=geaP3Znd1he66L6ku3l2IAODj68TtAIk8b8Ssy494xA,15681
+scenario/testing_agent.py,sha256=5S2PIl2hi9FBSVjjs9afXhEgiogryjBIyffH5iJBwdo,10676
+scenario/types.py,sha256=-Uz0qg_fY5vAEkrZnM5CMqE5hiP8OtNErpDdHJmHtac,3179
+scenario/utils.py,sha256=bx813RpZO3xyPfD-dTBbeLM9umWm3PGOq9pw48aJoHI,8113
+langwatch_scenario-0.3.0.dist-info/METADATA,sha256=pywrVOVE2eE4Zk5wePzJoEfErNXWvgK-C8G-qfWp7EI,11040
+langwatch_scenario-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+langwatch_scenario-0.3.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.3.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.3.0.dist-info/RECORD,,

scenario/__init__.py CHANGED Viewed

@@ -3,10 +3,11 @@ Scenario: A testing library for conversational agents.
 """
 # First import non-dependent modules
-from .result import ScenarioResult
+from .types import ScenarioResult, AgentInput, ScenarioAgentRole, AgentReturnTypes
 from .config import ScenarioConfig
 # Then import modules with dependencies
+from .scenario_agent_adapter import ScenarioAgentAdapter
 from .testing_agent import TestingAgent
 from .scenario import Scenario
 from .cache import scenario_cache
@@ -15,10 +16,19 @@ from .cache import scenario_cache
 from .pytest_plugin import pytest_configure, scenario_reporter
 __all__ = [
-    "Scenario",
-    "TestingAgent",
+    # Types
     "ScenarioResult",
+    "AgentInput",
+    "ScenarioAgentRole",
     "ScenarioConfig",
+    "AgentReturnTypes",
+    # Classes
+    "Scenario",
+    "ScenarioAgentAdapter",
+    "TestingAgent",
+    # Plugins
     "pytest_configure",
     "scenario_reporter",
     "scenario_cache",

scenario/config.py CHANGED Viewed

@@ -2,10 +2,16 @@
 Configuration module for Scenario.
 """
-from typing import Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Type, Union
 from pydantic import BaseModel
-from scenario.testing_agent import TestingAgent
+if TYPE_CHECKING:
+    from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+    ScenarioAgentType = ScenarioAgentAdapter
+else:
+    ScenarioAgentType = Any
 class ScenarioConfig(BaseModel):
     """
@@ -15,14 +21,19 @@ class ScenarioConfig(BaseModel):
     such as the LLM provider and model to use for the testing agent.
     """
-    testing_agent: Optional[TestingAgent] = None
+    testing_agent: Optional[Type[ScenarioAgentType]] = None
     max_turns: Optional[int] = 10
     verbose: Optional[Union[bool, int]] = True
     cache_key: Optional[str] = None
     debug: Optional[bool] = False
     def merge(self, other: "ScenarioConfig") -> "ScenarioConfig":
-        return ScenarioConfig(**{
-            **self.model_dump(),
-            **other.model_dump(exclude_none=True),
-        })
+        return ScenarioConfig(
+            **{
+                **self.items(),
+                **other.items(),
+            }
+        )
+    def items(self):
+        return {k: getattr(self, k) for k in self.model_dump(exclude_none=True).keys()}

scenario/error_messages.py CHANGED Viewed

@@ -36,41 +36,99 @@ default_config_error_message = f"""
         result = scenario.run()
         assert result.success
-                          """
+"""
-def message_return_error_message(got: Any):
-    got_ = got.__repr__()
+testing_agent_not_configured_error_message = f"""
+ {termcolor.colored("->", "cyan")} Testing agent was initialized without a model, please set the model when defining the testing agent, for example:
+    TestingAgent.with_config(model="openai/gpt-4.1-mini")
+    {termcolor.colored("^" * 53, "green")}
+"""
+def message_return_error_message(got: Any, class_name: str):
+    got_ = repr(got)
     if len(got_) > 100:
         got_ = got_[:100] + "..."
     return f"""
- {termcolor.colored("->", "cyan")} Your agent returned:
+ {termcolor.colored("->", "cyan")} On the {termcolor.colored("call", "green")} method of the {class_name} agent adapter, you returned:
 {indent(got_, ' ' * 4)}
- {termcolor.colored("->", "cyan")} But your agent should return a dict with either a "message" string key or a "messages" key in OpenAI messages format so the testing agent can understand what happened. For example:
+ {termcolor.colored("->", "cyan")} But the adapter should return either a string, a dict on the OpenAI messages format, or a list of messages in the OpenAI messages format so the testing agent can understand what happened. For example:
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+            return response.output_text
+            {termcolor.colored("^" * 27, "green")}
+ {termcolor.colored("->", "cyan")} Alternatively, you can return a list of messages in OpenAI messages format, this is useful for capturing tool calls and other before the final response:
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+            return [
+                {{"role": "assistant", "content": response.output_text}},
+                {termcolor.colored("^" * 55, "green")}
+            ]
+"""
+def message_invalid_agent_type(got: Any):
+    got_ = repr(got)
+    if len(got_) > 100:
+        got_ = got_[:100] + "..."
+    return f"""
+ {termcolor.colored("->", "cyan")} The {termcolor.colored("agent", "green")} argument of Scenario needs to receive a class that inherits from {termcolor.colored("ScenarioAgentAdapter", "green")}, but you passed:
+{indent(got_, ' ' * 4)}
-    def my_agent_under_test(message, context):
-        response = call_my_agent(message)
+ {termcolor.colored("->", "cyan")} Instead, wrap your agent in a ScenarioAgentAdapter subclass. For example:
-        return {{
-            "message": response.output_text
-            {termcolor.colored("^" * 31, "green")}
-        }}
+    class MyAgentAdapter(ScenarioAgentAdapter):
+    {termcolor.colored("^" * 43, "green")}
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
- {termcolor.colored("->", "cyan")} Alternatively, you can return a list of messages in OpenAI messages format, you can also optionally provide extra artifacts:
+            return response.output_text
-    def my_agent_under_test(message, context):
-        response = call_my_agent(message)
+ {termcolor.colored("->", "cyan")} And then you can use that on your scenario definition:
-        return {{
-            "messages": [
-                {{"role": "assistant", "content": response}}
-                {termcolor.colored("^" * 42, "green")}
+    @pytest.mark.agent_test
+    def test_my_agent():
+        scenario = Scenario(
+            name="first scenario",
+            description=\"\"\"
+                Example scenario description to test your agent.
+            \"\"\",
+            agent=MyAgentAdapter,
+            {termcolor.colored("^" * 20, "green")}
+            criteria=[
+                "Requirement One",
+                "Requirement Two",
             ],
-            "extra": {{
-                # ... optional extra artifacts
-            }}
-        }}
-                          """
+        )
+        result = scenario.run()
+        assert result.success
+"""
+def agent_response_not_awaitable(class_name: str):
+    return f"""
+ {termcolor.colored("->", "cyan")} The {termcolor.colored("call", "green")} method of the {class_name} agent adapter returned a non-awaitable response, you probably forgot to add the {termcolor.colored("async", "green")} keyword to the method definition, make sure your code looks like this:
+    class {class_name}(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+        {termcolor.colored("^" * 5, "green")}
+            response = call_my_agent(message)
+            return response.output_text
+"""

scenario/pytest_plugin.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import TypedDict
 import functools
 from termcolor import colored
-from scenario.result import ScenarioResult
+from scenario.types import ScenarioResult
 from .scenario import Scenario

scenario/scenario.py CHANGED Viewed

@@ -2,16 +2,29 @@
 Scenario module: defines the core Scenario class for agent testing.
 """
-from typing import Awaitable, List, Dict, Any, Optional, Callable, TypedDict, Union
+from typing import (
+    Awaitable,
+    Callable,
+    List,
+    Dict,
+    Any,
+    Optional,
+    Type,
+    TypedDict,
+    Union,
+)
 import asyncio
 import concurrent.futures
-from functools import partial
 from scenario.config import ScenarioConfig
+from scenario.error_messages import (
+    default_config_error_message,
+    message_invalid_agent_type,
+)
+from scenario.scenario_agent_adapter import ScenarioAgentAdapter
 from scenario.scenario_executor import ScenarioExecutor
-from .result import ScenarioResult
-from .testing_agent import TestingAgent
+from .types import ScenarioResult, ScriptStep
 from openai.types.chat import ChatCompletionMessageParam
@@ -34,18 +47,38 @@ class Scenario(ScenarioConfig):
     name: str
     description: str
-    agent: Union[
-        Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
-        Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
-    ]
+    agents: List[Type[ScenarioAgentAdapter]]
     criteria: List[str]
-    def __init__(self, name: str, description: str, **kwargs):
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        criteria: List[str] = [],
+        agent: Optional[Type[ScenarioAgentAdapter]] = None,
+        testing_agent: Optional[Type[ScenarioAgentAdapter]] = None,
+        agents: List[Type[ScenarioAgentAdapter]] = [],
+        max_turns: Optional[int] = None,
+        verbose: Optional[Union[bool, int]] = None,
+        cache_key: Optional[str] = None,
+        debug: Optional[bool] = None,
+    ):
         """Validate scenario configuration after initialization."""
-        default_config = getattr(Scenario, "default_config", None)
+        config = ScenarioConfig(
+            testing_agent=testing_agent,
+            max_turns=max_turns,
+            verbose=verbose,
+            cache_key=cache_key,
+            debug=debug,
+        )
+        kwargs = config.items()
+        default_config: Optional[ScenarioConfig] = getattr(
+            Scenario, "default_config", None
+        )
         if default_config:
-            kwargs = {**default_config.model_dump(), **kwargs}
+            kwargs = default_config.merge(config).items()
         if not name:
             raise ValueError("Scenario name cannot be empty")
@@ -55,19 +88,48 @@ class Scenario(ScenarioConfig):
             raise ValueError("Scenario description cannot be empty")
         kwargs["description"] = description
-        # TODO: allow not having any criteria, for scripted scenarios
-        if not kwargs.get("criteria"):
-            raise ValueError("Scenario must have at least one criteria")
+        kwargs["criteria"] = criteria
-        if kwargs.get("max_turns", 0) < 1:
+        if kwargs.get("max_turns", 10) < 1:
             raise ValueError("max_turns must be a positive integer")
-        # Ensure agent is callable
-        if not callable(kwargs.get("agent")):
-            raise ValueError("Agent must be a callable function")
+        if not agents and not agent:
+            raise ValueError(
+                "Missing required argument `agent`. Either `agent` or `agents` argument must be provided for the Scenario"
+            )
+        if not agents and not kwargs.get("testing_agent"):
+            raise Exception(default_config_error_message)
+        agents = agents or [
+            kwargs.get("testing_agent"),
+            agent,  # type: ignore
+        ]
+        # Ensure each agent is a ScenarioAgentAdapter
+        for agent in agents:
+            if (
+                not agent
+                or not isinstance(agent, type)
+                or not issubclass(agent, ScenarioAgentAdapter)
+            ):
+                raise ValueError(message_invalid_agent_type(agent))
+        kwargs["agents"] = agents
         super().__init__(**kwargs)
+    def script(self, script: List[ScriptStep]):
+        class ScriptedScenario:
+            def __init__(self, scenario: "Scenario"):
+                self._scenario = scenario
+            async def run(
+                self, context: Optional[Dict[str, Any]] = None
+            ) -> ScenarioResult:
+                return await self._scenario._run(context, script)
+        return ScriptedScenario(self)
     async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
         """
         Run the scenario against the agent under test.
@@ -79,6 +141,13 @@ class Scenario(ScenarioConfig):
             ScenarioResult containing the test outcome
         """
+        return await self._run(context, None)
+    async def _run(
+        self,
+        context: Optional[Dict[str, Any]] = None,
+        script: Optional[List[ScriptStep]] = None,
+    ) -> ScenarioResult:
         # We'll use a thread pool to run the execution logic, we
         # require a separate thread because even though asyncio is
         # being used throughout, any user code on the callback can
@@ -90,7 +159,9 @@ class Scenario(ScenarioConfig):
                 asyncio.set_event_loop(loop)
                 try:
-                    return loop.run_until_complete(ScenarioExecutor(self).run(context))
+                    return loop.run_until_complete(
+                        ScenarioExecutor(self, context, script).run()
+                    )
                 finally:
                     loop.close()
@@ -104,7 +175,7 @@ class Scenario(ScenarioConfig):
     @classmethod
     def configure(
         cls,
-        testing_agent: Optional[TestingAgent] = None,
+        testing_agent: Optional[Type[ScenarioAgentAdapter]] = None,
         max_turns: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
@@ -121,3 +192,47 @@ class Scenario(ScenarioConfig):
                 debug=debug,
             )
         )
+    # Scenario Scripting
+    def message(self, message: ChatCompletionMessageParam) -> ScriptStep:
+        return lambda state: state.message(message)
+    def user(
+        self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+    ) -> ScriptStep:
+        return lambda state: state.user(content)
+    def agent(
+        self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+    ) -> ScriptStep:
+        return lambda state: state.agent(content)
+    def judge(
+        self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+    ) -> ScriptStep:
+        return lambda state: state.judge(content)
+    def proceed(
+        self,
+        turns: Optional[int] = None,
+        on_turn: Optional[
+            Union[
+                Callable[[ScenarioExecutor], None],
+                Callable[[ScenarioExecutor], Awaitable[None]],
+            ]
+        ] = None,
+        on_step: Optional[
+            Union[
+                Callable[[ScenarioExecutor], None],
+                Callable[[ScenarioExecutor], Awaitable[None]],
+            ]
+        ] = None,
+    ) -> ScriptStep:
+        return lambda state: state.proceed(turns, on_turn, on_step)
+    def succeed(self) -> ScriptStep:
+        return lambda state: state.succeed()
+    def fail(self) -> ScriptStep:
+        return lambda state: state.fail()

scenario/scenario_agent_adapter.py ADDED Viewed

@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+from typing import ClassVar, Set
+from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole
+class ScenarioAgentAdapter(ABC):
+    roles: ClassVar[Set[ScenarioAgentRole]] = {ScenarioAgentRole.AGENT}
+    def __init__(self, input: AgentInput):
+        super().__init__()
+        pass
+    @abstractmethod
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        pass

langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl