langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,302 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: langwatch-scenario
3
- Version: 0.3.0
4
- Summary: The end-to-end agent testing library
5
- Author-email: LangWatch Team <support@langwatch.ai>
6
- License: MIT
7
- Project-URL: Homepage, https://github.com/langwatch/scenario
8
- Project-URL: Bug Tracker, https://github.com/langwatch/scenario/issues
9
- Classifier: Development Status :: 4 - Beta
10
- Classifier: Intended Audience :: Developers
11
- Classifier: License :: OSI Approved :: MIT License
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.8
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
17
- Requires-Python: >=3.9
18
- Description-Content-Type: text/markdown
19
- Requires-Dist: pytest>=8.1.1
20
- Requires-Dist: litellm>=1.49.0
21
- Requires-Dist: python-dotenv>=1.0.1
22
- Requires-Dist: termcolor>=2.4.0
23
- Requires-Dist: pydantic>=2.7.0
24
- Requires-Dist: joblib>=1.4.2
25
- Requires-Dist: wrapt>=1.17.2
26
- Requires-Dist: pytest-asyncio>=0.26.0
27
- Requires-Dist: rich<15.0.0,>=13.3.3
28
- Requires-Dist: pksuid>=1.1.2
29
- Provides-Extra: dev
30
- Requires-Dist: black; extra == "dev"
31
- Requires-Dist: isort; extra == "dev"
32
- Requires-Dist: pytest-cov; extra == "dev"
33
- Requires-Dist: pre-commit; extra == "dev"
34
- Requires-Dist: commitizen; extra == "dev"
35
-
36
- ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)
37
-
38
- <div align="center">
39
- <!-- Discord, PyPI, Docs, etc links -->
40
- </div>
41
-
42
- # Scenario: Use an Agent to test your Agent
43
-
44
- Scenario is an Agent Testing Framework that tests AI agents through Simulation Testing.
45
-
46
- You define the scenarios, and the testing agent simulates a real user as it follows them. It keeps chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects unexpected behavior based on the criteria you defined.
47
-
48
- [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
49
-
50
- ### See also
51
-
52
- - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
53
- - [Scenario Go](https://github.com/langwatch/scenario-go/)
54
-
55
- ## Getting Started
56
-
57
- Install pytest and scenario:
58
-
59
- ```bash
60
- pip install pytest langwatch-scenario
61
- ```
62
-
63
- Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:
64
-
65
- ```python
66
- import pytest
67
-
68
- from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
69
-
70
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
71
-
72
-
73
- # Create an adapter to call your agent
74
- class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
75
- def __init__(self, input: AgentInput):
76
- self.agent = VegetarianRecipeAgent()
77
-
78
- async def call(self, input: AgentInput) -> AgentReturnTypes:
79
- return self.agent.run(input.last_new_user_message_str())
80
-
81
-
82
- @pytest.mark.agent_test
83
- @pytest.mark.asyncio
84
- async def test_vegetarian_recipe_agent():
85
- # Define the simulated scenario
86
- scenario = Scenario(
87
- name="dinner idea",
88
- description="""
89
- It's Saturday evening, the user is very hungry and tired,
90
- but has no money to order out, so they are looking for a recipe.
91
-
92
- The user never mentions they want a vegetarian recipe.
93
- """,
94
- agent=VegetarianRecipeAgentAdapter,
95
- # List the evaluation criteria for the scenario to be considered successful
96
- criteria=[
97
- "Agent should not ask more than two follow-up questions",
98
- "Agent should generate a recipe",
99
- "Recipe should include a list of ingredients",
100
- "Recipe should include step-by-step cooking instructions",
101
- "Recipe should be vegetarian and not include any sort of meat",
102
- ],
103
- )
104
-
105
- # Run the scenario and get results
106
- result = await scenario.run()
107
-
108
- # Assert for pytest to know whether the test passed
109
- assert result.success
110
-
111
-
112
- # Example agent implementation
113
- import litellm
114
-
115
-
116
- class VegetarianRecipeAgent:
117
- def __init__(self):
118
- self.history = []
119
-
120
- @scenario_cache()
121
- def run(self, message: str):
122
- self.history.append({"role": "user", "content": message})
123
-
124
- response = litellm.completion(
125
- model="openai/gpt-4o-mini",
126
- messages=[
127
- {
128
- "role": "system",
129
- "content": """
130
- You are a vegetarian recipe agent.
131
- Given the user request, ask AT MOST ONE follow-up question,
132
- then provide a complete recipe. Keep your responses concise and focused.
133
- """,
134
- },
135
- *self.history,
136
- ],
137
- )
138
- message = response.choices[0].message # type: ignore
139
- self.history.append(message)
140
-
141
- return [message]
142
-
143
- ```
144
-
145
- Create a `.env` file and put your OpenAI API key in it:
146
-
147
- ```bash
148
- OPENAI_API_KEY=<your-api-key>
149
- ```
150
-
151
- Now run it with pytest:
152
-
153
- ```bash
154
- pytest -s tests/test_vegetarian_recipe_agent.py
155
- ```
156
-
157
- This is what it will look like:
158
-
159
- [![asciicast](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11.svg)](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)
160
-
161
- You can find a fully working example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
162
-
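- Since the test is marked with `@pytest.mark.agent_test`, you can also select every agent test at once using pytest's standard marker filter (a minimal sketch; adjust flags to your setup):
-
- ```bash
- pytest -s -m agent_test
- ```
-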
163
- ## Customize strategy and max_turns
164
-
165
- You can customize how the testing agent should go about testing by defining a `strategy` field. You can also limit the maximum number of turns the scenario will take by setting the `max_turns` field (defaults to 10).
166
-
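- If you want the same turn limit across all scenarios, `max_turns` can also be set globally. A minimal sketch based on the `Scenario.configure` signature, reusing the same model as the earlier examples:
-
- ```python
- Scenario.configure(
-     testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
-     max_turns=5,
- )
- ```
-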
167
- For example, in this Lovable Clone scenario test:
168
-
169
- ```python
170
- scenario = Scenario(
171
- name="dog walking startup landing page",
172
- description="""
173
- the user wants to create a new landing page for their dog walking startup
174
-
175
- send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
176
- """,
177
- agent=lovable_agent,
178
- criteria=[
179
- "agent reads the files before go and making changes",
180
- "agent modified the index.css file, not only the Index.tsx file",
181
- "agent created a comprehensive landing page",
182
- "agent extended the landing page with a new section",
183
- "agent should NOT say it can't read the file",
184
- "agent should NOT produce incomplete code or be too lazy to finish",
185
- ],
186
- max_turns=5,
187
- )
188
-
189
- result = await scenario.run()
190
- ```
191
-
192
- You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
193
-
194
- ## Specify a script for guiding the scenario
195
-
196
- You can specify a script for guiding the scenario by passing a list of steps to the `script()` method.
197
-
198
- ```python
199
- @pytest.mark.agent_test
200
- @pytest.mark.asyncio
201
- async def test_ai_assistant_agent():
202
- scenario = Scenario(
203
- name="false assumptions",
204
- description="""
205
- The agent makes a false assumption that the user is asking about an ATM bank, and the user corrects it
206
- """,
207
- agent=AiAssistantAgentAdapter,
208
- criteria=[
209
- "user should get good recommendations on river crossing",
210
- "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
211
- ],
212
- max_turns=5,
213
- )
214
-
215
- def check_if_tool_was_called(state: ScenarioExecutor) -> None:
216
- assert state.has_tool_call("web_search")
217
-
218
- result = await scenario.script(
219
- [
220
- # Define existing history of messages
221
- scenario.user("how do I safely approach a bank?"),
222
- # Or let it be generated automatically
223
- scenario.agent(),
224
- # Add custom assertions, for example making sure a tool was called
225
- check_if_tool_was_called,
226
- scenario.user(),
227
- # Let the simulation proceed for 2 more turns
228
- scenario.proceed(turns=2),
229
- # Time to make a judgment call
230
- scenario.judge(),
231
- ]
232
- ).run()
233
-
234
- assert result.success
235
- ```
236
-
237
- ## Debug mode
238
-
239
- You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
240
-
241
- Debug mode allows you to see the messages in slow motion, step by step, and to intervene with your own inputs to debug your agent from the middle of the conversation.
242
-
243
- ```python
244
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
245
- ```
246
-
247
- or
248
-
249
- ```bash
250
- pytest -s tests/test_vegetarian_recipe_agent.py --debug
251
- ```
252
-
253
- ## Cache
254
-
255
- Each time the scenario runs, the testing agent might choose a different input to start. This is good for covering the variance of real users, but the non-deterministic nature of it can also make tests less repeatable, more costly, and harder to debug. To solve this, you can set the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running; this will make the testing agent give the same input for the same scenario:
256
-
257
- ```python
258
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), cache_key="42")
259
- ```
260
-
261
- To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.
262
-
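- For example, to wipe the on-disk cache entirely (assuming the default location mentioned above):
-
- ```bash
- rm -rf ~/.scenario/cache
- ```
-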
263
- To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions on your application side with the `@scenario_cache` decorator:
264
-
265
- ```python
266
- class MyAgent:
267
- @scenario_cache(ignore=["self"])
268
- def invoke(self, message, context):
269
- return client.chat.completions.create(
270
- # ...
271
- )
272
- ```
273
-
274
- This will cache any function calls you decorate when running the tests and make them repeatable, hashed by the function arguments, the scenario being executed, and the `cache_key` you provided. You can exclude arguments that should not be part of the cache key by naming them in the `ignore` argument.
275
-
276
- ## Disable Output
277
-
278
- You can remove the `-s` flag from pytest to hide the output during the test; it will then only show up if the test fails. Alternatively, you can set `verbose=False` in the `Scenario.configure` method or in the specific scenario you are running.
279
-
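- For example, to silence the output globally, a minimal sketch reusing the same configure call from earlier:
-
- ```python
- Scenario.configure(
-     testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
-     verbose=False,
- )
- ```
-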
280
- ## Running in parallel
281
-
282
- As the number of your scenarios grows, you might want to run them in parallel to speed up your whole test suite. We suggest using the [pytest-asyncio-concurrent](https://pypi.org/project/pytest-asyncio-concurrent/) plugin to do so.
283
-
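- Installing it is a regular pip install:
-
- ```bash
- pip install pytest-asyncio-concurrent
- ```
-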
284
- Then replace the `@pytest.mark.asyncio` annotation in your tests with `@pytest.mark.asyncio_concurrent`, adding a group name to mark the group of scenarios that should run in parallel together, e.g.:
285
-
286
- ```python
287
- @pytest.mark.agent_test
288
- @pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
289
- async def test_vegetarian_recipe_agent():
290
- # ...
291
-
292
- @pytest.mark.agent_test
293
- @pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
294
- async def test_user_is_very_hungry():
295
- # ...
296
- ```
297
-
298
- Those two scenarios should now run in parallel.
299
-
300
- ## License
301
-
302
- MIT License
@@ -1,16 +0,0 @@
1
- scenario/__init__.py,sha256=0OavO4hoZMFL6frlplNkR7BSHfGSOhuVtmKmTrOMFEs,844
2
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
3
- scenario/config.py,sha256=NiCCmr8flds-VDzvF8ps4SChVTARtcWfEoHhK0UkDMQ,1076
4
- scenario/error_messages.py,sha256=8_pa3HIaqkw08qOqeiRKDCNykr9jtofpNJoEV03aRWc,4690
5
- scenario/pytest_plugin.py,sha256=oJtEPVPi5x50Z-UawVyVPNd6buvh_4msSZ-3hLFpw_Y,5770
6
- scenario/scenario.py,sha256=K4Snu4-pJaoprEFyly7ZQT8qNlAamxt-eXibCJ0EIJU,7332
7
- scenario/scenario_agent_adapter.py,sha256=Y2dP3z-2jLYCssQ20oHOphwwrRPQNo2HmLD2KBcJRu0,427
8
- scenario/scenario_executor.py,sha256=geaP3Znd1he66L6ku3l2IAODj68TtAIk8b8Ssy494xA,15681
9
- scenario/testing_agent.py,sha256=5S2PIl2hi9FBSVjjs9afXhEgiogryjBIyffH5iJBwdo,10676
10
- scenario/types.py,sha256=-Uz0qg_fY5vAEkrZnM5CMqE5hiP8OtNErpDdHJmHtac,3179
11
- scenario/utils.py,sha256=bx813RpZO3xyPfD-dTBbeLM9umWm3PGOq9pw48aJoHI,8113
12
- langwatch_scenario-0.3.0.dist-info/METADATA,sha256=pywrVOVE2eE4Zk5wePzJoEfErNXWvgK-C8G-qfWp7EI,11040
13
- langwatch_scenario-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- langwatch_scenario-0.3.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
15
- langwatch_scenario-0.3.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
16
- langwatch_scenario-0.3.0.dist-info/RECORD,,
scenario/scenario.py DELETED
@@ -1,238 +0,0 @@
1
- """
2
- Scenario module: defines the core Scenario class for agent testing.
3
- """
4
-
5
- from typing import (
6
- Awaitable,
7
- Callable,
8
- List,
9
- Dict,
10
- Any,
11
- Optional,
12
- Type,
13
- TypedDict,
14
- Union,
15
- )
16
- import asyncio
17
- import concurrent.futures
18
-
19
- from scenario.config import ScenarioConfig
20
- from scenario.error_messages import (
21
- default_config_error_message,
22
- message_invalid_agent_type,
23
- )
24
- from scenario.scenario_agent_adapter import ScenarioAgentAdapter
25
- from scenario.scenario_executor import ScenarioExecutor
26
-
27
- from .types import ScenarioResult, ScriptStep
28
-
29
- from openai.types.chat import ChatCompletionMessageParam
30
-
31
-
32
- class AgentResult(TypedDict, total=False):
33
- message: str
34
- messages: List[ChatCompletionMessageParam]
35
- extra: Dict[str, Any]
36
-
37
-
38
- class Scenario(ScenarioConfig):
39
- """
40
- A scenario represents a specific testing case for an agent.
41
-
42
- It includes:
43
- - A description of the scenario
44
- - Criteria to determine if the agent behaved correctly
45
- - Optional additional parameters
46
- """
47
-
48
- name: str
49
- description: str
50
- agents: List[Type[ScenarioAgentAdapter]]
51
- criteria: List[str]
52
-
53
- def __init__(
54
- self,
55
- name: str,
56
- description: str,
57
- criteria: List[str] = [],
58
- agent: Optional[Type[ScenarioAgentAdapter]] = None,
59
- testing_agent: Optional[Type[ScenarioAgentAdapter]] = None,
60
- agents: List[Type[ScenarioAgentAdapter]] = [],
61
- max_turns: Optional[int] = None,
62
- verbose: Optional[Union[bool, int]] = None,
63
- cache_key: Optional[str] = None,
64
- debug: Optional[bool] = None,
65
- ):
66
- """Validate scenario configuration after initialization."""
67
-
68
- config = ScenarioConfig(
69
- testing_agent=testing_agent,
70
- max_turns=max_turns,
71
- verbose=verbose,
72
- cache_key=cache_key,
73
- debug=debug,
74
- )
75
-
76
- kwargs = config.items()
77
- default_config: Optional[ScenarioConfig] = getattr(
78
- Scenario, "default_config", None
79
- )
80
- if default_config:
81
- kwargs = default_config.merge(config).items()
82
-
83
- if not name:
84
- raise ValueError("Scenario name cannot be empty")
85
- kwargs["name"] = name
86
-
87
- if not description:
88
- raise ValueError("Scenario description cannot be empty")
89
- kwargs["description"] = description
90
-
91
- kwargs["criteria"] = criteria
92
-
93
- if kwargs.get("max_turns", 10) < 1:
94
- raise ValueError("max_turns must be a positive integer")
95
-
96
- if not agents and not agent:
97
- raise ValueError(
98
- "Missing required argument `agent`. Either `agent` or `agents` argument must be provided for the Scenario"
99
- )
100
-
101
- if not agents and not kwargs.get("testing_agent"):
102
- raise Exception(default_config_error_message)
103
-
104
- agents = agents or [
105
- kwargs.get("testing_agent"),
106
- agent, # type: ignore
107
- ]
108
-
109
- # Ensure each agent is a ScenarioAgentAdapter
110
- for agent in agents:
111
- if (
112
- not agent
113
- or not isinstance(agent, type)
114
- or not issubclass(agent, ScenarioAgentAdapter)
115
- ):
116
- raise ValueError(message_invalid_agent_type(agent))
117
- kwargs["agents"] = agents
118
-
119
- super().__init__(**kwargs)
120
-
121
- def script(self, script: List[ScriptStep]):
122
- class ScriptedScenario:
123
- def __init__(self, scenario: "Scenario"):
124
- self._scenario = scenario
125
-
126
- async def run(
127
- self, context: Optional[Dict[str, Any]] = None
128
- ) -> ScenarioResult:
129
- return await self._scenario._run(context, script)
130
-
131
- return ScriptedScenario(self)
132
-
133
- async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
134
- """
135
- Run the scenario against the agent under test.
136
-
137
- Args:
138
- context: Optional initial context for the agent
139
-
140
- Returns:
141
- ScenarioResult containing the test outcome
142
- """
143
-
144
- return await self._run(context, None)
145
-
146
- async def _run(
147
- self,
148
- context: Optional[Dict[str, Any]] = None,
149
- script: Optional[List[ScriptStep]] = None,
150
- ) -> ScenarioResult:
151
- # We'll use a thread pool to run the execution logic, we
152
- # require a separate thread because even though asyncio is
153
- # being used throughout, any user code on the callback can
154
- # be blocking, preventing them from running scenarios in parallel
155
- with concurrent.futures.ThreadPoolExecutor() as executor:
156
-
157
- def run_in_thread():
158
- loop = asyncio.new_event_loop()
159
- asyncio.set_event_loop(loop)
160
-
161
- try:
162
- return loop.run_until_complete(
163
- ScenarioExecutor(self, context, script).run()
164
- )
165
- finally:
166
- loop.close()
167
-
168
- # Run the function in the thread pool and await its result
169
- # This converts the thread's execution into a Future that the current
170
- # event loop can await without blocking
171
- loop = asyncio.get_event_loop()
172
- result = await loop.run_in_executor(executor, run_in_thread)
173
- return result
174
-
175
- @classmethod
176
- def configure(
177
- cls,
178
- testing_agent: Optional[Type[ScenarioAgentAdapter]] = None,
179
- max_turns: Optional[int] = None,
180
- verbose: Optional[Union[bool, int]] = None,
181
- cache_key: Optional[str] = None,
182
- debug: Optional[bool] = None,
183
- ) -> None:
184
- existing_config = getattr(cls, "default_config", ScenarioConfig())
185
-
186
- cls.default_config = existing_config.merge(
187
- ScenarioConfig(
188
- testing_agent=testing_agent,
189
- max_turns=max_turns,
190
- verbose=verbose,
191
- cache_key=cache_key,
192
- debug=debug,
193
- )
194
- )
195
-
196
- # Scenario Scripting
197
-
198
- def message(self, message: ChatCompletionMessageParam) -> ScriptStep:
199
- return lambda state: state.message(message)
200
-
201
- def user(
202
- self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
203
- ) -> ScriptStep:
204
- return lambda state: state.user(content)
205
-
206
- def agent(
207
- self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
208
- ) -> ScriptStep:
209
- return lambda state: state.agent(content)
210
-
211
- def judge(
212
- self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
213
- ) -> ScriptStep:
214
- return lambda state: state.judge(content)
215
-
216
- def proceed(
217
- self,
218
- turns: Optional[int] = None,
219
- on_turn: Optional[
220
- Union[
221
- Callable[[ScenarioExecutor], None],
222
- Callable[[ScenarioExecutor], Awaitable[None]],
223
- ]
224
- ] = None,
225
- on_step: Optional[
226
- Union[
227
- Callable[[ScenarioExecutor], None],
228
- Callable[[ScenarioExecutor], Awaitable[None]],
229
- ]
230
- ] = None,
231
- ) -> ScriptStep:
232
- return lambda state: state.proceed(turns, on_turn, on_step)
233
-
234
- def succeed(self) -> ScriptStep:
235
- return lambda state: state.succeed()
236
-
237
- def fail(self) -> ScriptStep:
238
- return lambda state: state.fail()
@@ -1,16 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import ClassVar, Set
3
-
4
- from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole
5
-
6
-
7
- class ScenarioAgentAdapter(ABC):
8
- roles: ClassVar[Set[ScenarioAgentRole]] = {ScenarioAgentRole.AGENT}
9
-
10
- def __init__(self, input: AgentInput):
11
- super().__init__()
12
- pass
13
-
14
- @abstractmethod
15
- async def call(self, input: AgentInput) -> AgentReturnTypes:
16
- pass