langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch_scenario-0.4.0.dist-info/METADATA +363 -0
- langwatch_scenario-0.4.0.dist-info/RECORD +18 -0
- scenario/__init__.py +230 -6
- scenario/agent_adapter.py +111 -0
- scenario/cache.py +132 -8
- scenario/config.py +165 -10
- scenario/error_messages.py +75 -47
- scenario/judge_agent.py +435 -0
- scenario/pytest_plugin.py +224 -16
- scenario/scenario_executor.py +704 -150
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +269 -0
- scenario/user_simulator_agent.py +249 -0
- scenario/utils.py +398 -5
- langwatch_scenario-0.2.0.dist-info/METADATA +0 -254
- langwatch_scenario-0.2.0.dist-info/RECORD +0 -15
- scenario/result.py +0 -74
- scenario/scenario.py +0 -123
- scenario/testing_agent.py +0 -262
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/top_level.txt +0 -0
langwatch_scenario-0.4.0.dist-info/METADATA
ADDED
@@ -0,0 +1,363 @@
Metadata-Version: 2.4
Name: langwatch-scenario
Version: 0.4.0
Summary: The end-to-end agent testing library
Author-email: LangWatch Team <support@langwatch.ai>
License: MIT
Project-URL: Homepage, https://github.com/langwatch/scenario
Project-URL: Bug Tracker, https://github.com/langwatch/scenario/issues
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: pytest>=8.1.1
Requires-Dist: litellm>=1.49.0
Requires-Dist: python-dotenv>=1.0.1
Requires-Dist: termcolor>=2.4.0
Requires-Dist: pydantic>=2.7.0
Requires-Dist: joblib>=1.4.2
Requires-Dist: wrapt>=1.17.2
Requires-Dist: pytest-asyncio>=0.26.0
Requires-Dist: rich<15.0.0,>=13.3.3
Requires-Dist: pksuid>=1.1.2
Provides-Extra: dev
Requires-Dist: black; extra == "dev"
Requires-Dist: isort; extra == "dev"
Requires-Dist: pytest-cov; extra == "dev"
Requires-Dist: pre-commit; extra == "dev"
Requires-Dist: commitizen; extra == "dev"
Requires-Dist: pyright; extra == "dev"
Requires-Dist: pydantic-ai; extra == "dev"
Requires-Dist: function-schema; extra == "dev"

<div align="center">
  <!-- Discord, PyPI, Docs, etc links -->
</div>

# Scenario

Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

You define the conversation scenario and let it play out; it will keep chatting back and forth with _your_ agent until it reaches the desired goal or detects unexpected behavior, based on the criteria you defined.

- Test your agents' end-to-end conversations with specified scenarios to capture both happy paths and edge cases
- Full flexibility in how much you want to guide the conversation, from fully scripted scenarios to completely automated simulations
- Run evaluations at any point of the conversation, designed for multi-turn
- Works in combination with any testing and LLM evaluation frameworks, completely agnostic
- Works with any LLM and Agent Framework, easy integration

[📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

### See also

- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
- [Scenario Go](https://github.com/langwatch/scenario-go/)

## Example

```python
@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_weather_agent():
    # Integrate with your agent
    class WeatherAgent(scenario.AgentAdapter):
        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
            return weather_agent(input.messages)

    # Define any custom assertions
    def check_for_weather_tool_call(state: scenario.ScenarioState):
        assert state.has_tool_call("get_current_weather")

    # Run the scenario
    result = await scenario.run(
        name="checking the weather",
        description="""
            The user is planning a boat trip from Barcelona to Rome,
            and is wondering what the weather will be like.
        """,
        agents=[
            WeatherAgent(),
            scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
        ],
        script=[
            scenario.user(),
            scenario.agent(),
            check_for_weather_tool_call,  # check for tool call after the first agent response
            scenario.succeed(),
        ],
    )

    # Assert the simulation was successful
    assert result.success
```

> [!NOTE]
> This is a very basic example; keep reading to see how to run a simulation completely script-free, using a Judge Agent to evaluate in real time.

Check out more examples in the [examples folder](./examples/).

## Getting Started

Install pytest and scenario:

```bash
pip install pytest langwatch-scenario
```

Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`, copying the full working example below:

```python
import pytest
import scenario
import litellm

scenario.configure(default_model="openai/gpt-4.1-mini")


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_vegetarian_recipe_agent():
    class Agent(scenario.AgentAdapter):
        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
            return vegetarian_recipe_agent(input.messages)

    # Run a simulation scenario
    result = await scenario.run(
        name="dinner idea",
        description="""
            It's Saturday evening, the user is very hungry and tired,
            but has no money to order out, so they are looking for a recipe.
        """,
        agents=[
            Agent(),
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(
                criteria=[
                    "Agent should not ask more than two follow-up questions",
                    "Agent should generate a recipe",
                    "Recipe should include a list of ingredients",
                    "Recipe should include step-by-step cooking instructions",
                    "Recipe should be vegetarian and not include any sort of meat",
                ]
            ),
        ],
    )

    # Assert for pytest to know whether the test passed
    assert result.success


# Example agent implementation
import litellm


@scenario.cache()
def vegetarian_recipe_agent(messages) -> scenario.AgentReturnTypes:
    response = litellm.completion(
        model="openai/gpt-4.1-mini",
        messages=[
            {
                "role": "system",
                "content": """
                    You are a vegetarian recipe agent.
                    Given the user request, ask AT MOST ONE follow-up question,
                    then provide a complete recipe. Keep your responses concise and focused.
                """,
            },
            *messages,
        ],
    )

    return response.choices[0].message  # type: ignore
```

Create a `.env` file and put your OpenAI API key in it:

```bash
OPENAI_API_KEY=<your-api-key>
```

Now run it with pytest:

```bash
pytest -s tests/test_vegetarian_recipe_agent.py
```

This is what it will look like:

[](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

## Script-free Simulation

If you provide a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.

You can then use a Judge Agent to evaluate the scenario in real time against certain criteria: at every turn, the Judge Agent decides whether to let the simulation proceed or end it with a verdict.

You can combine it with a script, for example to control the beginning of the conversation, or simply let it run scriptless. This is very useful for testing an open-ended case like a vibe coding assistant:

```python
result = await scenario.run(
    name="dog walking startup landing page",
    description="""
        the user wants to create a new landing page for their dog walking startup

        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
    """,
    agents=[
        LovableAgentAdapter(template_path=template_path),
        scenario.UserSimulatorAgent(),
        scenario.JudgeAgent(
            criteria=[
                "agent reads the files before making changes",
                "agent modified the index.css file, not only the Index.tsx file",
                "agent created a comprehensive landing page",
                "agent extended the landing page with a new section",
                "agent should NOT say it can't read the file",
                "agent should NOT produce incomplete code or be too lazy to finish",
            ],
        ),
    ],
    max_turns=5,  # optional
)
```

Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

## Full Control of the Conversation

You can specify a script for guiding the scenario by passing a list of steps to the `script` field. Those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:

- Control what the user says, or let it be generated automatically
- Control what the agent says, or let it be generated automatically
- Add custom assertions, for example making sure a tool was called
- Add a custom evaluation, from an external library
- Let the simulation proceed for a certain number of turns, and evaluate at each new turn
- Trigger the judge agent to decide on a verdict
- Add arbitrary messages like mock tool calls in the middle of the conversation

Everything is possible, using the same simple structure:

```python
@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_ai_assistant_agent():
    scenario = Scenario(
        name="false assumptions",
        description="""
            The agent makes the false assumption that the user is talking about a bank (ATM),
            and the user corrects it that they actually mean river banks.
        """,
        agent=AiAssistantAgentAdapter,
        criteria=[
            "user should get good recommendations on river crossing",
            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
        ],
        max_turns=5,
    )

    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
        assert state.has_tool_call("web_search")

    result = await scenario.script(
        [
            # Define existing history of messages
            scenario.user("how do I safely approach a bank?"),

            # Or let it be generated automatically
            scenario.agent(),

            # Add custom assertions, for example making sure a tool was called
            check_if_tool_was_called,

            # Another user message
            scenario.user(),

            # Let the simulation proceed for 2 more turns, print at every turn
            scenario.proceed(
                turns=2,
                on_turn=lambda state: print(f"Turn {state.current_turn}: {state.messages}"),
            ),

            # Time to make a judgment call
            scenario.judge(),
        ]
    ).run()

    assert result.success
```

## Debug mode

You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.

Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

```python
Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
```

or

```bash
pytest -s tests/test_vegetarian_recipe_agent.py --debug
```
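
The Python snippet above uses the pre-0.4.0 `Scenario.configure`/`TestingAgent` API that still appears in parts of this README. Going by the `scenario.configure` docstring later in this diff, which lists a `debug` parameter, a 0.4.0-style equivalent would presumably be a sketch along these lines (an assumption, not confirmed against the 0.4.0 source):

```python
import scenario

# Hypothetical 0.4.0-style equivalent of the snippet above: enable step-by-step
# debug mode globally. `default_model` and `debug` are taken from the parameters
# documented in scenario.configure's docstring further down this diff.
scenario.configure(default_model="openai/gpt-4o-mini", debug=True)
```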

## Cache

Each time the scenario runs, the testing agent might choose a different input to start. This is good for covering the variance of real users, but we understand that its non-deterministic nature can make tests less repeatable, more costly, and harder to debug. To solve for that, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running; this will make the testing agent give the same input for the same scenario:

```python
Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), cache_key="42")
```
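
This snippet also uses the older configuration API. Since `cache_key` is listed among the documented arguments of both `scenario.configure` and `scenario.run` in the `__init__.py` docstrings below, a per-scenario override in the 0.4.0 style would presumably look like the following sketch, reusing the `Agent` adapter from the earlier recipe example (hypothetical, not confirmed against the 0.4.0 source):

```python
import pytest
import scenario


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_dinner_idea_cached():
    # Hypothetical 0.4.0-style sketch: pin the cache key for this one scenario run so
    # the simulated user produces the same inputs across runs. The cache_key argument
    # is taken from the scenario.run docstring further down this diff.
    result = await scenario.run(
        name="dinner idea (cached)",
        description="The user is looking for a quick vegetarian recipe.",
        agents=[
            Agent(),  # the agent adapter defined in the earlier recipe example
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent should generate a recipe"]),
        ],
        cache_key="42",
    )
    assert result.success
```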

To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.

To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions on your application side with the `@scenario_cache` decorator:

```python
class MyAgent:
    @scenario_cache(ignore=["self"])
    def invoke(self, message, context):
        return client.chat.completions.create(
            # ...
        )
```

This will cache any function call you decorate when running the tests and make them repeatable, hashed by the function arguments, the scenario being executed, and the `cache_key` you provided. You can exclude arguments that should not be hashed for the cache key by naming them in the `ignore` argument.

## Disable Output

You can remove the `-s` flag from pytest to hide the output during the test; it will then only show up if the test fails. Alternatively, you can set `verbose=False` in the `Scenario.configure` method or in the specific scenario you are running.
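
For the 0.4.0 entry points, `verbose` is listed among `scenario.configure`'s documented parameters, so silencing the output globally would presumably be a one-liner along these lines (a sketch, not confirmed against the 0.4.0 source):

```python
import scenario

# Hypothetical sketch: turn off scenario's verbose output for all runs.
# `verbose` is taken from the parameters documented in scenario.configure's docstring.
scenario.configure(default_model="openai/gpt-4.1-mini", verbose=False)
```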

## Running in parallel

As the number of your scenarios grows, you might want to run them in parallel to speed up your whole test suite. We suggest using the [pytest-asyncio-concurrent](https://pypi.org/project/pytest-asyncio-concurrent/) plugin to do so.

Simply install the plugin from the link above, then replace the `@pytest.mark.asyncio` annotation in the tests with `@pytest.mark.asyncio_concurrent`, adding a group name to it to mark the group of scenarios that should be run in parallel together, e.g.:

```python
@pytest.mark.agent_test
@pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
async def test_vegetarian_recipe_agent():
    # ...

@pytest.mark.agent_test
@pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
async def test_user_is_very_hungry():
    # ...
```

Those two scenarios should now run in parallel.

## License

MIT License
langwatch_scenario-0.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
scenario/__init__.py,sha256=oMh5le4c4sIN2K1Ylv2xnkyKHpcOzBeqvW58fTWAFlU,7794
scenario/agent_adapter.py,sha256=pd3BdNUWna8h_9hykn1FvcyareMzUofQKKvXaAfQluY,4338
scenario/cache.py,sha256=iPpMmjKruLnnxCeLnRiQjiH89LhcVIfQQXKH5etU_m4,6217
scenario/config.py,sha256=AeDbKE-_Rrxkan64tDDDynaSNyijoIKHxWaRMqGd4oY,6121
scenario/error_messages.py,sha256=6lEx3jBGMbPx0kG0eX5zoZE-ENVM3O_ZkIbVMlnidYs,3892
scenario/judge_agent.py,sha256=7fKK_oevXzWKXDioBjHzgGSDpS0aby3oRcrc6oaip68,16973
scenario/pytest_plugin.py,sha256=s2M2mll9JSCSWB5SKDQIWT5DOCvzZOo_8JCCfJzyy8k,12849
scenario/scenario_executor.py,sha256=oz7Odv41HNLcNd_7sKUW-AKKdY-on_PyVLaxpvKjrGE,27211
scenario/scenario_state.py,sha256=I_fWoY_LvNuKCBL-b62z5bQOAI25dx55FuZNWwtIeVs,7075
scenario/script.py,sha256=7wsHZxdSgFaYLflkV6sysDxefkkag79mySR7yp7N3ug,12278
scenario/types.py,sha256=CsexCupg2WUi4dToYF5RqFdNIHx1JhaRaRRBs78YVd0,9498
scenario/user_simulator_agent.py,sha256=o8sZLMWOcTf7BKgPO_a5rPnC6GgdZQe3HujqwjPzjV8,9346
scenario/utils.py,sha256=ryJYcMoSAjVzA_f5V6Mcga5GkipYbCzaYNNpBjAQI_g,16992
langwatch_scenario-0.4.0.dist-info/METADATA,sha256=d9tNTNioHH5_1q8oIvIABaTgC6J9XmEJR4Tjim3sFks,13827
langwatch_scenario-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
langwatch_scenario-0.4.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
langwatch_scenario-0.4.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
langwatch_scenario-0.4.0.dist-info/RECORD,,
scenario/__init__.py
CHANGED
@@ -1,24 +1,248 @@
 """
-Scenario:
+Scenario: Agent Testing Framework through Simulation Testing
+
+Scenario is a comprehensive testing framework for AI agents that uses simulation testing
+to validate agent behavior through realistic conversations. It enables testing of both
+happy paths and edge cases by simulating user interactions and evaluating agent responses
+against configurable success criteria.
+
+Key Features:
+- End-to-end conversation testing with specified scenarios
+- Flexible control from fully scripted to completely automated simulations
+- Multi-turn evaluation designed for complex conversational agents
+- Works with any testing framework (pytest, unittest, etc.)
+- Framework-agnostic integration with any LLM or agent architecture
+- Built-in caching for deterministic and faster test execution
+
+Basic Usage:
+```python
+import scenario
+
+# Configure global settings
+scenario.configure(default_model="openai/gpt-4.1-mini")
+
+# Create your agent adapter
+class MyAgent(scenario.AgentAdapter):
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        return my_agent_function(input.last_new_user_message_str())
+
+# Run a scenario test
+result = await scenario.run(
+    name="customer service test",
+    description="Customer asks about billing, agent should help politely",
+    agents=[
+        MyAgent(),
+        scenario.UserSimulatorAgent(),
+        scenario.JudgeAgent(criteria=[
+            "Agent is polite and professional",
+            "Agent addresses the billing question",
+            "Agent provides clear next steps"
+        ])
+    ]
+)
+
+assert result.success
+```
+
+Advanced Usage:
+```python
+# Script-controlled scenario with custom evaluations
+def check_tool_usage(state: scenario.ScenarioState) -> None:
+    assert state.has_tool_call("get_customer_info")
+
+result = await scenario.run(
+    name="scripted interaction",
+    description="Test specific conversation flow",
+    agents=[
+        MyAgent(),
+        scenario.UserSimulatorAgent(),
+        scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+    ],
+    script=[
+        scenario.user("I have a billing question"),
+        scenario.agent(),
+        check_tool_usage,  # Custom assertion
+        scenario.proceed(turns=2),  # Let it continue automatically
+        scenario.succeed("All requirements met")
+    ]
+)
+```
+
+Integration with Testing Frameworks:
+```python
+import pytest
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_weather_agent():
+    result = await scenario.run(
+        name="weather query",
+        description="User asks about weather in a specific city",
+        agents=[
+            WeatherAgent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=["Provides accurate weather information"])
+        ]
+    )
+    assert result.success
+```
+
+For more examples and detailed documentation, visit: https://github.com/langwatch/scenario
 """

 # First import non-dependent modules
-from .
+from .types import ScenarioResult, AgentInput, AgentRole, AgentReturnTypes
 from .config import ScenarioConfig

 # Then import modules with dependencies
-from .
-from .
+from .scenario_executor import ScenarioExecutor
+from .scenario_state import ScenarioState
+from .agent_adapter import AgentAdapter
+from .judge_agent import JudgeAgent
+from .user_simulator_agent import UserSimulatorAgent
 from .cache import scenario_cache
+from .script import message, user, agent, judge, proceed, succeed, fail

 # Import pytest plugin components
 from .pytest_plugin import pytest_configure, scenario_reporter

+run = ScenarioExecutor.run
+"""
+High-level interface for running scenario tests.
+
+This is the main entry point for executing scenario-based agent tests. It creates
+and runs a complete scenario simulation including user interactions, agent responses,
+and success evaluation.
+
+Args:
+    name: Human-readable name for the scenario
+    description: Detailed description that guides the simulation behavior
+    agents: List of agent adapters (agent under test, user simulator, judge)
+    max_turns: Maximum conversation turns before timeout (default: 10)
+    verbose: Show detailed output during execution
+    cache_key: Cache key for deterministic behavior across runs
+    debug: Enable debug mode for step-by-step execution
+    script: Optional script steps to control scenario flow
+
+Returns:
+    ScenarioResult containing test outcome, conversation history, and detailed analysis
+
+Example:
+    ```python
+    result = await scenario.run(
+        name="help request",
+        description="User needs help with a technical problem",
+        agents=[
+            MyAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=["Provides helpful response"])
+        ]
+    )
+
+    print(f"Test {'PASSED' if result.success else 'FAILED'}")
+    print(f"Reasoning: {result.reasoning}")
+    ```
+"""
+
+configure = ScenarioConfig.configure
+"""
+Set global configuration settings for all scenario executions.
+
+This function allows you to configure default behavior that will be applied
+to all scenarios unless explicitly overridden in individual scenario runs.
+
+Args:
+    default_model: Default LLM model identifier for user simulator and judge agents
+    max_turns: Maximum number of conversation turns before timeout (default: 10)
+    verbose: Enable verbose output during scenario execution
+    cache_key: Cache key for deterministic scenario behavior across runs
+    debug: Enable debug mode for step-by-step execution with user intervention
+
+Example:
+    ```python
+    # Set up global defaults
+    scenario.configure(
+        default_model="openai/gpt-4.1-mini",
+        max_turns=15,
+        verbose=True,
+        cache_key="my-test-suite-v1"
+    )
+
+    # All subsequent scenarios will use these defaults
+    result = await scenario.run(...)
+    ```
+"""
+
+default_config = ScenarioConfig.default_config
+"""
+Access to the current global configuration settings.
+
+This provides read-only access to the default configuration that has been
+set via scenario.configure(). Useful for debugging or conditional logic
+based on current settings.
+
+Example:
+    ```python
+    if scenario.default_config and scenario.default_config.debug:
+        print("Debug mode is enabled")
+    ```
+"""
+
+cache = scenario_cache
+"""
+Decorator for caching function calls during scenario execution.
+
+This decorator enables deterministic testing by caching LLM calls and other
+non-deterministic operations based on scenario configuration and function arguments.
+Results are cached when a cache_key is configured, making tests repeatable and faster.
+
+Args:
+    ignore: List of argument names to exclude from cache key computation
+
+Example:
+    ```python
+    class MyAgent:
+        @scenario.cache(ignore=["self"])
+        def invoke(self, message: str) -> str:
+            # This LLM call will be cached when cache_key is set
+            return llm_client.complete(model="gpt-4", prompt=message)
+
+    # Enable caching for deterministic tests
+    scenario.configure(cache_key="test-suite-v1")
+    ```
+"""
+
 __all__ = [
-
-"
+    # Functions
+    "run",
+    "configure",
+    "default_config",
+    "cache",
+
+    # Script
+    "message",
+    "proceed",
+    "succeed",
+    "fail",
+    "judge",
+    "agent",
+    "user",
+
+    # Types
     "ScenarioResult",
+    "AgentInput",
+    "AgentRole",
     "ScenarioConfig",
+    "AgentReturnTypes",
+
+    # Classes
+    "ScenarioExecutor",
+    "ScenarioState",
+    "AgentAdapter",
+    "UserSimulatorAgent",
+    "JudgeAgent",
+
+    # Plugins
     "pytest_configure",
     "scenario_reporter",
     "scenario_cache",