langwatch-scenario 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/PKG-INFO +93 -71
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/README.md +87 -70
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/PKG-INFO +93 -71
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/SOURCES.txt +13 -2
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/requires.txt +5 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/pyproject.toml +9 -3
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/__init__.py +11 -114
- langwatch_scenario-0.6.0/scenario/_utils/__init__.py +32 -0
- langwatch_scenario-0.6.0/scenario/_utils/ids.py +58 -0
- langwatch_scenario-0.6.0/scenario/_utils/message_conversion.py +103 -0
- {langwatch_scenario-0.4.0/scenario → langwatch_scenario-0.6.0/scenario/_utils}/utils.py +21 -110
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/agent_adapter.py +8 -4
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/cache.py +4 -3
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/config.py +7 -5
- langwatch_scenario-0.6.0/scenario/events/__init__.py +66 -0
- langwatch_scenario-0.6.0/scenario/events/event_bus.py +175 -0
- langwatch_scenario-0.6.0/scenario/events/event_reporter.py +83 -0
- langwatch_scenario-0.6.0/scenario/events/events.py +169 -0
- langwatch_scenario-0.6.0/scenario/events/messages.py +84 -0
- langwatch_scenario-0.6.0/scenario/events/utils.py +86 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/judge_agent.py +7 -28
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/pytest_plugin.py +2 -47
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/scenario_executor.py +268 -84
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/scenario_state.py +6 -6
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/script.py +9 -9
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/types.py +10 -6
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/scenario/user_simulator_agent.py +4 -11
- langwatch_scenario-0.6.0/tests/test_event_reporter.py +53 -0
- langwatch_scenario-0.6.0/tests/test_scenario_event_bus.py +186 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/tests/test_scenario_executor.py +28 -1
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
- /langwatch_scenario-0.4.0/scenario/error_messages.py → /langwatch_scenario-0.6.0/scenario/_error_messages.py +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/setup.cfg +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/setup.py +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/tests/test_scenario.py +0 -0
- {langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/tests/test_scenario_agent.py +0 -0
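
Beyond the version bump, the listing above already signals two things for anyone importing library internals: `scenario/utils.py` and `scenario/error_messages.py` now live under private names (`scenario/_utils/` and `scenario/_error_messages.py`), and a new `scenario/events/` package ships in 0.6.0. Below is a minimal post-upgrade sanity check, sketched only from the file listing above; nothing in it is an official API of the package.

```python
# Post-upgrade sanity check, sketched from the file listing above (illustrative only).
import importlib
import importlib.metadata

# The installed distribution should report the new version.
assert importlib.metadata.version("langwatch-scenario") == "0.6.0"

# New in 0.6.0 per the listing: the events package.
importlib.import_module("scenario.events")

# Renamed to private modules in 0.6.0, so 0.4.0-era imports are expected to fail.
for legacy_module in ("scenario.utils", "scenario.error_messages"):
    try:
        importlib.import_module(legacy_module)
        print(f"note: {legacy_module} is still importable")  # not expected from the listing
    except ModuleNotFoundError:
        pass  # matches the rename to scenario._utils / scenario._error_messages
```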

{langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.4.0
+Version: 0.6.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -26,6 +26,11 @@ Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
 Requires-Dist: rich<15.0.0,>=13.3.3
 Requires-Dist: pksuid>=1.1.2
+Requires-Dist: pdoc3>=0.11.6
+Requires-Dist: ag-ui-protocol>=0.1.0
+Requires-Dist: httpx>=0.27.0
+Requires-Dist: rx>=3.2.0
+Requires-Dist: respx>=0.22.0
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
@@ -44,65 +49,59 @@ Requires-Dist: function-schema; extra == "dev"

 # Scenario

-Scenario is an Agent Testing Framework
+Scenario is an Agent Testing Framework based on simulations, it can:

-
-
--
--
--
-- Works in combination with any testing and LLM evaluation frameworks, completely agnostic
-- Works with any LLM and Agent Framework, easy integration
+- Test real agent behavior by simulating users in different scenarios and edge cases
+- Evaluate and judge at any point of the conversation, powerful multi-turn control
+- Combine it with any LLM eval framework or custom evals, agnostic by design
+- Integrate your Agent by implementing just one `call()` method
+- Available in Python, TypeScript and Go

 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

-###
+### In other languages

 - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
 - [Scenario Go](https://github.com/langwatch/scenario-go/)

 ## Example

+This is how a simple simulation with tool check looks like with Scenario:
+
 ```python
-
-
-
-# Integrate with your agent
-class WeatherAgent(scenario.AgentAdapter):
-    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
-        return weather_agent(input.messages)
+# Define any custom assertions
+def check_for_weather_tool_call(state: scenario.ScenarioState):
+    assert state.has_tool_call("get_current_weather")

-
-
-    assert state.has_tool_call("get_current_weather")
+result = await scenario.run(
+    name="checking the weather",

-    #
-
-
-
-
-        and is wondering what the weather will be like.
-    """,
-    agents=[
-        WeatherAgent(),
-        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
-    ],
-    script=[
-        scenario.user(),
-        scenario.agent(),
-        check_for_weather_tool_call, # check for tool call after the first agent response
-        scenario.succeed(),
-    ],
-)
+    # Define the prompt to guide the simulation
+    description="""
+        The user is planning a boat trip from Barcelona to Rome,
+        and is wondering what the weather will be like.
+    """,

-    #
-
+    # Define the agents that will play this simulation
+    agents=[
+        WeatherAgent(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+    ],
+
+    # (Optional) Control the simulation
+    script=[
+        scenario.user(), # let the user simulator generate a user message
+        scenario.agent(), # agent responds
+        check_for_weather_tool_call, # check for tool call after the first agent response
+        scenario.succeed(), # simulation ends successfully
+    ],
+)
+
+assert result.success
 ```

 > [!NOTE]
->
-
-Check out more examples in the [examples folder](./examples/).
+> Check out full examples in the [examples folder](./examples/).

 ## Getting Started

@@ -193,17 +192,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py

 This is how it will look like:

-[](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

 You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

-##
+## Simulation on Autopilot

-By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.
+By providing a User Simulator Agent and a description of the Scenario without a script, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.

 You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.

-
+For example, here is a scenario that tests a vibe coding assistant:

 ```python
 result = await scenario.run(
@@ -233,6 +232,8 @@ result = await scenario.run(

 Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+You can also combine it with a partial script too! By for example controlling only the beginning of the conversation, and let the rest proceed on autopilot, see the next section.
+
 ## Full Control of the Conversation

 You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:
@@ -250,35 +251,35 @@ Everything is possible, using the same simple structure:
 ```python
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def test_ai_assistant_agent():
-
-    name="
+async def test_early_assumption_bias():
+    result = await scenario.run(
+        name="early assumption bias",
         description="""
             The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
         """,
-
-
-
-
+        agents=[
+            Agent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "user should get good recommendations on river crossing",
+                    "agent should NOT keep following up about ATM recommendation after user has corrected them that they are actually just hiking",
+                ],
+            ),
         ],
-        max_turns=
-
-
-
-    assert state.has_tool_call("web_search")
-
-    result = await scenario.script(
-        [
-            # Define existing history of messages
+        max_turns=10,
+        script=[
+            # Define hardcoded messages
+            scenario.agent("Hello, how can I help you today?"),
             scenario.user("how do I safely approach a bank?"),

-            # Or let it be
+            # Or let it be generated automatically
             scenario.agent(),

             # Add custom assertions, for example making sure a tool was called
             check_if_tool_was_called,

-            #
+            # Generate a user follow-up message
             scenario.user(),

             # Let the simulation proceed for 2 more turns, print at every turn
@@ -289,8 +290,8 @@ async def test_ai_assistant_agent():

             # Time to make a judgment call
             scenario.judge(),
-        ]
-    )
+        ],
+    )

     assert result.success
 ```
@@ -302,7 +303,7 @@ You can enable debug mode by setting the `debug` field to `True` in the `Scenari
 Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", debug=True)
 ```

 or
@@ -316,16 +317,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py --debug
 Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:

 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", cache_key="42")
 ```

 To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.

-To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@
+To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@scenario.cache` decorator:

 ```python
+# Inside your actual agent implementation
 class MyAgent:
-    @
+    @scenario.cache()
     def invoke(self, message, context):
         return client.chat.completions.create(
             # ...
@@ -358,6 +360,26 @@ async def test_user_is_very_hungry():

 Those two scenarios should now run in parallel.

+## Events System
+
+Scenario automatically publishes events during execution for monitoring and observability. You can enable event reporting by setting environment variables:
+
+```bash
+# Enable automatic event reporting
+export LANGWATCH_ENDPOINT="https://api.langwatch.ai"
+export LANGWATCH_API_KEY="your-api-key"
+```
+
+With these variables set, Scenario will automatically:
+
+- Publish events when scenarios start, finish, and when messages are added
+- Handle retries and error handling automatically
+- Process events asynchronously without blocking your tests
+
+The events include timing information, conversation history, and success/failure metrics for analysis.
+
+For advanced customization, see the event classes in the codebase for detailed documentation.
+
 ## License

 MIT License
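
The largest documentation addition in this diff is the "Events System" section, which is driven entirely by two environment variables. Here is a small sketch of setting them for a pytest session, assuming only the variable names shown in the README above; the fixture itself is illustrative, not part of the library.

```python
# conftest.py — illustrative sketch; only LANGWATCH_ENDPOINT and LANGWATCH_API_KEY
# come from the "Events System" section above, the rest is plain pytest/os usage.
import os

import pytest


@pytest.fixture(scope="session", autouse=True)
def enable_scenario_event_reporting():
    # Mirror the README's shell exports; replace the key with a real one.
    os.environ.setdefault("LANGWATCH_ENDPOINT", "https://api.langwatch.ai")
    os.environ.setdefault("LANGWATCH_API_KEY", "your-api-key")
    yield
```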

{langwatch_scenario-0.4.0 → langwatch_scenario-0.6.0}/README.md

@@ -6,65 +6,59 @@

 # Scenario

-Scenario is an Agent Testing Framework
+Scenario is an Agent Testing Framework based on simulations, it can:

-
-
--
--
--
-- Works in combination with any testing and LLM evaluation frameworks, completely agnostic
-- Works with any LLM and Agent Framework, easy integration
+- Test real agent behavior by simulating users in different scenarios and edge cases
+- Evaluate and judge at any point of the conversation, powerful multi-turn control
+- Combine it with any LLM eval framework or custom evals, agnostic by design
+- Integrate your Agent by implementing just one `call()` method
+- Available in Python, TypeScript and Go

 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

-###
+### In other languages

 - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
 - [Scenario Go](https://github.com/langwatch/scenario-go/)

 ## Example

+This is how a simple simulation with tool check looks like with Scenario:
+
 ```python
-
-
-
-# Integrate with your agent
-class WeatherAgent(scenario.AgentAdapter):
-    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
-        return weather_agent(input.messages)
+# Define any custom assertions
+def check_for_weather_tool_call(state: scenario.ScenarioState):
+    assert state.has_tool_call("get_current_weather")

-
-
-    assert state.has_tool_call("get_current_weather")
+result = await scenario.run(
+    name="checking the weather",

-    #
-
-
-
-
-        and is wondering what the weather will be like.
-    """,
-    agents=[
-        WeatherAgent(),
-        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
-    ],
-    script=[
-        scenario.user(),
-        scenario.agent(),
-        check_for_weather_tool_call, # check for tool call after the first agent response
-        scenario.succeed(),
-    ],
-)
+    # Define the prompt to guide the simulation
+    description="""
+        The user is planning a boat trip from Barcelona to Rome,
+        and is wondering what the weather will be like.
+    """,

-    #
-
+    # Define the agents that will play this simulation
+    agents=[
+        WeatherAgent(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+    ],
+
+    # (Optional) Control the simulation
+    script=[
+        scenario.user(), # let the user simulator generate a user message
+        scenario.agent(), # agent responds
+        check_for_weather_tool_call, # check for tool call after the first agent response
+        scenario.succeed(), # simulation ends successfully
+    ],
+)
+
+assert result.success
 ```

 > [!NOTE]
->
-
-Check out more examples in the [examples folder](./examples/).
+> Check out full examples in the [examples folder](./examples/).

 ## Getting Started

@@ -155,17 +149,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py

 This is how it will look like:

-[](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

 You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

-##
+## Simulation on Autopilot

-By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.
+By providing a User Simulator Agent and a description of the Scenario without a script, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.

 You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.

-
+For example, here is a scenario that tests a vibe coding assistant:

 ```python
 result = await scenario.run(
@@ -195,6 +189,8 @@ result = await scenario.run(

 Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+You can also combine it with a partial script too! By for example controlling only the beginning of the conversation, and let the rest proceed on autopilot, see the next section.
+
 ## Full Control of the Conversation

 You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:
@@ -212,35 +208,35 @@ Everything is possible, using the same simple structure:
 ```python
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def test_ai_assistant_agent():
-
-    name="
+async def test_early_assumption_bias():
+    result = await scenario.run(
+        name="early assumption bias",
         description="""
             The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
         """,
-
-
-
-
+        agents=[
+            Agent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "user should get good recommendations on river crossing",
+                    "agent should NOT keep following up about ATM recommendation after user has corrected them that they are actually just hiking",
+                ],
+            ),
         ],
-        max_turns=
-
-
-
-    assert state.has_tool_call("web_search")
-
-    result = await scenario.script(
-        [
-            # Define existing history of messages
+        max_turns=10,
+        script=[
+            # Define hardcoded messages
+            scenario.agent("Hello, how can I help you today?"),
             scenario.user("how do I safely approach a bank?"),

-            # Or let it be
+            # Or let it be generated automatically
             scenario.agent(),

             # Add custom assertions, for example making sure a tool was called
             check_if_tool_was_called,

-            #
+            # Generate a user follow-up message
             scenario.user(),

             # Let the simulation proceed for 2 more turns, print at every turn
@@ -251,8 +247,8 @@ async def test_ai_assistant_agent():

             # Time to make a judgment call
             scenario.judge(),
-        ]
-    )
+        ],
+    )

     assert result.success
 ```
@@ -264,7 +260,7 @@ You can enable debug mode by setting the `debug` field to `True` in the `Scenari
 Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", debug=True)
 ```

 or
@@ -278,16 +274,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py --debug
 Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:

 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", cache_key="42")
 ```

 To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.

-To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@
+To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@scenario.cache` decorator:

 ```python
+# Inside your actual agent implementation
 class MyAgent:
-    @
+    @scenario.cache()
     def invoke(self, message, context):
         return client.chat.completions.create(
             # ...
@@ -320,6 +317,26 @@ async def test_user_is_very_hungry():

 Those two scenarios should now run in parallel.

+## Events System
+
+Scenario automatically publishes events during execution for monitoring and observability. You can enable event reporting by setting environment variables:
+
+```bash
+# Enable automatic event reporting
+export LANGWATCH_ENDPOINT="https://api.langwatch.ai"
+export LANGWATCH_API_KEY="your-api-key"
+```
+
+With these variables set, Scenario will automatically:
+
+- Publish events when scenarios start, finish, and when messages are added
+- Handle retries and error handling automatically
+- Process events asynchronously without blocking your tests
+
+The events include timing information, conversation history, and success/failure metrics for analysis.
+
+For advanced customization, see the event classes in the codebase for detailed documentation.
+
 ## License

 MIT License
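
Read together, the README changes above describe one test shape for 0.6.0. The sketch below stitches those snippets into a single pytest file, assuming the API exactly as shown in the diff; the test name is arbitrary and `weather_agent` stands in for your own agent implementation.

```python
import pytest
import scenario

# Configuration values taken from the README snippets above.
scenario.configure(default_model="openai/gpt-4.1-mini", cache_key="42")


class WeatherAgent(scenario.AgentAdapter):
    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
        # `weather_agent` is your own agent entry point, not provided by the library.
        return weather_agent(input.messages)


def check_for_weather_tool_call(state: scenario.ScenarioState):
    assert state.has_tool_call("get_current_weather")


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_checking_the_weather():
    result = await scenario.run(
        name="checking the weather",
        description="""
            The user is planning a boat trip from Barcelona to Rome,
            and is wondering what the weather will be like.
        """,
        agents=[
            WeatherAgent(),
            scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
        ],
        script=[
            scenario.user(),
            scenario.agent(),
            check_for_weather_tool_call,
            scenario.succeed(),
        ],
    )
    assert result.success
```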