langwatch-scenario 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/PKG-INFO +140 -79
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/README.md +136 -78
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/PKG-INFO +140 -79
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/SOURCES.txt +5 -3
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/requires.txt +3 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/pyproject.toml +5 -1
- langwatch_scenario-0.4.0/scenario/__init__.py +250 -0
- langwatch_scenario-0.4.0/scenario/agent_adapter.py +111 -0
- langwatch_scenario-0.4.0/scenario/cache.py +186 -0
- langwatch_scenario-0.4.0/scenario/config.py +183 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/scenario/error_messages.py +8 -38
- langwatch_scenario-0.4.0/scenario/judge_agent.py +435 -0
- langwatch_scenario-0.4.0/scenario/pytest_plugin.py +385 -0
- langwatch_scenario-0.4.0/scenario/scenario_executor.py +758 -0
- langwatch_scenario-0.4.0/scenario/scenario_state.py +205 -0
- langwatch_scenario-0.4.0/scenario/script.py +361 -0
- langwatch_scenario-0.4.0/scenario/types.py +269 -0
- langwatch_scenario-0.4.0/scenario/user_simulator_agent.py +249 -0
- langwatch_scenario-0.4.0/scenario/utils.py +514 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario.py +177 -192
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario_agent.py +7 -8
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario_executor.py +103 -63
- langwatch_scenario-0.3.0/scenario/__init__.py +0 -36
- langwatch_scenario-0.3.0/scenario/cache.py +0 -62
- langwatch_scenario-0.3.0/scenario/config.py +0 -39
- langwatch_scenario-0.3.0/scenario/pytest_plugin.py +0 -177
- langwatch_scenario-0.3.0/scenario/scenario.py +0 -238
- langwatch_scenario-0.3.0/scenario/scenario_agent_adapter.py +0 -16
- langwatch_scenario-0.3.0/scenario/scenario_executor.py +0 -466
- langwatch_scenario-0.3.0/scenario/testing_agent.py +0 -279
- langwatch_scenario-0.3.0/scenario/types.py +0 -96
- langwatch_scenario-0.3.0/scenario/utils.py +0 -264
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/setup.cfg +0 -0
- {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/setup.py +0 -0
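The module changes above (scenario/testing_agent.py and scenario/scenario_agent_adapter.py removed; scenario/agent_adapter.py, scenario/judge_agent.py and scenario/user_simulator_agent.py added) correspond to the API rename visible in the README diff below. As a rough orientation, here is a minimal before/after sketch assembled from the README snippets in this diff; the placeholder `my_agent` function and its return value are illustrative assumptions, not part of the package's documented API.

```python
import pytest
import scenario


# 0.3.0 (removed lines below):
#   Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
# 0.4.0 (added lines below):
scenario.configure(default_model="openai/gpt-4.1-mini")


def my_agent(messages):
    # Placeholder for your own agent; in the README examples this is a litellm call.
    return "Hello! How can I help you today?"


# 0.3.0: subclass ScenarioAgentAdapter and wire your agent up in __init__
# 0.4.0: subclass scenario.AgentAdapter and implement `call`
class MyAgent(scenario.AgentAdapter):
    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
        return my_agent(input.messages)


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_migration_sketch():
    # 0.3.0: scenario = Scenario(name=..., description=..., agent=..., criteria=[...])
    #        followed by `result = await scenario.run()`
    # 0.4.0: a single scenario.run() call, with simulator and judge passed explicitly
    result = await scenario.run(
        name="migration sketch",
        description="The user asks the agent a simple question.",
        agents=[
            MyAgent(),
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent answers the user's question"]),
        ],
    )
    assert result.success
```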
--- langwatch_scenario-0.3.0/PKG-INFO
+++ langwatch_scenario-0.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.3.0
+Version: 0.4.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -32,6 +32,9 @@ Requires-Dist: isort; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: commitizen; extra == "dev"
+Requires-Dist: pyright; extra == "dev"
+Requires-Dist: pydantic-ai; extra == "dev"
+Requires-Dist: function-schema; extra == "dev"
 
 
 
@@ -39,11 +42,17 @@ Requires-Dist: commitizen; extra == "dev"
 <!-- Discord, PyPI, Docs, etc links -->
 </div>
 
-# Scenario
+# Scenario
 
 Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.
 
-You define the
+You define the conversation scenario and let it play out, it will keep chatting back and forth with _your_ agent until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
+
+- Test your agents end-to-end conversations with specified scenarios to capture both happy paths and edge cases
+- Full flexibility of how much you want to guide the conversation, from fully scripted scenarios to completely automated simulations
+- Run evaluations at any point of the conversation, designed for multi-turn
+- Works in combination with any testing and LLM evaluation frameworks, completely agnostic
+- Works with any LLM and Agent Framework, easy integration
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
@@ -52,6 +61,49 @@ You define the scenarios, and the testing agent will simulate a real user as it
 - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
 - [Scenario Go](https://github.com/langwatch/scenario-go/)
 
+## Example
+
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_weather_agent():
+    # Integrate with your agent
+    class WeatherAgent(scenario.AgentAdapter):
+        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+            return weather_agent(input.messages)
+
+    # Define any custom assertions
+    def check_for_weather_tool_call(state: scenario.ScenarioState):
+        assert state.has_tool_call("get_current_weather")
+
+    # Run the scenario
+    result = await scenario.run(
+        name="checking the weather",
+        description="""
+            The user is planning a boat trip from Barcelona to Rome,
+            and is wondering what the weather will be like.
+        """,
+        agents=[
+            WeatherAgent(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+        ],
+        script=[
+            scenario.user(),
+            scenario.agent(),
+            check_for_weather_tool_call, # check for tool call after the first agent response
+            scenario.succeed(),
+        ],
+    )
+
+    # Assert the simulation was successful
+    assert result.success
+```
+
+> [!NOTE]
+> This is a very basic example, keep reading to see how to run a simulation completely script-free, using a Judge Agent to evaluate in real-time.
+
+Check out more examples in the [examples folder](./examples/).
+
 ## Getting Started
 
 Install pytest and scenario:
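The weather example added above calls a `weather_agent(messages)` helper and then asserts that a `get_current_weather` tool call was made, but the helper itself is not part of this diff. A hypothetical sketch of such a helper using litellm tool calling, mirroring the vegetarian recipe example later in this diff that returns the raw assistant message; the tool schema and stub below are invented for illustration:

```python
import json

import litellm


def get_current_weather(city: str) -> str:
    # Invented stub: a real agent would call a weather API here.
    return json.dumps({"city": city, "forecast": "sunny", "temperature_c": 24})


def weather_agent(messages):
    # Hypothetical helper matching `weather_agent(input.messages)` in the example above.
    response = litellm.completion(
        model="openai/gpt-4.1-mini",
        messages=[
            {"role": "system", "content": "You are a helpful weather assistant."},
            *messages,
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_current_weather",
                    "description": "Get the current weather for a given city",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
    )
    # Returning the assistant message (including any tool calls) lets the scenario's
    # `state.has_tool_call("get_current_weather")` assertion observe the call.
    return response.choices[0].message  # type: ignore
```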
@@ -60,51 +112,45 @@ Install pytest and scenario:
 pip install pytest langwatch-scenario
 ```
 
-Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py
+Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`, copy the full working example below:
 
 ```python
 import pytest
+import scenario
+import litellm
 
-
-
-Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
-
-
-# Create an adapter to call your agent
-class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
-    def __init__(self, input: AgentInput):
-        self.agent = VegetarianRecipeAgent()
-
-    async def call(self, input: AgentInput) -> AgentReturnTypes:
-        return self.agent.run(input.last_new_user_message_str())
+scenario.configure(default_model="openai/gpt-4.1-mini")
 
 
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-
-
+    class Agent(scenario.AgentAdapter):
+        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+            return vegetarian_recipe_agent(input.messages)
+
+    # Run a simulation scenario
+    result = await scenario.run(
         name="dinner idea",
         description="""
             It's saturday evening, the user is very hungry and tired,
             but have no money to order out, so they are looking for a recipe.
-
-            The user never mentions they want a vegetarian recipe.
         """,
-
-
-
-
-
-
-
-
+        agents=[
+            Agent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent should not ask more than two follow-up questions",
+                    "Agent should generate a recipe",
+                    "Recipe should include a list of ingredients",
+                    "Recipe should include step-by-step cooking instructions",
+                    "Recipe should be vegetarian and not include any sort of meat",
+                ]
+            ),
         ],
     )
 
-    # Run the scenario and get results
-    result = await scenario.run()
-
     # Assert for pytest to know whether the test passed
     assert result.success
 
@@ -113,33 +159,24 @@ async def test_vegetarian_recipe_agent():
 import litellm
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    then provide a complete recipe. Keep your responses concise and focused.
-                """,
-                },
-                *self.history,
-            ],
-        )
-        message = response.choices[0].message  # type: ignore
-        self.history.append(message)
-
-        return [message]
+@scenario.cache()
+def vegetarian_recipe_agent(messages) -> scenario.AgentReturnTypes:
+    response = litellm.completion(
+        model="openai/gpt-4.1-mini",
+        messages=[
+            {
+                "role": "system",
+                "content": """
+                    You are a vegetarian recipe agent.
+                    Given the user request, ask AT MOST ONE follow-up question,
+                    then provide a complete recipe. Keep your responses concise and focused.
+                """,
+            },
+            *messages,
+        ],
+    )
 
+    return response.choices[0].message  # type: ignore
 ```
 
 Create a `.env` file and put your OpenAI API key in it:
@@ -158,42 +195,57 @@ This is how it will look like:
 
 [](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)
 
-You can find
+You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
 
-##
+## Script-free Simulation
 
-
+By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.
 
-
+You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.
+
+You can combine it with a script, to control for example the beginning of the conversation, or simply let it run scriptless, this is very useful to test an open case like a vibe coding assistant:
 
 ```python
-
+result = await scenario.run(
     name="dog walking startup landing page",
     description="""
         the user wants to create a new landing page for their dog walking startup
 
        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
     """,
-
-
-
-
-
-
-
-
+    agents=[
+        LovableAgentAdapter(template_path=template_path),
+        scenario.UserSimulatorAgent(),
+        scenario.JudgeAgent(
+            criteria=[
+                "agent reads the files before go and making changes",
+                "agent modified the index.css file, not only the Index.tsx file",
+                "agent created a comprehensive landing page",
+                "agent extended the landing page with a new section",
+                "agent should NOT say it can't read the file",
+                "agent should NOT produce incomplete code or be too lazy to finish",
+            ],
+        ),
     ],
-    max_turns=5,
+    max_turns=5, # optional
 )
-
-result = await scenario.run()
 ```
 
-
+Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+
+## Full Control of the Conversation
+
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:
 
-
+- Control what the user says, or let it be generated automatically
+- Control what the agent says, or let it be generated automatically
+- Add custom assertions, for example making sure a tool was called
+- Add a custom evaluation, from an external library
+- Let the simulation proceed for a certain number of turns, and evaluate at each new turn
+- Trigger the judge agent to decide on a verdict
+- Add arbitrary messages like mock tool calls in the middle of the conversation
 
-
+Everything is possible, using the same simple structure:
 
 ```python
 @pytest.mark.agent_test
@@ -202,7 +254,7 @@ async def test_ai_assistant_agent():
     scenario = Scenario(
         name="false assumptions",
         description="""
-            The agent makes false assumption about
+            The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
         """,
         agent=AiAssistantAgentAdapter,
         criteria=[
@@ -219,13 +271,22 @@ async def test_ai_assistant_agent():
         [
             # Define existing history of messages
             scenario.user("how do I safely approach a bank?"),
+
             # Or let it be generate automatically
             scenario.agent(),
+
             # Add custom assertions, for example making sure a tool was called
             check_if_tool_was_called,
+
+            # Another user message
             scenario.user(),
-
-
+
+            # Let the simulation proceed for 2 more turns, print at every turn
+            scenario.proceed(
+                turns=2,
+                on_turn=lambda state: print(f"Turn {state.current_turn}: {state.messages}"),
+            ),
+
             # Time to make a judgment call
             scenario.judge(),
         ]
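Since script steps are plain functions of the scenario state, as the "Full Control of the Conversation" bullets above describe, custom assertions compose freely with the built-in steps. A small sketch reusing only names that appear in this diff (`scenario.ScenarioState`, `has_tool_call`, `messages`, `scenario.user`, `scenario.agent`, `scenario.proceed`, `scenario.judge`); the specific checks are illustrative:

```python
import scenario


def check_weather_tool_call(state: scenario.ScenarioState):
    # Custom assertion step: fails the scenario if the tool was never called.
    assert state.has_tool_call("get_current_weather")
    assert len(state.messages) >= 2  # at least one user turn and one agent turn so far


script = [
    scenario.user("what's the weather like in Rome this weekend?"),
    scenario.agent(),
    check_weather_tool_call,
    scenario.proceed(
        turns=2,
        on_turn=lambda state: print(f"Turn {state.current_turn}: {state.messages}"),
    ),
    scenario.judge(),
]
```

This list would be passed as `script=` to `scenario.run(...)`, exactly like the weather example near the top of this diff.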
--- langwatch_scenario-0.3.0/README.md
+++ langwatch_scenario-0.4.0/README.md
The README.md hunks (`@@ -4,11 +4,17 @@` through `@@ -184,13 +233,22 @@`) contain the same content changes shown above for the README portion of PKG-INFO, offset only by the package metadata header.