langwatch-scenario 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/PKG-INFO +140 -79
  2. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/README.md +136 -78
  3. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/PKG-INFO +140 -79
  4. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/SOURCES.txt +5 -3
  5. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/requires.txt +3 -0
  6. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/pyproject.toml +5 -1
  7. langwatch_scenario-0.4.0/scenario/__init__.py +250 -0
  8. langwatch_scenario-0.4.0/scenario/agent_adapter.py +111 -0
  9. langwatch_scenario-0.4.0/scenario/cache.py +186 -0
  10. langwatch_scenario-0.4.0/scenario/config.py +183 -0
  11. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/scenario/error_messages.py +8 -38
  12. langwatch_scenario-0.4.0/scenario/judge_agent.py +435 -0
  13. langwatch_scenario-0.4.0/scenario/pytest_plugin.py +385 -0
  14. langwatch_scenario-0.4.0/scenario/scenario_executor.py +758 -0
  15. langwatch_scenario-0.4.0/scenario/scenario_state.py +205 -0
  16. langwatch_scenario-0.4.0/scenario/script.py +361 -0
  17. langwatch_scenario-0.4.0/scenario/types.py +269 -0
  18. langwatch_scenario-0.4.0/scenario/user_simulator_agent.py +249 -0
  19. langwatch_scenario-0.4.0/scenario/utils.py +514 -0
  20. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario.py +177 -192
  21. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario_agent.py +7 -8
  22. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/tests/test_scenario_executor.py +103 -63
  23. langwatch_scenario-0.3.0/scenario/__init__.py +0 -36
  24. langwatch_scenario-0.3.0/scenario/cache.py +0 -62
  25. langwatch_scenario-0.3.0/scenario/config.py +0 -39
  26. langwatch_scenario-0.3.0/scenario/pytest_plugin.py +0 -177
  27. langwatch_scenario-0.3.0/scenario/scenario.py +0 -238
  28. langwatch_scenario-0.3.0/scenario/scenario_agent_adapter.py +0 -16
  29. langwatch_scenario-0.3.0/scenario/scenario_executor.py +0 -466
  30. langwatch_scenario-0.3.0/scenario/testing_agent.py +0 -279
  31. langwatch_scenario-0.3.0/scenario/types.py +0 -96
  32. langwatch_scenario-0.3.0/scenario/utils.py +0 -264
  33. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
  34. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
  35. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
  36. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/setup.cfg +0 -0
  37. {langwatch_scenario-0.3.0 → langwatch_scenario-0.4.0}/setup.py +0 -0
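
The main user-facing change in this release is the README's test API: the 0.3.0 `Scenario` / `TestingAgent` / `ScenarioAgentAdapter` classes are replaced by a module-level `scenario.run()` call used together with `AgentAdapter`, `UserSimulatorAgent`, and `JudgeAgent`. The sketch below is assembled only from the added lines in the README diff that follows; `MyAgent`, `my_agent`, and the criterion text are hypothetical placeholders, not part of the package.

```python
# Minimal 0.4.0-style test, assembled from the added README lines below.
# MyAgent / my_agent and the criterion string are hypothetical placeholders.
import pytest
import scenario

scenario.configure(default_model="openai/gpt-4.1-mini")


class MyAgent(scenario.AgentAdapter):
    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
        return my_agent(input.messages)  # call into your own agent implementation


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_my_agent():
    # 0.3.0 equivalent: Scenario(name=..., agent=..., criteria=[...]) then `await scenario.run()`
    result = await scenario.run(
        name="example scenario",
        description="What the simulated user is trying to accomplish",
        agents=[
            MyAgent(),
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["The agent accomplishes the user's goal"]),
        ],
    )
    assert result.success
```
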
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langwatch-scenario
- Version: 0.3.0
+ Version: 0.4.0
  Summary: The end-to-end agent testing library
  Author-email: LangWatch Team <support@langwatch.ai>
  License: MIT
@@ -32,6 +32,9 @@ Requires-Dist: isort; extra == "dev"
  Requires-Dist: pytest-cov; extra == "dev"
  Requires-Dist: pre-commit; extra == "dev"
  Requires-Dist: commitizen; extra == "dev"
+ Requires-Dist: pyright; extra == "dev"
+ Requires-Dist: pydantic-ai; extra == "dev"
+ Requires-Dist: function-schema; extra == "dev"

  ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)

@@ -39,11 +42,17 @@ Requires-Dist: commitizen; extra == "dev"
  <!-- Discord, PyPI, Docs, etc links -->
  </div>

- # Scenario: Use an Agent to test your Agent
+ # Scenario

  Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

- You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
+ You define the conversation scenario and let it play out, it will keep chatting back and forth with _your_ agent until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
+
+ - Test your agents end-to-end conversations with specified scenarios to capture both happy paths and edge cases
+ - Full flexibility of how much you want to guide the conversation, from fully scripted scenarios to completely automated simulations
+ - Run evaluations at any point of the conversation, designed for multi-turn
+ - Works in combination with any testing and LLM evaluation frameworks, completely agnostic
+ - Works with any LLM and Agent Framework, easy integration

  [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

@@ -52,6 +61,49 @@ You define the scenarios, and the testing agent will simulate a real user as it
  - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
  - [Scenario Go](https://github.com/langwatch/scenario-go/)

+ ## Example
+
+ ```python
+ @pytest.mark.agent_test
+ @pytest.mark.asyncio
+ async def test_weather_agent():
+     # Integrate with your agent
+     class WeatherAgent(scenario.AgentAdapter):
+         async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+             return weather_agent(input.messages)
+
+     # Define any custom assertions
+     def check_for_weather_tool_call(state: scenario.ScenarioState):
+         assert state.has_tool_call("get_current_weather")
+
+     # Run the scenario
+     result = await scenario.run(
+         name="checking the weather",
+         description="""
+             The user is planning a boat trip from Barcelona to Rome,
+             and is wondering what the weather will be like.
+         """,
+         agents=[
+             WeatherAgent(),
+             scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+         ],
+         script=[
+             scenario.user(),
+             scenario.agent(),
+             check_for_weather_tool_call, # check for tool call after the first agent response
+             scenario.succeed(),
+         ],
+     )
+
+     # Assert the simulation was successful
+     assert result.success
+ ```
+
+ > [!NOTE]
+ > This is a very basic example, keep reading to see how to run a simulation completely script-free, using a Judge Agent to evaluate in real-time.
+
+ Check out more examples in the [examples folder](./examples/).
+
  ## Getting Started

  Install pytest and scenario:
@@ -60,51 +112,45 @@ Install pytest and scenario:
  pip install pytest langwatch-scenario
  ```

- Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:
+ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`, copy the full working example below:

  ```python
  import pytest
+ import scenario
+ import litellm

- from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
-
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
-
-
- # Create an adapter to call your agent
- class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
-     def __init__(self, input: AgentInput):
-         self.agent = VegetarianRecipeAgent()
-
-     async def call(self, input: AgentInput) -> AgentReturnTypes:
-         return self.agent.run(input.last_new_user_message_str())
+ scenario.configure(default_model="openai/gpt-4.1-mini")


  @pytest.mark.agent_test
  @pytest.mark.asyncio
  async def test_vegetarian_recipe_agent():
-     # Define the simulated scenario
-     scenario = Scenario(
+     class Agent(scenario.AgentAdapter):
+         async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+             return vegetarian_recipe_agent(input.messages)
+
+     # Run a simulation scenario
+     result = await scenario.run(
          name="dinner idea",
          description="""
              It's saturday evening, the user is very hungry and tired,
              but have no money to order out, so they are looking for a recipe.
-
-             The user never mentions they want a vegetarian recipe.
          """,
-         agent=vegetarian_recipe_agent,
-         # List the evaluation criteria for the scenario to be considered successful
-         criteria=[
-             "Agent should not ask more than two follow-up questions",
-             "Agent should generate a recipe",
-             "Recipe should include a list of ingredients",
-             "Recipe should include step-by-step cooking instructions",
-             "Recipe should be vegetarian and not include any sort of meat",
+         agents=[
+             Agent(),
+             scenario.UserSimulatorAgent(),
+             scenario.JudgeAgent(
+                 criteria=[
+                     "Agent should not ask more than two follow-up questions",
+                     "Agent should generate a recipe",
+                     "Recipe should include a list of ingredients",
+                     "Recipe should include step-by-step cooking instructions",
+                     "Recipe should be vegetarian and not include any sort of meat",
+                 ]
+             ),
          ],
      )

-     # Run the scenario and get results
-     result = await scenario.run()
-
      # Assert for pytest to know whether the test passed
      assert result.success

@@ -113,33 +159,24 @@ async def test_vegetarian_recipe_agent():
  import litellm


- class VegetarianRecipeAgent:
-     def __init__(self):
-         self.history = []
-
-     @scenario_cache()
-     def run(self, message: str):
-         self.history.append({"role": "user", "content": message})
-
-         response = litellm.completion(
-             model="openai/gpt-4o-mini",
-             messages=[
-                 {
-                     "role": "system",
-                     "content": """
-                         You are a vegetarian recipe agent.
-                         Given the user request, ask AT MOST ONE follow-up question,
-                         then provide a complete recipe. Keep your responses concise and focused.
-                     """,
-                 },
-                 *self.history,
-             ],
-         )
-         message = response.choices[0].message # type: ignore
-         self.history.append(message)
-
-         return [message]
+ @scenario.cache()
+ def vegetarian_recipe_agent(messages) -> scenario.AgentReturnTypes:
+     response = litellm.completion(
+         model="openai/gpt-4.1-mini",
+         messages=[
+             {
+                 "role": "system",
+                 "content": """
+                     You are a vegetarian recipe agent.
+                     Given the user request, ask AT MOST ONE follow-up question,
+                     then provide a complete recipe. Keep your responses concise and focused.
+                 """,
+             },
+             *messages,
+         ],
+     )

+     return response.choices[0].message # type: ignore
  ```

  Create a `.env` file and put your OpenAI API key in it:
@@ -158,42 +195,57 @@ This is how it will look like:

  [![asciicast](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11.svg)](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

- You can find a fully working example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
+ You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

- ## Customize strategy and max_turns
+ ## Script-free Simulation

- You can customize how should the testing agent go about testing by defining a `strategy` field. You can also limit the maximum number of turns the scenario will take by setting the `max_turns` field (defaults to 10).
+ By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.

- For example, in this Lovable Clone scenario test:
+ You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.
+
+ You can combine it with a script, to control for example the beginning of the conversation, or simply let it run scriptless, this is very useful to test an open case like a vibe coding assistant:

  ```python
- scenario = Scenario(
+ result = await scenario.run(
      name="dog walking startup landing page",
      description="""
          the user wants to create a new landing page for their dog walking startup

          send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
      """,
-     agent=lovable_agent,
-     criteria=[
-         "agent reads the files before go and making changes",
-         "agent modified the index.css file, not only the Index.tsx file",
-         "agent created a comprehensive landing page",
-         "agent extended the landing page with a new section",
-         "agent should NOT say it can't read the file",
-         "agent should NOT produce incomplete code or be too lazy to finish",
+     agents=[
+         LovableAgentAdapter(template_path=template_path),
+         scenario.UserSimulatorAgent(),
+         scenario.JudgeAgent(
+             criteria=[
+                 "agent reads the files before go and making changes",
+                 "agent modified the index.css file, not only the Index.tsx file",
+                 "agent created a comprehensive landing page",
+                 "agent extended the landing page with a new section",
+                 "agent should NOT say it can't read the file",
+                 "agent should NOT produce incomplete code or be too lazy to finish",
+             ],
+         ),
      ],
-     max_turns=5,
+     max_turns=5, # optional
  )
-
- result = await scenario.run()
  ```

- You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+ Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+
+ ## Full Control of the Conversation
+
+ You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:

- ## Specify a script for guiding the scenario
+ - Control what the user says, or let it be generated automatically
+ - Control what the agent says, or let it be generated automatically
+ - Add custom assertions, for example making sure a tool was called
+ - Add a custom evaluation, from an external library
+ - Let the simulation proceed for a certain number of turns, and evaluate at each new turn
+ - Trigger the judge agent to decide on a verdict
+ - Add arbitrary messages like mock tool calls in the middle of the conversation

- You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+ Everything is possible, using the same simple structure:

  ```python
  @pytest.mark.agent_test
@@ -202,7 +254,7 @@ async def test_ai_assistant_agent():
      scenario = Scenario(
          name="false assumptions",
          description="""
-             The agent makes false assumption about being an ATM bank, and user corrects it
+             The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
          """,
          agent=AiAssistantAgentAdapter,
          criteria=[
@@ -219,13 +271,22 @@ async def test_ai_assistant_agent():
          [
              # Define existing history of messages
              scenario.user("how do I safely approach a bank?"),
+
              # Or let it be generate automatically
              scenario.agent(),
+
              # Add custom assertions, for example making sure a tool was called
              check_if_tool_was_called,
+
+             # Another user message
              scenario.user(),
-             # Let the simulation proceed for 2 more turns
-             scenario.proceed(turns=2),
+
+             # Let the simulation proceed for 2 more turns, print at every turn
+             scenario.proceed(
+                 turns=2,
+                 on_turn=lambda state: print(f"Turn {state.current_turn}: {state.messages}"),
+             ),
+
              # Time to make a judgment call
              scenario.judge(),
          ]
@@ -4,11 +4,17 @@
  <!-- Discord, PyPI, Docs, etc links -->
  </div>

- # Scenario: Use an Agent to test your Agent
+ # Scenario

  Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

- You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
+ You define the conversation scenario and let it play out, it will keep chatting back and forth with _your_ agent until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
+
+ - Test your agents end-to-end conversations with specified scenarios to capture both happy paths and edge cases
+ - Full flexibility of how much you want to guide the conversation, from fully scripted scenarios to completely automated simulations
+ - Run evaluations at any point of the conversation, designed for multi-turn
+ - Works in combination with any testing and LLM evaluation frameworks, completely agnostic
+ - Works with any LLM and Agent Framework, easy integration

  [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

@@ -17,6 +23,49 @@ You define the scenarios, and the testing agent will simulate a real user as it
  - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
  - [Scenario Go](https://github.com/langwatch/scenario-go/)

+ ## Example
+
+ ```python
+ @pytest.mark.agent_test
+ @pytest.mark.asyncio
+ async def test_weather_agent():
+     # Integrate with your agent
+     class WeatherAgent(scenario.AgentAdapter):
+         async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+             return weather_agent(input.messages)
+
+     # Define any custom assertions
+     def check_for_weather_tool_call(state: scenario.ScenarioState):
+         assert state.has_tool_call("get_current_weather")
+
+     # Run the scenario
+     result = await scenario.run(
+         name="checking the weather",
+         description="""
+             The user is planning a boat trip from Barcelona to Rome,
+             and is wondering what the weather will be like.
+         """,
+         agents=[
+             WeatherAgent(),
+             scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+         ],
+         script=[
+             scenario.user(),
+             scenario.agent(),
+             check_for_weather_tool_call, # check for tool call after the first agent response
+             scenario.succeed(),
+         ],
+     )
+
+     # Assert the simulation was successful
+     assert result.success
+ ```
+
+ > [!NOTE]
+ > This is a very basic example, keep reading to see how to run a simulation completely script-free, using a Judge Agent to evaluate in real-time.
+
+ Check out more examples in the [examples folder](./examples/).
+
  ## Getting Started

  Install pytest and scenario:
@@ -25,51 +74,45 @@ Install pytest and scenario:
  pip install pytest langwatch-scenario
  ```

- Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:
+ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`, copy the full working example below:

  ```python
  import pytest
+ import scenario
+ import litellm

- from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
-
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
-
-
- # Create an adapter to call your agent
- class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
-     def __init__(self, input: AgentInput):
-         self.agent = VegetarianRecipeAgent()
-
-     async def call(self, input: AgentInput) -> AgentReturnTypes:
-         return self.agent.run(input.last_new_user_message_str())
+ scenario.configure(default_model="openai/gpt-4.1-mini")


  @pytest.mark.agent_test
  @pytest.mark.asyncio
  async def test_vegetarian_recipe_agent():
-     # Define the simulated scenario
-     scenario = Scenario(
+     class Agent(scenario.AgentAdapter):
+         async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+             return vegetarian_recipe_agent(input.messages)
+
+     # Run a simulation scenario
+     result = await scenario.run(
          name="dinner idea",
          description="""
              It's saturday evening, the user is very hungry and tired,
              but have no money to order out, so they are looking for a recipe.
-
-             The user never mentions they want a vegetarian recipe.
          """,
-         agent=vegetarian_recipe_agent,
-         # List the evaluation criteria for the scenario to be considered successful
-         criteria=[
-             "Agent should not ask more than two follow-up questions",
-             "Agent should generate a recipe",
-             "Recipe should include a list of ingredients",
-             "Recipe should include step-by-step cooking instructions",
-             "Recipe should be vegetarian and not include any sort of meat",
+         agents=[
+             Agent(),
+             scenario.UserSimulatorAgent(),
+             scenario.JudgeAgent(
+                 criteria=[
+                     "Agent should not ask more than two follow-up questions",
+                     "Agent should generate a recipe",
+                     "Recipe should include a list of ingredients",
+                     "Recipe should include step-by-step cooking instructions",
+                     "Recipe should be vegetarian and not include any sort of meat",
+                 ]
+             ),
          ],
      )

-     # Run the scenario and get results
-     result = await scenario.run()
-
      # Assert for pytest to know whether the test passed
      assert result.success

@@ -78,33 +121,24 @@ async def test_vegetarian_recipe_agent():
  import litellm


- class VegetarianRecipeAgent:
-     def __init__(self):
-         self.history = []
-
-     @scenario_cache()
-     def run(self, message: str):
-         self.history.append({"role": "user", "content": message})
-
-         response = litellm.completion(
-             model="openai/gpt-4o-mini",
-             messages=[
-                 {
-                     "role": "system",
-                     "content": """
-                         You are a vegetarian recipe agent.
-                         Given the user request, ask AT MOST ONE follow-up question,
-                         then provide a complete recipe. Keep your responses concise and focused.
-                     """,
-                 },
-                 *self.history,
-             ],
-         )
-         message = response.choices[0].message # type: ignore
-         self.history.append(message)
-
-         return [message]
+ @scenario.cache()
+ def vegetarian_recipe_agent(messages) -> scenario.AgentReturnTypes:
+     response = litellm.completion(
+         model="openai/gpt-4.1-mini",
+         messages=[
+             {
+                 "role": "system",
+                 "content": """
+                     You are a vegetarian recipe agent.
+                     Given the user request, ask AT MOST ONE follow-up question,
+                     then provide a complete recipe. Keep your responses concise and focused.
+                 """,
+             },
+             *messages,
+         ],
+     )

+     return response.choices[0].message # type: ignore
  ```

  Create a `.env` file and put your OpenAI API key in it:
@@ -123,42 +157,57 @@ This is how it will look like:

  [![asciicast](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11.svg)](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

- You can find a fully working example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
+ You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

- ## Customize strategy and max_turns
+ ## Script-free Simulation

- You can customize how should the testing agent go about testing by defining a `strategy` field. You can also limit the maximum number of turns the scenario will take by setting the `max_turns` field (defaults to 10).
+ By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.

- For example, in this Lovable Clone scenario test:
+ You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.
+
+ You can combine it with a script, to control for example the beginning of the conversation, or simply let it run scriptless, this is very useful to test an open case like a vibe coding assistant:

  ```python
- scenario = Scenario(
+ result = await scenario.run(
      name="dog walking startup landing page",
      description="""
          the user wants to create a new landing page for their dog walking startup

          send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
      """,
-     agent=lovable_agent,
-     criteria=[
-         "agent reads the files before go and making changes",
-         "agent modified the index.css file, not only the Index.tsx file",
-         "agent created a comprehensive landing page",
-         "agent extended the landing page with a new section",
-         "agent should NOT say it can't read the file",
-         "agent should NOT produce incomplete code or be too lazy to finish",
+     agents=[
+         LovableAgentAdapter(template_path=template_path),
+         scenario.UserSimulatorAgent(),
+         scenario.JudgeAgent(
+             criteria=[
+                 "agent reads the files before go and making changes",
+                 "agent modified the index.css file, not only the Index.tsx file",
+                 "agent created a comprehensive landing page",
+                 "agent extended the landing page with a new section",
+                 "agent should NOT say it can't read the file",
+                 "agent should NOT produce incomplete code or be too lazy to finish",
+             ],
+         ),
      ],
-     max_turns=5,
+     max_turns=5, # optional
  )
-
- result = await scenario.run()
  ```

- You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+ Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
+
+ ## Full Control of the Conversation
+
+ You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:

- ## Specify a script for guiding the scenario
+ - Control what the user says, or let it be generated automatically
+ - Control what the agent says, or let it be generated automatically
+ - Add custom assertions, for example making sure a tool was called
+ - Add a custom evaluation, from an external library
+ - Let the simulation proceed for a certain number of turns, and evaluate at each new turn
+ - Trigger the judge agent to decide on a verdict
+ - Add arbitrary messages like mock tool calls in the middle of the conversation

- You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+ Everything is possible, using the same simple structure:

  ```python
  @pytest.mark.agent_test
@@ -167,7 +216,7 @@ async def test_ai_assistant_agent():
      scenario = Scenario(
          name="false assumptions",
          description="""
-             The agent makes false assumption about being an ATM bank, and user corrects it
+             The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
          """,
          agent=AiAssistantAgentAdapter,
          criteria=[
@@ -184,13 +233,22 @@ async def test_ai_assistant_agent():
          [
              # Define existing history of messages
              scenario.user("how do I safely approach a bank?"),
+
              # Or let it be generate automatically
              scenario.agent(),
+
              # Add custom assertions, for example making sure a tool was called
              check_if_tool_was_called,
+
+             # Another user message
              scenario.user(),
-             # Let the simulation proceed for 2 more turns
-             scenario.proceed(turns=2),
+
+             # Let the simulation proceed for 2 more turns, print at every turn
+             scenario.proceed(
+                 turns=2,
+                 on_turn=lambda state: print(f"Turn {state.current_turn}: {state.messages}"),
+             ),
+
              # Time to make a judgment call
              scenario.judge(),
          ]