langwatch-scenario 0.1.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/PKG-INFO +95 -34
  2. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/README.md +91 -32
  3. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO +95 -34
  4. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt +6 -2
  5. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/requires.txt +3 -1
  6. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/pyproject.toml +18 -9
  7. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/__init__.py +13 -3
  8. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/config.py +18 -7
  9. langwatch_scenario-0.3.0/scenario/error_messages.py +134 -0
  10. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/pytest_plugin.py +8 -8
  11. langwatch_scenario-0.3.0/scenario/scenario.py +238 -0
  12. langwatch_scenario-0.3.0/scenario/scenario_agent_adapter.py +16 -0
  13. langwatch_scenario-0.3.0/scenario/scenario_executor.py +466 -0
  14. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/testing_agent.py +123 -109
  15. langwatch_scenario-0.3.0/scenario/types.py +96 -0
  16. langwatch_scenario-0.3.0/scenario/utils.py +264 -0
  17. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/setup.py +1 -1
  18. langwatch_scenario-0.3.0/tests/test_scenario.py +434 -0
  19. langwatch_scenario-0.3.0/tests/test_scenario_agent.py +39 -0
  20. langwatch_scenario-0.3.0/tests/test_scenario_executor.py +162 -0
  21. langwatch_scenario-0.1.3/scenario/error_messages.py +0 -76
  22. langwatch_scenario-0.1.3/scenario/result.py +0 -81
  23. langwatch_scenario-0.1.3/scenario/scenario.py +0 -120
  24. langwatch_scenario-0.1.3/scenario/scenario_executor.py +0 -204
  25. langwatch_scenario-0.1.3/scenario/utils.py +0 -121
  26. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
  27. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
  28. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
  29. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/cache.py +0 -0
  30. {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/setup.cfg +0 -0
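The biggest user-facing change in this release shows up in the README and PKG-INFO diffs below: `Scenario` now takes a `name`, a `description`, and a single merged `criteria` list (replacing the positional description plus separate `success_criteria`/`failure_criteria`), and your agent is wrapped in a `ScenarioAgentAdapter` subclass instead of being passed as a bare function. The sketch below condenses that before/after; it is stitched together from the README snippets in this diff rather than from the 0.3.0 source, so treat the exact signatures as unverified.

```python
# Condensed 0.3.0-style usage, assembled from the README changes in this diff.
# Every Scenario/TestingAgent signature here comes from the diff text, not from
# the released code, so treat it as a sketch.
import pytest

from scenario import (
    AgentInput,
    AgentReturnTypes,
    Scenario,
    ScenarioAgentAdapter,
    TestingAgent,
)

Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


class VegetarianRecipeAgent:
    """Stand-in for your real agent; replace with your own implementation."""

    def run(self, message: str):
        return [{"role": "assistant", "content": f"Here is a vegetarian idea for: {message}"}]


class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
    """Adapter layer: the simulator calls this instead of your agent directly."""

    def __init__(self, input: AgentInput):
        self.agent = VegetarianRecipeAgent()

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Forward only the newest user message, return the agent's reply message(s).
        return self.agent.run(input.last_new_user_message_str())


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_vegetarian_recipe_agent():
    scenario = Scenario(
        name="dinner idea",                  # 0.1.3 took one positional description string
        description="The user is hungry and asks for a dinner recipe.",
        agent=VegetarianRecipeAgentAdapter,  # 0.1.3 took a plain callable here
        criteria=[                           # replaces success_criteria + failure_criteria
            "Agent should generate a recipe",
            "Agent should NOT ask more than two follow-up questions",
        ],
    )
    result = await scenario.run()
    assert result.success
```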
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langwatch-scenario
- Version: 0.1.3
+ Version: 0.3.0
  Summary: The end-to-end agent testing library
  Author-email: LangWatch Team <support@langwatch.ai>
  License: MIT
@@ -25,11 +25,13 @@ Requires-Dist: joblib>=1.4.2
  Requires-Dist: wrapt>=1.17.2
  Requires-Dist: pytest-asyncio>=0.26.0
  Requires-Dist: rich<15.0.0,>=13.3.3
+ Requires-Dist: pksuid>=1.1.2
  Provides-Extra: dev
  Requires-Dist: black; extra == "dev"
  Requires-Dist: isort; extra == "dev"
- Requires-Dist: mypy; extra == "dev"
  Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: pre-commit; extra == "dev"
+ Requires-Dist: commitizen; extra == "dev"

  ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)

@@ -39,12 +41,17 @@ Requires-Dist: pytest-cov; extra == "dev"

  # Scenario: Use an Agent to test your Agent

- Scenario is a library for testing agents end-to-end as a human would, but without having to manually do it. The automated testing agent covers every single scenario for you.
+ Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

- You define the scenarios, and the testing agent will simulate your users as it follows them, it will keep chatting and evaluating your agent until it reaches the desired goal or detects an unexpected behavior.
+ You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.

  [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

+ ### See also
+
+ - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+ - [Scenario Go](https://github.com/langwatch/scenario-go/)
+
  ## Getting Started

  Install pytest and scenario:
@@ -58,32 +65,40 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
  ```python
  import pytest

- from scenario import Scenario, TestingAgent, scenario_cache
+ from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache

  Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


+ # Create an adapter to call your agent
+ class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+     def __init__(self, input: AgentInput):
+         self.agent = VegetarianRecipeAgent()
+
+     async def call(self, input: AgentInput) -> AgentReturnTypes:
+         return self.agent.run(input.last_new_user_message_str())
+
+
  @pytest.mark.agent_test
  @pytest.mark.asyncio
  async def test_vegetarian_recipe_agent():
-     agent = VegetarianRecipeAgent()
-
-     def vegetarian_recipe_agent(message, context):
-         # Call your agent here
-         return agent.run(message)
-
-     # Define the scenario
+     # Define the simulated scenario
      scenario = Scenario(
-         "User is looking for a dinner idea",
+         name="dinner idea",
+         description="""
+             It's saturday evening, the user is very hungry and tired,
+             but have no money to order out, so they are looking for a recipe.
+
+             The user never mentions they want a vegetarian recipe.
+         """,
          agent=vegetarian_recipe_agent,
-         success_criteria=[
-             "Recipe agent generates a vegetarian recipe",
-             "Recipe includes a list of ingredients",
-             "Recipe includes step-by-step cooking instructions",
-         ],
-         failure_criteria=[
-             "The recipe is not vegetarian or includes meat",
-             "The agent asks more than two follow-up questions",
+         # List the evaluation criteria for the scenario to be considered successful
+         criteria=[
+             "Agent should not ask more than two follow-up questions",
+             "Agent should generate a recipe",
+             "Recipe should include a list of ingredients",
+             "Recipe should include step-by-step cooking instructions",
+             "Recipe should be vegetarian and not include any sort of meat",
          ],
      )

@@ -111,9 +126,11 @@ class VegetarianRecipeAgent:
              messages=[
                  {
                      "role": "system",
-                     "content": """You are a vegetarian recipe agent.
-                     Given the user request, ask AT MOST ONE follow-up question,
-                     then provide a complete recipe. Keep your responses concise and focused.""",
+                     "content": """
+                         You are a vegetarian recipe agent.
+                         Given the user request, ask AT MOST ONE follow-up question,
+                         then provide a complete recipe. Keep your responses concise and focused.
+                     """,
                  },
                  *self.history,
              ],
@@ -121,7 +138,7 @@ class VegetarianRecipeAgent:
          message = response.choices[0].message # type: ignore
          self.history.append(message)

-         return {"messages": [message]}
+         return [message]

  ```

@@ -151,19 +168,20 @@ For example, in this Lovable Clone scenario test:

  ```python
  scenario = Scenario(
-     "user wants to create a new landing page for their dog walking startup",
+     name="dog walking startup landing page",
+     description="""
+         the user wants to create a new landing page for their dog walking startup
+
+         send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+     """,
      agent=lovable_agent,
-     strategy="send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict",
-     success_criteria=[
+     criteria=[
          "agent reads the files before go and making changes",
-         "agent modified the index.css file",
-         "agent modified the Index.tsx file",
+         "agent modified the index.css file, not only the Index.tsx file",
          "agent created a comprehensive landing page",
          "agent extended the landing page with a new section",
-     ],
-     failure_criteria=[
-         "agent says it can't read the file",
-         "agent produces incomplete code or is too lazy to finish",
+         "agent should NOT say it can't read the file",
+         "agent should NOT produce incomplete code or be too lazy to finish",
      ],
      max_turns=5,
  )
@@ -173,6 +191,49 @@ result = await scenario.run()

  You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+ ## Specify a script for guiding the scenario
+
+ You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+ ```python
+ @pytest.mark.agent_test
+ @pytest.mark.asyncio
+ async def test_ai_assistant_agent():
+     scenario = Scenario(
+         name="false assumptions",
+         description="""
+             The agent makes false assumption about being an ATM bank, and user corrects it
+         """,
+         agent=AiAssistantAgentAdapter,
+         criteria=[
+             "user should get good recommendations on river crossing",
+             "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+         ],
+         max_turns=5,
+     )
+
+     def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+         assert state.has_tool_call("web_search")
+
+     result = await scenario.script(
+         [
+             # Define existing history of messages
+             scenario.user("how do I safely approach a bank?"),
+             # Or let it be generate automatically
+             scenario.agent(),
+             # Add custom assertions, for example making sure a tool was called
+             check_if_tool_was_called,
+             scenario.user(),
+             # Let the simulation proceed for 2 more turns
+             scenario.proceed(turns=2),
+             # Time to make a judgment call
+             scenario.judge(),
+         ]
+     ).run()
+
+     assert result.success
+ ```
+
  ## Debug mode

  You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
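As a quick illustration of the debug toggle described just above, a minimal sketch (the `debug` keyword and its placement are taken from the README text in this diff, not verified against the 0.3.0 signatures):

```python
# Minimal sketch of the debug switch described above; assumes Scenario.configure
# accepts a `debug` keyword as the README states.
from scenario import Scenario, TestingAgent

Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    debug=True,  # pause the simulation so you can inspect or intervene at each turn
)
```

Per the same paragraph, the `debug` field can instead be set on a single `Scenario(...)`, or enabled for one run with `pytest --debug`.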
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/README.md

@@ -6,12 +6,17 @@

  # Scenario: Use an Agent to test your Agent

- Scenario is a library for testing agents end-to-end as a human would, but without having to manually do it. The automated testing agent covers every single scenario for you.
+ Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

- You define the scenarios, and the testing agent will simulate your users as it follows them, it will keep chatting and evaluating your agent until it reaches the desired goal or detects an unexpected behavior.
+ You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.

  [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

+ ### See also
+
+ - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+ - [Scenario Go](https://github.com/langwatch/scenario-go/)
+
  ## Getting Started

  Install pytest and scenario:
@@ -25,32 +30,40 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
  ```python
  import pytest

- from scenario import Scenario, TestingAgent, scenario_cache
+ from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache

  Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


+ # Create an adapter to call your agent
+ class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+     def __init__(self, input: AgentInput):
+         self.agent = VegetarianRecipeAgent()
+
+     async def call(self, input: AgentInput) -> AgentReturnTypes:
+         return self.agent.run(input.last_new_user_message_str())
+
+
  @pytest.mark.agent_test
  @pytest.mark.asyncio
  async def test_vegetarian_recipe_agent():
-     agent = VegetarianRecipeAgent()
-
-     def vegetarian_recipe_agent(message, context):
-         # Call your agent here
-         return agent.run(message)
-
-     # Define the scenario
+     # Define the simulated scenario
      scenario = Scenario(
-         "User is looking for a dinner idea",
+         name="dinner idea",
+         description="""
+             It's saturday evening, the user is very hungry and tired,
+             but have no money to order out, so they are looking for a recipe.
+
+             The user never mentions they want a vegetarian recipe.
+         """,
          agent=vegetarian_recipe_agent,
-         success_criteria=[
-             "Recipe agent generates a vegetarian recipe",
-             "Recipe includes a list of ingredients",
-             "Recipe includes step-by-step cooking instructions",
-         ],
-         failure_criteria=[
-             "The recipe is not vegetarian or includes meat",
-             "The agent asks more than two follow-up questions",
+         # List the evaluation criteria for the scenario to be considered successful
+         criteria=[
+             "Agent should not ask more than two follow-up questions",
+             "Agent should generate a recipe",
+             "Recipe should include a list of ingredients",
+             "Recipe should include step-by-step cooking instructions",
+             "Recipe should be vegetarian and not include any sort of meat",
          ],
      )

@@ -78,9 +91,11 @@ class VegetarianRecipeAgent:
              messages=[
                  {
                      "role": "system",
-                     "content": """You are a vegetarian recipe agent.
-                     Given the user request, ask AT MOST ONE follow-up question,
-                     then provide a complete recipe. Keep your responses concise and focused.""",
+                     "content": """
+                         You are a vegetarian recipe agent.
+                         Given the user request, ask AT MOST ONE follow-up question,
+                         then provide a complete recipe. Keep your responses concise and focused.
+                     """,
                  },
                  *self.history,
              ],
@@ -88,7 +103,7 @@ class VegetarianRecipeAgent:
          message = response.choices[0].message # type: ignore
          self.history.append(message)

-         return {"messages": [message]}
+         return [message]

  ```

@@ -118,19 +133,20 @@ For example, in this Lovable Clone scenario test:

  ```python
  scenario = Scenario(
-     "user wants to create a new landing page for their dog walking startup",
+     name="dog walking startup landing page",
+     description="""
+         the user wants to create a new landing page for their dog walking startup
+
+         send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+     """,
      agent=lovable_agent,
-     strategy="send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict",
-     success_criteria=[
+     criteria=[
          "agent reads the files before go and making changes",
-         "agent modified the index.css file",
-         "agent modified the Index.tsx file",
+         "agent modified the index.css file, not only the Index.tsx file",
          "agent created a comprehensive landing page",
          "agent extended the landing page with a new section",
-     ],
-     failure_criteria=[
-         "agent says it can't read the file",
-         "agent produces incomplete code or is too lazy to finish",
+         "agent should NOT say it can't read the file",
+         "agent should NOT produce incomplete code or be too lazy to finish",
      ],
      max_turns=5,
  )
@@ -140,6 +156,49 @@ result = await scenario.run()

  You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+ ## Specify a script for guiding the scenario
+
+ You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+ ```python
+ @pytest.mark.agent_test
+ @pytest.mark.asyncio
+ async def test_ai_assistant_agent():
+     scenario = Scenario(
+         name="false assumptions",
+         description="""
+             The agent makes false assumption about being an ATM bank, and user corrects it
+         """,
+         agent=AiAssistantAgentAdapter,
+         criteria=[
+             "user should get good recommendations on river crossing",
+             "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+         ],
+         max_turns=5,
+     )
+
+     def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+         assert state.has_tool_call("web_search")
+
+     result = await scenario.script(
+         [
+             # Define existing history of messages
+             scenario.user("how do I safely approach a bank?"),
+             # Or let it be generate automatically
+             scenario.agent(),
+             # Add custom assertions, for example making sure a tool was called
+             check_if_tool_was_called,
+             scenario.user(),
+             # Let the simulation proceed for 2 more turns
+             scenario.proceed(turns=2),
+             # Time to make a judgment call
+             scenario.judge(),
+         ]
+     ).run()
+
+     assert result.success
+ ```
+
  ## Debug mode

  You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
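One detail worth pulling out of the example changes above: in 0.1.3 the agent function returned `{"messages": [message]}`, whereas the 0.3.0 adapter returns the message(s) directly, typed as `AgentReturnTypes`. A minimal sketch of that reply contract, with a canned reply standing in for a real agent (only the list-of-messages form appears in this diff; any other return form would be an assumption):

```python
# Sketch of the 0.3.0 reply contract visible in the README diff: call() returns the
# assistant message(s) themselves rather than a {"messages": [...]} wrapper.
from scenario import AgentInput, AgentReturnTypes, ScenarioAgentAdapter


class CannedReplyAdapter(ScenarioAgentAdapter):
    async def call(self, input: AgentInput) -> AgentReturnTypes:
        user_message = input.last_new_user_message_str()
        # List-of-messages form, mirroring `return [message]` in the example above.
        return [{"role": "assistant", "content": f"You asked: {user_message}"}]
```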
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO (same changes as PKG-INFO above)
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt

@@ -12,8 +12,12 @@ scenario/cache.py
  scenario/config.py
  scenario/error_messages.py
  scenario/pytest_plugin.py
- scenario/result.py
  scenario/scenario.py
+ scenario/scenario_agent_adapter.py
  scenario/scenario_executor.py
  scenario/testing_agent.py
- scenario/utils.py
+ scenario/types.py
+ scenario/utils.py
+ tests/test_scenario.py
+ tests/test_scenario_agent.py
+ tests/test_scenario_executor.py
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/requires.txt

@@ -7,9 +7,11 @@ joblib>=1.4.2
  wrapt>=1.17.2
  pytest-asyncio>=0.26.0
  rich<15.0.0,>=13.3.3
+ pksuid>=1.1.2

  [dev]
  black
  isort
- mypy
  pytest-cov
+ pre-commit
+ commitizen
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/pyproject.toml

@@ -4,13 +4,11 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "langwatch-scenario"
- version = "0.1.3"
+ version = "0.3.0"
  description = "The end-to-end agent testing library"
  readme = "README.md"
- authors = [
-     {name = "LangWatch Team", email = "support@langwatch.ai"}
- ]
- license = {text = "MIT"}
+ authors = [{ name = "LangWatch Team", email = "support@langwatch.ai" }]
+ license = { text = "MIT" }
  requires-python = ">=3.9"
  classifiers = [
      "Development Status :: 4 - Beta",
@@ -32,14 +30,16 @@ dependencies = [
      "wrapt>=1.17.2",
      "pytest-asyncio>=0.26.0",
      "rich>=13.3.3,<15.0.0",
+     "pksuid>=1.1.2",
  ]

  [project.optional-dependencies]
  dev = [
      "black",
      "isort",
-     "mypy",
      "pytest-cov",
+     "pre-commit",
+     "commitizen",
  ]

  [project.urls]
@@ -47,12 +47,21 @@ dev = [
  "Bug Tracker" = "https://github.com/langwatch/scenario/issues"

  [tool.pytest.ini_options]
- markers = [
-     "agent_test: marks tests as agent scenario tests",
- ]
+ markers = ["agent_test: marks tests as agent scenario tests"]

  [dependency-groups]
  dev = [
+     "commitizen>=4.8.3",
+     "pre-commit>=4.2.0",
      "pydantic-ai>=0.0.52",
+     "pyright>=1.1.401",
      "pytest-asyncio-concurrent>=0.4.1",
  ]
+
+ [tool.commitizen]
+ name = "cz_conventional_commits"
+ version = "0.2.0"
+ tag_format = "v$version"
+ version_files = ["pyproject.toml:version"]
+ bump_message = "bump: version $current_version → $new_version"
+ major_version_zero = true