langwatch-scenario 0.1.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/PKG-INFO +95 -34
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/README.md +91 -32
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO +95 -34
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt +6 -2
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/requires.txt +3 -1
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/pyproject.toml +18 -9
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/__init__.py +13 -3
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/config.py +18 -7
- langwatch_scenario-0.3.0/scenario/error_messages.py +134 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/pytest_plugin.py +8 -8
- langwatch_scenario-0.3.0/scenario/scenario.py +238 -0
- langwatch_scenario-0.3.0/scenario/scenario_agent_adapter.py +16 -0
- langwatch_scenario-0.3.0/scenario/scenario_executor.py +466 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/testing_agent.py +123 -109
- langwatch_scenario-0.3.0/scenario/types.py +96 -0
- langwatch_scenario-0.3.0/scenario/utils.py +264 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/setup.py +1 -1
- langwatch_scenario-0.3.0/tests/test_scenario.py +434 -0
- langwatch_scenario-0.3.0/tests/test_scenario_agent.py +39 -0
- langwatch_scenario-0.3.0/tests/test_scenario_executor.py +162 -0
- langwatch_scenario-0.1.3/scenario/error_messages.py +0 -76
- langwatch_scenario-0.1.3/scenario/result.py +0 -81
- langwatch_scenario-0.1.3/scenario/scenario.py +0 -120
- langwatch_scenario-0.1.3/scenario/scenario_executor.py +0 -204
- langwatch_scenario-0.1.3/scenario/utils.py +0 -121
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/scenario/cache.py +0 -0
- {langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/setup.cfg +0 -0
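Two changes stand out in the file list: `scenario/scenario_agent_adapter.py` and `scenario/types.py` are new modules, `scenario/result.py` is gone, and the README diff below swaps the plain agent function for a `ScenarioAgentAdapter` subclass. A minimal before/after sketch of that migration, assuming only the adapter interface visible in the README diff (`MyAgent`, `my_agent_under_test`, and `MyAgentAdapter` are hypothetical stand-ins, not names from the package):

```python
from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes

Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


class MyAgent:
    """Hypothetical stand-in for the agent you want to test."""

    def run(self, message: str) -> str:
        return f"You asked: {message}"


# 0.1.3 style: the agent under test was a plain callable taking (message, context)
def my_agent_under_test(message, context):
    return MyAgent().run(message)


# 0.3.0 style: the agent under test is wrapped in a ScenarioAgentAdapter subclass,
# mirroring the VegetarianRecipeAgentAdapter shown in the README diff below
class MyAgentAdapter(ScenarioAgentAdapter):
    def __init__(self, input: AgentInput):
        self.agent = MyAgent()

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # last_new_user_message_str() is taken from the README diff; it returns
        # the latest simulated user message as a plain string
        return self.agent.run(input.last_new_user_message_str())
```

Passing the adapter class itself (rather than an instance) mirrors the `agent=AiAssistantAgentAdapter` usage in the scripted example further down.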
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.1.3
+Version: 0.3.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -25,11 +25,13 @@ Requires-Dist: joblib>=1.4.2
 Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
 Requires-Dist: rich<15.0.0,>=13.3.3
+Requires-Dist: pksuid>=1.1.2
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
-Requires-Dist: mypy; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: commitizen; extra == "dev"
 
 
 
@@ -39,12 +41,17 @@ Requires-Dist: pytest-cov; extra == "dev"
 
 # Scenario: Use an Agent to test your Agent
 
-Scenario is
+Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.
 
-You define the scenarios, and the testing agent will simulate
+You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
+### See also
+
+- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+- [Scenario Go](https://github.com/langwatch/scenario-go/)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -58,32 +65,40 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
 ```python
 import pytest
 
-from scenario import Scenario, TestingAgent, scenario_cache
+from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
 
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
 
 
+# Create an adapter to call your agent
+class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+    def __init__(self, input: AgentInput):
+        self.agent = VegetarianRecipeAgent()
+
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        return self.agent.run(input.last_new_user_message_str())
+
+
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-
-
-    def vegetarian_recipe_agent(message, context):
-        # Call your agent here
-        return agent.run(message)
-
-    # Define the scenario
+    # Define the simulated scenario
     scenario = Scenario(
-        "
+        name="dinner idea",
+        description="""
+            It's saturday evening, the user is very hungry and tired,
+            but have no money to order out, so they are looking for a recipe.
+
+            The user never mentions they want a vegetarian recipe.
+        """,
         agent=vegetarian_recipe_agent,
-
-
-        "
-        "
-
-
-        "
-            "The agent asks more than two follow-up questions",
+        # List the evaluation criteria for the scenario to be considered successful
+        criteria=[
+            "Agent should not ask more than two follow-up questions",
+            "Agent should generate a recipe",
+            "Recipe should include a list of ingredients",
+            "Recipe should include step-by-step cooking instructions",
+            "Recipe should be vegetarian and not include any sort of meat",
        ],
    )
 
@@ -111,9 +126,11 @@ class VegetarianRecipeAgent:
             messages=[
                 {
                     "role": "system",
-                    "content": """
-
-
+                    "content": """
+                        You are a vegetarian recipe agent.
+                        Given the user request, ask AT MOST ONE follow-up question,
+                        then provide a complete recipe. Keep your responses concise and focused.
+                    """,
                 },
                 *self.history,
             ],
@@ -121,7 +138,7 @@ class VegetarianRecipeAgent:
         message = response.choices[0].message  # type: ignore
         self.history.append(message)
 
-        return
+        return [message]
 
 ```
 
@@ -151,19 +168,20 @@ For example, in this Lovable Clone scenario test:
 
 ```python
 scenario = Scenario(
-    "
+    name="dog walking startup landing page",
+    description="""
+        the user wants to create a new landing page for their dog walking startup
+
+        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+    """,
     agent=lovable_agent,
-
-    success_criteria=[
+    criteria=[
        "agent reads the files before go and making changes",
-        "agent modified the index.css file",
-        "agent modified the Index.tsx file",
+        "agent modified the index.css file, not only the Index.tsx file",
        "agent created a comprehensive landing page",
        "agent extended the landing page with a new section",
-
-
-        "agent says it can't read the file",
-        "agent produces incomplete code or is too lazy to finish",
+        "agent should NOT say it can't read the file",
+        "agent should NOT produce incomplete code or be too lazy to finish",
    ],
    max_turns=5,
 )
@@ -173,6 +191,49 @@ result = await scenario.run()
 
 You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
 
+## Specify a script for guiding the scenario
+
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_ai_assistant_agent():
+    scenario = Scenario(
+        name="false assumptions",
+        description="""
+            The agent makes false assumption about being an ATM bank, and user corrects it
+        """,
+        agent=AiAssistantAgentAdapter,
+        criteria=[
+            "user should get good recommendations on river crossing",
+            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+        ],
+        max_turns=5,
+    )
+
+    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+        assert state.has_tool_call("web_search")
+
+    result = await scenario.script(
+        [
+            # Define existing history of messages
+            scenario.user("how do I safely approach a bank?"),
+            # Or let it be generate automatically
+            scenario.agent(),
+            # Add custom assertions, for example making sure a tool was called
+            check_if_tool_was_called,
+            scenario.user(),
+            # Let the simulation proceed for 2 more turns
+            scenario.proceed(turns=2),
+            # Time to make a judgment call
+            scenario.judge(),
+        ]
+    ).run()
+
+    assert result.success
+```
+
 ## Debug mode
 
 You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/README.md
RENAMED
@@ -6,12 +6,17 @@
 
 # Scenario: Use an Agent to test your Agent
 
-Scenario is
+Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.
 
-You define the scenarios, and the testing agent will simulate
+You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
+### See also
+
+- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+- [Scenario Go](https://github.com/langwatch/scenario-go/)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -25,32 +30,40 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
 ```python
 import pytest
 
-from scenario import Scenario, TestingAgent, scenario_cache
+from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache
 
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
 
 
+# Create an adapter to call your agent
+class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+    def __init__(self, input: AgentInput):
+        self.agent = VegetarianRecipeAgent()
+
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        return self.agent.run(input.last_new_user_message_str())
+
+
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-
-
-    def vegetarian_recipe_agent(message, context):
-        # Call your agent here
-        return agent.run(message)
-
-    # Define the scenario
+    # Define the simulated scenario
    scenario = Scenario(
-        "
+        name="dinner idea",
+        description="""
+            It's saturday evening, the user is very hungry and tired,
+            but have no money to order out, so they are looking for a recipe.
+
+            The user never mentions they want a vegetarian recipe.
+        """,
        agent=vegetarian_recipe_agent,
-
-
-        "
-        "
-
-
-        "
-            "The agent asks more than two follow-up questions",
+        # List the evaluation criteria for the scenario to be considered successful
+        criteria=[
+            "Agent should not ask more than two follow-up questions",
+            "Agent should generate a recipe",
+            "Recipe should include a list of ingredients",
+            "Recipe should include step-by-step cooking instructions",
+            "Recipe should be vegetarian and not include any sort of meat",
        ],
    )
 
@@ -78,9 +91,11 @@ class VegetarianRecipeAgent:
             messages=[
                 {
                     "role": "system",
-                    "content": """
-
-
+                    "content": """
+                        You are a vegetarian recipe agent.
+                        Given the user request, ask AT MOST ONE follow-up question,
+                        then provide a complete recipe. Keep your responses concise and focused.
+                    """,
                 },
                 *self.history,
             ],
@@ -88,7 +103,7 @@ class VegetarianRecipeAgent:
         message = response.choices[0].message  # type: ignore
         self.history.append(message)
 
-        return
+        return [message]
 
 ```
 
@@ -118,19 +133,20 @@ For example, in this Lovable Clone scenario test:
 
 ```python
 scenario = Scenario(
-    "
+    name="dog walking startup landing page",
+    description="""
+        the user wants to create a new landing page for their dog walking startup
+
+        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+    """,
    agent=lovable_agent,
-
-    success_criteria=[
+    criteria=[
        "agent reads the files before go and making changes",
-        "agent modified the index.css file",
-        "agent modified the Index.tsx file",
+        "agent modified the index.css file, not only the Index.tsx file",
        "agent created a comprehensive landing page",
        "agent extended the landing page with a new section",
-
-
-        "agent says it can't read the file",
-        "agent produces incomplete code or is too lazy to finish",
+        "agent should NOT say it can't read the file",
+        "agent should NOT produce incomplete code or be too lazy to finish",
    ],
    max_turns=5,
 )
@@ -140,6 +156,49 @@ result = await scenario.run()
 
 You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
 
+## Specify a script for guiding the scenario
+
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_ai_assistant_agent():
+    scenario = Scenario(
+        name="false assumptions",
+        description="""
+            The agent makes false assumption about being an ATM bank, and user corrects it
+        """,
+        agent=AiAssistantAgentAdapter,
+        criteria=[
+            "user should get good recommendations on river crossing",
+            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+        ],
+        max_turns=5,
+    )
+
+    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+        assert state.has_tool_call("web_search")
+
+    result = await scenario.script(
+        [
+            # Define existing history of messages
+            scenario.user("how do I safely approach a bank?"),
+            # Or let it be generate automatically
+            scenario.agent(),
+            # Add custom assertions, for example making sure a tool was called
+            check_if_tool_was_called,
+            scenario.user(),
+            # Let the simulation proceed for 2 more turns
+            scenario.proceed(turns=2),
+            # Time to make a judgment call
+            scenario.judge(),
+        ]
+    ).run()
+
+    assert result.success
+```
+
 ## Debug mode
 
 You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
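The debug-mode note that closes both diffs above names two configuration points. A short sketch of both, reusing the `VegetarianRecipeAgentAdapter` from the Getting Started example; treating `debug=True` as a plain keyword field in each place is an assumption drawn from the wording of that note:

```python
from scenario import Scenario, TestingAgent

# Globally, when configuring the testing agent
Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    debug=True,  # assumption: set as a keyword field on configure()
)

# Or only on the specific scenario being run
scenario = Scenario(
    name="dinner idea",
    description="The user is hungry and looking for a recipe.",
    agent=VegetarianRecipeAgentAdapter,  # adapter from the Getting Started example
    criteria=["Agent should generate a recipe"],
    debug=True,
)

# The same switch is also exposed on the command line: pytest -m agent_test --debug
```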
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO
RENAMED
Identical to the PKG-INFO diff above: the egg-info copy carries the same metadata and long-description changes (hunks @@ -1,6 +1,6 @@ through @@ -173,6 +191,49 @@).
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt
RENAMED
@@ -12,8 +12,12 @@ scenario/cache.py
 scenario/config.py
 scenario/error_messages.py
 scenario/pytest_plugin.py
-scenario/result.py
 scenario/scenario.py
+scenario/scenario_agent_adapter.py
 scenario/scenario_executor.py
 scenario/testing_agent.py
-scenario/
+scenario/types.py
+scenario/utils.py
+tests/test_scenario.py
+tests/test_scenario_agent.py
+tests/test_scenario_executor.py
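`scenario/cache.py` is still listed above, and the Getting Started example imports `scenario_cache` without ever showing it in use. A hedged sketch of the usual pattern, assuming it is applied as a decorator on the agent call whose LLM responses you want reused between runs (the exact signature is an assumption; check `scenario/cache.py` in the 0.3.0 tarball):

```python
from scenario import scenario_cache


class VegetarianRecipeAgent:
    def __init__(self):
        self.history = []

    # Assumption: decorating the call lets repeated simulation runs replay
    # cached responses instead of hitting the model API again.
    @scenario_cache()
    def run(self, message: str):
        ...
```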
{langwatch_scenario-0.1.3 → langwatch_scenario-0.3.0}/pyproject.toml
RENAMED
@@ -4,13 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "langwatch-scenario"
-version = "0.1.3"
+version = "0.3.0"
 description = "The end-to-end agent testing library"
 readme = "README.md"
-authors = [
-
-]
-license = {text = "MIT"}
+authors = [{ name = "LangWatch Team", email = "support@langwatch.ai" }]
+license = { text = "MIT" }
 requires-python = ">=3.9"
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -32,14 +30,16 @@ dependencies = [
     "wrapt>=1.17.2",
     "pytest-asyncio>=0.26.0",
     "rich>=13.3.3,<15.0.0",
+    "pksuid>=1.1.2",
 ]
 
 [project.optional-dependencies]
 dev = [
     "black",
     "isort",
-    "mypy",
     "pytest-cov",
+    "pre-commit",
+    "commitizen",
 ]
 
 [project.urls]
@@ -47,12 +47,21 @@ dev = [
 "Bug Tracker" = "https://github.com/langwatch/scenario/issues"
 
 [tool.pytest.ini_options]
-markers = [
-    "agent_test: marks tests as agent scenario tests",
-]
+markers = ["agent_test: marks tests as agent scenario tests"]
 
 [dependency-groups]
 dev = [
+    "commitizen>=4.8.3",
+    "pre-commit>=4.2.0",
     "pydantic-ai>=0.0.52",
+    "pyright>=1.1.401",
     "pytest-asyncio-concurrent>=0.4.1",
 ]
+
+[tool.commitizen]
+name = "cz_conventional_commits"
+version = "0.2.0"
+tag_format = "v$version"
+version_files = ["pyproject.toml:version"]
+bump_message = "bump: version $current_version → $new_version"
+major_version_zero = true