langwatch-scenario 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/PKG-INFO +60 -12
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/README.md +56 -10
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO +60 -12
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt +6 -2
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/requires.txt +3 -1
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/pyproject.toml +18 -9
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/__init__.py +13 -3
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/config.py +18 -7
- langwatch_scenario-0.3.0/scenario/error_messages.py +134 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/pytest_plugin.py +1 -1
- langwatch_scenario-0.3.0/scenario/scenario.py +238 -0
- langwatch_scenario-0.3.0/scenario/scenario_agent_adapter.py +16 -0
- langwatch_scenario-0.3.0/scenario/scenario_executor.py +466 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/testing_agent.py +75 -58
- langwatch_scenario-0.3.0/scenario/types.py +96 -0
- langwatch_scenario-0.3.0/scenario/utils.py +264 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/setup.py +1 -1
- langwatch_scenario-0.3.0/tests/test_scenario.py +434 -0
- langwatch_scenario-0.3.0/tests/test_scenario_agent.py +39 -0
- langwatch_scenario-0.3.0/tests/test_scenario_executor.py +162 -0
- langwatch_scenario-0.2.0/scenario/error_messages.py +0 -76
- langwatch_scenario-0.2.0/scenario/result.py +0 -74
- langwatch_scenario-0.2.0/scenario/scenario.py +0 -123
- langwatch_scenario-0.2.0/scenario/scenario_executor.py +0 -204
- langwatch_scenario-0.2.0/scenario/utils.py +0 -121
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/dependency_links.txt +0 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/entry_points.txt +0 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/top_level.txt +0 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/cache.py +0 -0
- {langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/setup.cfg +0 -0
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.2.0
+Version: 0.3.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -25,11 +25,13 @@ Requires-Dist: joblib>=1.4.2
 Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
 Requires-Dist: rich<15.0.0,>=13.3.3
+Requires-Dist: pksuid>=1.1.2
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
-Requires-Dist: mypy; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: commitizen; extra == "dev"



@@ -39,9 +41,9 @@ Requires-Dist: pytest-cov; extra == "dev"

 # Scenario: Use an Agent to test your Agent

-Scenario is
+Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

-You define the scenarios, and the testing agent will simulate
+You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.

 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

@@ -63,20 +65,23 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
 ```python
 import pytest

-from scenario import Scenario, TestingAgent, scenario_cache
+from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache

 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


+# Create an adapter to call your agent
+class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+    def __init__(self, input: AgentInput):
+        self.agent = VegetarianRecipeAgent()
+
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        return self.agent.run(input.last_new_user_message_str())
+
+
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-    agent = VegetarianRecipeAgent()
-
-    def vegetarian_recipe_agent(message, context):
-        # Call your agent here
-        return agent.run(message)
-
     # Define the simulated scenario
     scenario = Scenario(
         name="dinner idea",
@@ -133,7 +138,7 @@ class VegetarianRecipeAgent:
         message = response.choices[0].message  # type: ignore
         self.history.append(message)

-        return
+        return [message]

 ```

@@ -186,6 +191,49 @@ result = await scenario.run()

 You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+## Specify a script for guiding the scenario
+
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_ai_assistant_agent():
+    scenario = Scenario(
+        name="false assumptions",
+        description="""
+            The agent makes false assumption about being an ATM bank, and user corrects it
+        """,
+        agent=AiAssistantAgentAdapter,
+        criteria=[
+            "user should get good recommendations on river crossing",
+            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+        ],
+        max_turns=5,
+    )
+
+    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+        assert state.has_tool_call("web_search")
+
+    result = await scenario.script(
+        [
+            # Define existing history of messages
+            scenario.user("how do I safely approach a bank?"),
+            # Or let it be generate automatically
+            scenario.agent(),
+            # Add custom assertions, for example making sure a tool was called
+            check_if_tool_was_called,
+            scenario.user(),
+            # Let the simulation proceed for 2 more turns
+            scenario.proceed(turns=2),
+            # Time to make a judgment call
+            scenario.judge(),
+        ]
+    ).run()
+
+    assert result.success
+```
+
 ## Debug mode

 You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/README.md RENAMED
@@ -6,9 +6,9 @@

 # Scenario: Use an Agent to test your Agent

-Scenario is
+Scenario is an Agent Testing Framework for testing AI agents through Simulation Testing.

-You define the scenarios, and the testing agent will simulate
+You define the scenarios, and the testing agent will simulate a real user as it follows them, it will keep chatting back and forth with _your_ agent to play out the simulation, until it reaches the desired goal or detects an unexpected behavior based on the criteria you defined.

 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

@@ -30,20 +30,23 @@ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agen
 ```python
 import pytest

-from scenario import Scenario, TestingAgent, scenario_cache
+from scenario import Scenario, TestingAgent, ScenarioAgentAdapter, AgentInput, AgentReturnTypes, scenario_cache

 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


+# Create an adapter to call your agent
+class VegetarianRecipeAgentAdapter(ScenarioAgentAdapter):
+    def __init__(self, input: AgentInput):
+        self.agent = VegetarianRecipeAgent()
+
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        return self.agent.run(input.last_new_user_message_str())
+
+
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
-    agent = VegetarianRecipeAgent()
-
-    def vegetarian_recipe_agent(message, context):
-        # Call your agent here
-        return agent.run(message)
-
     # Define the simulated scenario
     scenario = Scenario(
         name="dinner idea",
@@ -100,7 +103,7 @@ class VegetarianRecipeAgent:
         message = response.choices[0].message  # type: ignore
         self.history.append(message)

-        return
+        return [message]

 ```

@@ -153,6 +156,49 @@ result = await scenario.run()

 You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

+## Specify a script for guiding the scenario
+
+You can specify a script for guiding the scenario by passing a list of steps to the `script` field.
+
+```python
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_ai_assistant_agent():
+    scenario = Scenario(
+        name="false assumptions",
+        description="""
+            The agent makes false assumption about being an ATM bank, and user corrects it
+        """,
+        agent=AiAssistantAgentAdapter,
+        criteria=[
+            "user should get good recommendations on river crossing",
+            "agent should NOT follow up about ATM recommendation after user has corrected them they are just hiking",
+        ],
+        max_turns=5,
+    )
+
+    def check_if_tool_was_called(state: ScenarioExecutor) -> None:
+        assert state.has_tool_call("web_search")
+
+    result = await scenario.script(
+        [
+            # Define existing history of messages
+            scenario.user("how do I safely approach a bank?"),
+            # Or let it be generate automatically
+            scenario.agent(),
+            # Add custom assertions, for example making sure a tool was called
+            check_if_tool_was_called,
+            scenario.user(),
+            # Let the simulation proceed for 2 more turns
+            scenario.proceed(turns=2),
+            # Time to make a judgment call
+            scenario.judge(),
+        ]
+    ).run()
+
+    assert result.success
+```
+
 ## Debug mode

 You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/PKG-INFO RENAMED
(The changes to this file are identical to the PKG-INFO diff shown above.)
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/langwatch_scenario.egg-info/SOURCES.txt RENAMED
@@ -12,8 +12,12 @@ scenario/cache.py
 scenario/config.py
 scenario/error_messages.py
 scenario/pytest_plugin.py
-scenario/result.py
 scenario/scenario.py
+scenario/scenario_agent_adapter.py
 scenario/scenario_executor.py
 scenario/testing_agent.py
-scenario/
+scenario/types.py
+scenario/utils.py
+tests/test_scenario.py
+tests/test_scenario_agent.py
+tests/test_scenario_executor.py
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/pyproject.toml RENAMED
@@ -4,13 +4,11 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "langwatch-scenario"
-version = "0.2.0"
+version = "0.3.0"
 description = "The end-to-end agent testing library"
 readme = "README.md"
-authors = [
-
-]
-license = {text = "MIT"}
+authors = [{ name = "LangWatch Team", email = "support@langwatch.ai" }]
+license = { text = "MIT" }
 requires-python = ">=3.9"
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -32,14 +30,16 @@ dependencies = [
     "wrapt>=1.17.2",
     "pytest-asyncio>=0.26.0",
     "rich>=13.3.3,<15.0.0",
+    "pksuid>=1.1.2",
 ]

 [project.optional-dependencies]
 dev = [
     "black",
     "isort",
-    "mypy",
     "pytest-cov",
+    "pre-commit",
+    "commitizen",
 ]

 [project.urls]
@@ -47,12 +47,21 @@ dev = [
 "Bug Tracker" = "https://github.com/langwatch/scenario/issues"

 [tool.pytest.ini_options]
-markers = [
-    "agent_test: marks tests as agent scenario tests",
-]
+markers = ["agent_test: marks tests as agent scenario tests"]

 [dependency-groups]
 dev = [
+    "commitizen>=4.8.3",
+    "pre-commit>=4.2.0",
     "pydantic-ai>=0.0.52",
+    "pyright>=1.1.401",
     "pytest-asyncio-concurrent>=0.4.1",
 ]
+
+[tool.commitizen]
+name = "cz_conventional_commits"
+version = "0.2.0"
+tag_format = "v$version"
+version_files = ["pyproject.toml:version"]
+bump_message = "bump: version $current_version → $new_version"
+major_version_zero = true
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/__init__.py RENAMED
@@ -3,10 +3,11 @@ Scenario: A testing library for conversational agents.
 """

 # First import non-dependent modules
-from .
+from .types import ScenarioResult, AgentInput, ScenarioAgentRole, AgentReturnTypes
 from .config import ScenarioConfig

 # Then import modules with dependencies
+from .scenario_agent_adapter import ScenarioAgentAdapter
 from .testing_agent import TestingAgent
 from .scenario import Scenario
 from .cache import scenario_cache
@@ -15,10 +16,19 @@ from .cache import scenario_cache
 from .pytest_plugin import pytest_configure, scenario_reporter

 __all__ = [
-
-    "TestingAgent",
+    # Types
     "ScenarioResult",
+    "AgentInput",
+    "ScenarioAgentRole",
     "ScenarioConfig",
+    "AgentReturnTypes",
+
+    # Classes
+    "Scenario",
+    "ScenarioAgentAdapter",
+    "TestingAgent",
+
+    # Plugins
     "pytest_configure",
     "scenario_reporter",
     "scenario_cache",
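For orientation, the regrouped `__all__` above is the full public surface a 0.3.0 test file imports from the package root; a quick sketch of those imports (only names exported above, nothing invented beyond them):

```python
# Names re-exported by scenario/__init__.py in 0.3.0, per the __all__ above.
from scenario import (
    # types
    ScenarioResult,
    AgentInput,
    ScenarioAgentRole,
    AgentReturnTypes,
    ScenarioConfig,
    # classes
    Scenario,
    ScenarioAgentAdapter,
    TestingAgent,
    # caching helper
    scenario_cache,
)
```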
{langwatch_scenario-0.2.0 → langwatch_scenario-0.3.0}/scenario/config.py RENAMED
@@ -2,10 +2,16 @@
 Configuration module for Scenario.
 """

-from typing import Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Type, Union
 from pydantic import BaseModel

-
+if TYPE_CHECKING:
+    from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+
+    ScenarioAgentType = ScenarioAgentAdapter
+else:
+    ScenarioAgentType = Any
+

 class ScenarioConfig(BaseModel):
     """
@@ -15,14 +21,19 @@ class ScenarioConfig(BaseModel):
     such as the LLM provider and model to use for the testing agent.
     """

-    testing_agent: Optional[
+    testing_agent: Optional[Type[ScenarioAgentType]] = None
     max_turns: Optional[int] = 10
     verbose: Optional[Union[bool, int]] = True
     cache_key: Optional[str] = None
     debug: Optional[bool] = False

     def merge(self, other: "ScenarioConfig") -> "ScenarioConfig":
-        return ScenarioConfig(
-            **
-
-
+        return ScenarioConfig(
+            **{
+                **self.items(),
+                **other.items(),
+            }
+        )
+
+    def items(self):
+        return {k: getattr(self, k) for k in self.model_dump(exclude_none=True).keys()}
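A note on the new `merge`/`items` pair above: `items()` drops `None` fields via `model_dump(exclude_none=True)`, so merging only overrides what the other config explicitly sets. A minimal sketch of that behavior (illustrative only; the field values below are made up):

```python
from scenario import ScenarioConfig

base = ScenarioConfig(max_turns=10, cache_key="run-1")
override = ScenarioConfig(max_turns=5)  # cache_key stays None here

merged = base.merge(override)
# Fields set on `override` win; its None fields fall back to `base`.
assert merged.max_turns == 5
assert merged.cache_key == "run-1"
```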
langwatch_scenario-0.3.0/scenario/error_messages.py ADDED
@@ -0,0 +1,134 @@
+from textwrap import indent
+from typing import Any
+import termcolor
+
+
+default_config_error_message = f"""
+
+{termcolor.colored("->", "cyan")} Please set a default config with at least a testing_agent model for running your scenarios at the top of your test file, for example:
+
+    from scenario import Scenario, TestingAgent
+
+    Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
+    {termcolor.colored("^" * 74, "green")}
+
+    @pytest.mark.agent_test
+    def test_vegetarian_recipe_agent():
+        scenario = Scenario(
+            # ...
+        )
+        result = scenario.run()
+
+        assert result.success
+
+
+{termcolor.colored("->", "cyan")} Alternatively, you can set the config specifically for this scenario:
+
+    from scenario import Scenario, TestingAgent
+
+    @pytest.mark.agent_test
+    def test_vegetarian_recipe_agent():
+        scenario = Scenario(
+            # ...
+            testing_agent=TestingAgent(model="openai/gpt-4o-mini")
+            {termcolor.colored("^" * 54, "green")}
+        )
+        result = scenario.run()
+
+        assert result.success
+"""
+
+
+testing_agent_not_configured_error_message = f"""
+
+{termcolor.colored("->", "cyan")} Testing agent was initialized without a model, please set the model when defining the testing agent, for example:
+
+    TestingAgent.with_config(model="openai/gpt-4.1-mini")
+    {termcolor.colored("^" * 53, "green")}
+"""
+
+
+def message_return_error_message(got: Any, class_name: str):
+    got_ = repr(got)
+    if len(got_) > 100:
+        got_ = got_[:100] + "..."
+
+    return f"""
+{termcolor.colored("->", "cyan")} On the {termcolor.colored("call", "green")} method of the {class_name} agent adapter, you returned:
+
+{indent(got_, ' ' * 4)}
+
+{termcolor.colored("->", "cyan")} But the adapter should return either a string, a dict on the OpenAI messages format, or a list of messages in the OpenAI messages format so the testing agent can understand what happened. For example:
+
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+
+            return response.output_text
+            {termcolor.colored("^" * 27, "green")}
+
+{termcolor.colored("->", "cyan")} Alternatively, you can return a list of messages in OpenAI messages format, this is useful for capturing tool calls and other before the final response:
+
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+
+            return [
+                {{"role": "assistant", "content": response.output_text}},
+                {termcolor.colored("^" * 55, "green")}
+            ]
+"""
+
+
+def message_invalid_agent_type(got: Any):
+    got_ = repr(got)
+    if len(got_) > 100:
+        got_ = got_[:100] + "..."
+
+    return f"""
+{termcolor.colored("->", "cyan")} The {termcolor.colored("agent", "green")} argument of Scenario needs to receive a class that inherits from {termcolor.colored("ScenarioAgentAdapter", "green")}, but you passed:
+
+{indent(got_, ' ' * 4)}
+
+{termcolor.colored("->", "cyan")} Instead, wrap your agent in a ScenarioAgentAdapter subclass. For example:
+
+    class MyAgentAdapter(ScenarioAgentAdapter):
+    {termcolor.colored("^" * 43, "green")}
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+
+            return response.output_text
+
+{termcolor.colored("->", "cyan")} And then you can use that on your scenario definition:
+
+    @pytest.mark.agent_test
+    def test_my_agent():
+        scenario = Scenario(
+            name="first scenario",
+            description=\"\"\"
+                Example scenario description to test your agent.
+            \"\"\",
+            agent=MyAgentAdapter,
+            {termcolor.colored("^" * 20, "green")}
+            criteria=[
+                "Requirement One",
+                "Requirement Two",
+            ],
+        )
+        result = scenario.run()
+
+        assert result.success
+"""
+
+
+def agent_response_not_awaitable(class_name: str):
+    return f"""
+{termcolor.colored("->", "cyan")} The {termcolor.colored("call", "green")} method of the {class_name} agent adapter returned a non-awaitable response, you probably forgot to add the {termcolor.colored("async", "green")} keyword to the method definition, make sure your code looks like this:
+
+    class {class_name}(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+        {termcolor.colored("^" * 5, "green")}
+            response = call_my_agent(message)
+
+            return response.output_text
+"""
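The messages above spell out the contract for `ScenarioAgentAdapter.call`: it must be `async` and return a string, an OpenAI-format message dict, or a list of such messages. A hedged sketch of the three accepted shapes (the adapter classes and reply text below are made up for illustration):

```python
from scenario import ScenarioAgentAdapter, AgentInput, AgentReturnTypes


class StringReplyAdapter(ScenarioAgentAdapter):
    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # 1. A plain string is taken as the assistant's reply
        return "Here you go: a quick veggie stir-fry."


class MessageReplyAdapter(ScenarioAgentAdapter):
    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # 2. A single message dict in the OpenAI messages format
        return {"role": "assistant", "content": "Here you go: a quick veggie stir-fry."}


class MessageListReplyAdapter(ScenarioAgentAdapter):
    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # 3. A list of OpenAI-format messages, useful for surfacing tool calls
        #    and intermediate steps before the final answer
        return [
            {"role": "assistant", "content": "Let me look up a recipe."},
            {"role": "assistant", "content": "Here you go: a quick veggie stir-fry."},
        ]
```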