langwatch_scenario-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch_scenario-0.1.0.dist-info/METADATA +192 -0
- langwatch_scenario-0.1.0.dist-info/RECORD +15 -0
- langwatch_scenario-0.1.0.dist-info/WHEEL +5 -0
- langwatch_scenario-0.1.0.dist-info/entry_points.txt +2 -0
- langwatch_scenario-0.1.0.dist-info/top_level.txt +1 -0
- scenario/__init__.py +26 -0
- scenario/cache.py +62 -0
- scenario/config.py +28 -0
- scenario/error_messages.py +66 -0
- scenario/pytest_plugin.py +161 -0
- scenario/result.py +81 -0
- scenario/scenario.py +117 -0
- scenario/scenario_executor.py +204 -0
- scenario/testing_agent.py +262 -0
- scenario/utils.py +121 -0
langwatch_scenario-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,192 @@
Metadata-Version: 2.4
Name: langwatch-scenario
Version: 0.1.0
Summary: The end-to-end agent testing library
Author-email: LangWatch Team <support@langwatch.ai>
License: MIT
Project-URL: Homepage, https://github.com/langwatch/scenario
Project-URL: Bug Tracker, https://github.com/langwatch/scenario/issues
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: pytest>=8.1.1
Requires-Dist: litellm>=1.49.0
Requires-Dist: python-dotenv>=1.0.1
Requires-Dist: termcolor>=2.4.0
Requires-Dist: pydantic>=2.7.0
Requires-Dist: joblib>=1.4.2
Requires-Dist: wrapt>=1.17.2
Requires-Dist: pytest-asyncio>=0.26.0
Requires-Dist: rich>=14.0.0
Provides-Extra: dev
Requires-Dist: black; extra == "dev"
Requires-Dist: isort; extra == "dev"
Requires-Dist: mypy; extra == "dev"
Requires-Dist: pytest-cov; extra == "dev"

<div align="center">
<!-- Discord, PyPI, Docs, etc links -->
</div>

# Scenario: Use an Agent to test your Agent

Scenario is a library for testing agents end-to-end as a human would, but without having to manually do it. The automated testing agent covers every single scenario for you.

You define the scenarios, and the testing agent simulates your users as it follows them, chatting with and evaluating your agent until it reaches the desired goal or detects unexpected behavior.

## Getting Started

Install pytest and scenario:

```bash
pip install pytest langwatch-scenario
```

Now create your first scenario:

```python
import pytest

from scenario import Scenario, TestingAgent

Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))


@pytest.mark.agent_test
@pytest.mark.asyncio
async def test_vegetarian_recipe_agent():
    def vegetarian_recipe_agent(message, context):
        # Call your agent here
        response = "<Your agent's response>"

        return {"message": response}

    scenario = Scenario(
        "User is looking for a dinner idea",
        agent=vegetarian_recipe_agent,
        success_criteria=[
            "Recipe agent generates a vegetarian recipe",
            "Recipe includes step-by-step cooking instructions",
        ],
        failure_criteria=[
            "The recipe includes meat",
            "The agent asks more than two follow-up questions",
        ],
    )

    result = await scenario.run()

    assert result.success
```

Save it as `tests/test_vegetarian_recipe_agent.py` and run it with pytest:

```bash
pytest -s tests/test_vegetarian_recipe_agent.py
```

Once you connect the callback to a real agent, this is how it will look:

[](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

You can find a fully working example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).

## Customize strategy and max_turns

You can customize how the testing agent goes about testing by defining a `strategy` field. You can also limit the maximum number of turns the scenario will take by setting the `max_turns` field (defaults to 10).

For example, in this Lovable Clone scenario test:

```python
scenario = Scenario(
    "user wants to create a new landing page for their dog walking startup",
    agent=lovable_agent,
    strategy="send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict",
    success_criteria=[
        "agent reads the files before go and making changes",
        "agent modified the index.css file",
        "agent modified the Index.tsx file",
        "agent created a comprehensive landing page",
        "agent extended the landing page with a new section",
    ],
    failure_criteria=[
        "agent says it can't read the file",
        "agent produces incomplete code or is too lazy to finish",
    ],
    max_turns=5,
)

result = await scenario.run()
```

You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).

## Debug mode

You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running.

Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

```python
Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
```
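
To enable it for a single scenario only, a minimal sketch of the per-scenario form, reusing the vegetarian recipe example above (`debug` is a regular scenario field, so it can be passed directly):

```python
scenario = Scenario(
    "User is looking for a dinner idea",
    agent=vegetarian_recipe_agent,
    success_criteria=["Recipe agent generates a vegetarian recipe"],
    debug=True,  # step through this scenario only, leaving the global config untouched
)
```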

## Cache

Each time the scenario runs, the testing agent might choose a different input to start. This is good for covering the variance of real users, but the non-deterministic nature of it can make tests less repeatable, more costly, and harder to debug. To solve this, you can set the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running; this makes the testing agent give the same input for the same scenario:

```python
Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), cache_key="42")
```

To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.
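
For example, a minimal sketch of clearing or relocating the cache; the `SCENARIO_CACHE_DIR` environment variable is read by `scenario/cache.py`, shown further down in this diff:

```bash
# Wipe the on-disk cache entirely
rm -rf ~/.scenario/cache

# Or point the cache at a throwaway location for a single run
SCENARIO_CACHE_DIR=/tmp/scenario-cache pytest -s tests/test_vegetarian_recipe_agent.py
```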

To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions on your application side with the `@scenario_cache` decorator:

```python
class MyAgent:
    @scenario_cache(ignore=["self"])
    def invoke(self, message, context):
        return client.chat.completions.create(
            # ...
        )
```

This will cache any function call you decorate when running the tests and make them repeatable, hashed by the function arguments, the scenario being executed, and the `cache_key` you provided. You can exclude arguments that should not be hashed for the cache key by naming them in the `ignore` argument.
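
For instance, a minimal sketch of excluding an additional non-deterministic argument; the `request_id` parameter here is hypothetical, purely for illustration:

```python
class MyAgent:
    @scenario_cache(ignore=["self", "request_id"])
    def invoke(self, message, context, request_id):
        # request_id (hypothetical) changes on every call, so it is kept out of the cache key
        return client.chat.completions.create(
            # ...
        )
```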

## Disable Output

You can remove the `-s` flag from pytest to hide the output during the test; it will then only show up if the test fails. Alternatively, you can set `verbose=False` in the `Scenario.configure` method or in the specific scenario you are running.
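
For example, a minimal sketch mirroring the configuration calls above:

```python
Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), verbose=False)
```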

## Running in parallel

As the number of your scenarios grows, you might want to run them in parallel to speed up your whole test suite. We suggest using the [pytest-asyncio-concurrent](https://pypi.org/project/pytest-asyncio-concurrent/) plugin to do so.

Simply install the plugin from the link above, then replace the `@pytest.mark.asyncio` annotation in the tests with `@pytest.mark.asyncio_concurrent`, adding a group name to it to mark the group of scenarios that should be run in parallel together, e.g.:

```python
@pytest.mark.agent_test
@pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
async def test_vegetarian_recipe_agent():
    # ...


@pytest.mark.agent_test
@pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
async def test_user_is_very_hungry():
    # ...
```

Those two scenarios should now run in parallel.

## License

MIT License
langwatch_scenario-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
scenario/error_messages.py,sha256=8bTwG_iKz7FjGp50FU0anQ1fmI6eJE4NeaoXtiifbBg,2099
scenario/pytest_plugin.py,sha256=ydtQxaN09qzoo12nNT8BQY_UPPHAt-AH92HWnPEN6bI,5212
scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
scenario/scenario.py,sha256=3gOTeEOsV7EWmhEUfYnWsdFD-px1JUYnMEixicQrTqY,4009
scenario/scenario_executor.py,sha256=bDzoatslbp80dG6DU-i2VUlOa9SMtyw2VIhcF7knwis,7883
scenario/testing_agent.py,sha256=wMK2GqmN4QDr0kFoxgqcAPsU6gjCx8HBJQv1wmsdSb4,10683
scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
langwatch_scenario-0.1.0.dist-info/METADATA,sha256=XlfUHfHDjLKswH8Sq4tuc8nJA-yrs_bCV2h3eXpDM7E,7435
langwatch_scenario-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
langwatch_scenario-0.1.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
langwatch_scenario-0.1.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
langwatch_scenario-0.1.0.dist-info/RECORD,,
langwatch_scenario-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
scenario
scenario/__init__.py
ADDED
@@ -0,0 +1,26 @@
"""
Scenario: A testing library for conversational agents.
"""

# First import non-dependent modules
from .result import ScenarioResult
from .config import ScenarioConfig

# Then import modules with dependencies
from .testing_agent import TestingAgent
from .scenario import Scenario
from .cache import scenario_cache

# Import pytest plugin components
from .pytest_plugin import pytest_configure, scenario_reporter

__all__ = [
    "Scenario",
    "TestingAgent",
    "ScenarioResult",
    "ScenarioConfig",
    "pytest_configure",
    "scenario_reporter",
    "scenario_cache",
]
__version__ = "0.1.0"
scenario/cache.py
ADDED
@@ -0,0 +1,62 @@
from contextvars import ContextVar
import inspect
import os
from pathlib import Path
from typing import Callable, TYPE_CHECKING
from joblib import Memory

import json

import wrapt
from scenario.utils import SerializableWithStringFallback

if TYPE_CHECKING:
    from scenario.scenario import Scenario


context_scenario = ContextVar("scenario")

def get_cache() -> Memory:
    """Get a cross-platform cache directory for scenario."""
    home_dir = str(Path.home())
    cache_dir = os.path.join(home_dir, ".scenario", "cache")

    return Memory(location=os.environ.get("SCENARIO_CACHE_DIR", cache_dir), verbose=0)

memory = get_cache()

def scenario_cache(ignore=[]):
    @wrapt.decorator
    def wrapper(wrapped: Callable, instance=None, args=[], kwargs={}):
        scenario: "Scenario" = context_scenario.get()

        if not scenario.cache_key:
            return wrapped(*args, **kwargs)

        sig = inspect.signature(wrapped)
        parameters = list(sig.parameters.values())

        all_args = {
            str(parameter.name): value for parameter, value in zip(parameters, args)
        }
        for arg in ["self"] + ignore:
            if arg in all_args:
                del all_args[arg]

        cache_key = json.dumps(
            {
                "cache_key": scenario.cache_key,
                "scenario": scenario.model_dump(exclude={"agent"}),
                "all_args": all_args,
            },
            cls=SerializableWithStringFallback,
        )

        return _cached_call(wrapped, args, kwargs, cache_key=cache_key)

    return wrapper


@memory.cache(ignore=["func", "args", "kwargs"])
def _cached_call(func: Callable, args, kwargs, cache_key):
    return func(*args, **kwargs)
scenario/config.py
ADDED
@@ -0,0 +1,28 @@
"""
Configuration module for Scenario.
"""

from typing import Optional, Union
from pydantic import BaseModel

from scenario.testing_agent import TestingAgent

class ScenarioConfig(BaseModel):
    """
    Configuration class for the Scenario library.

    This allows users to set global configuration settings for the library,
    such as the LLM provider and model to use for the testing agent.
    """

    testing_agent: Optional[TestingAgent] = None
    max_turns: Optional[int] = 10
    verbose: Optional[Union[bool, int]] = True
    cache_key: Optional[str] = None
    debug: Optional[bool] = False

    def merge(self, other: "ScenarioConfig") -> "ScenarioConfig":
        return ScenarioConfig(**{
            **self.model_dump(),
            **other.model_dump(exclude_none=True),
        })
scenario/error_messages.py
ADDED
@@ -0,0 +1,66 @@
import termcolor


default_config_error_message = f"""

{termcolor.colored("->", "cyan")} Please set a default config with at least a testing_agent model for running your scenarios at the top of your test file, for example:

from scenario import Scenario, TestingAgent

Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
{termcolor.colored("^" * 74, "green")}

@pytest.mark.agent_test
def test_vegetarian_recipe_agent():
    scenario = Scenario(
        # ...
    )
    result = scenario.run()

    assert result.success


{termcolor.colored("->", "cyan")} Alternatively, you can set the config specifically for this scenario:

from scenario import Scenario, TestingAgent

@pytest.mark.agent_test
def test_vegetarian_recipe_agent():
    scenario = Scenario(
        # ...
        testing_agent=TestingAgent(model="openai/gpt-4o-mini")
        {termcolor.colored("^" * 54, "green")}
    )
    result = scenario.run()

    assert result.success
"""


message_return_error_message = f"""

{termcolor.colored("->", "cyan")} Your agent should return a dict with either a "message" string key or a "messages" key in OpenAI messages format so the testing agent can understand what happened. For example:

def my_agent_under_test(message, context):
    response = call_my_agent(message)

    return {{
        "message": response.output_text
        {termcolor.colored("^" * 31, "green")}
    }}

{termcolor.colored("->", "cyan")} Alternatively, you can return a list of messages in OpenAI messages format, you can also optionally provide extra artifacts:

def my_agent_under_test(message, context):
    response = call_my_agent(message)

    return {{
        "messages": [
            {{"role": "assistant", "content": response}}
            {termcolor.colored("^" * 42, "green")}
        ],
        "extra": {{
            # ... optional extra artifacts
        }}
    }}
"""
scenario/pytest_plugin.py
ADDED
@@ -0,0 +1,161 @@
"""
Pytest plugin for Scenario testing library.
"""

import pytest
from typing import TypedDict
import functools
from termcolor import colored

from scenario.result import ScenarioResult

from .scenario import Scenario

class ScenarioReporterResults(TypedDict):
    scenario: Scenario
    result: ScenarioResult

# ScenarioReporter class definition moved outside the fixture for global use
class ScenarioReporter:
    def __init__(self):
        self.results : list[ScenarioReporterResults] = []

    def add_result(self, scenario, result):
        """Add a test result to the reporter."""
        self.results.append({"scenario": scenario, "result": result})

    def get_summary(self):
        """Get a summary of all test results."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r["result"].success)
        failed = total - passed

        return {
            "total": total,
            "passed": passed,
            "failed": failed,
            "success_rate": round(passed / total * 100, 2) if total else 0,
        }

    def print_report(self):
        """Print a detailed report of all test results."""
        if not self.results:
            return  # Skip report if no results

        summary = self.get_summary()

        print("\n" + colored("=== Scenario Test Report ===", "cyan", attrs=["bold"]))
        print(colored(f"Total Scenarios: {summary['total']}", "white"))
        print(
            colored(
                f"Passed: {summary['passed']}",
                "green" if summary["passed"] > 0 else "white",
            )
        )
        print(
            colored(
                f"Failed: {summary['failed']}",
                "red" if summary["failed"] > 0 else "white",
            )
        )

        # Color the success rate based on its value
        success_rate = summary["success_rate"]
        rate_color = (
            "green"
            if success_rate == 100
            else "yellow" if success_rate >= 70 else "red"
        )
        print(colored(f"Success Rate: {success_rate}%", rate_color))

        for idx, item in enumerate(self.results, 1):
            scenario = item["scenario"]
            result = item["result"]

            status = "PASSED" if result.success else "FAILED"
            status_color = "green" if result.success else "red"

            time = ""
            if result.total_time and result.agent_time:
                time = f" in {result.total_time:.2f}s (agent: {result.agent_time:.2f}s)"

            print(
                f"\n{idx}. {scenario.description} - {colored(status, status_color, attrs=['bold'])}{time}"
            )

            print(colored(f" Reasoning: {result.reasoning}", "green" if result.success else "red"))

            if hasattr(result, "met_criteria") and result.met_criteria:
                criteria_count = len(result.met_criteria)
                total_criteria = len(scenario.success_criteria)
                criteria_color = (
                    "green" if criteria_count == total_criteria else "yellow"
                )
                print(
                    colored(
                        f" Success Criteria: {criteria_count}/{total_criteria}",
                        criteria_color,
                    )
                )

            if hasattr(result, "triggered_failures") and result.triggered_failures:
                print(
                    colored(
                        f" Failures Criteria: {len(result.triggered_failures)}",
                        "red",
                    )
                )


# Store the original run method
original_run = Scenario.run


@pytest.hookimpl(trylast=True)
def pytest_configure(config):
    """Register the agent_test marker and set up automatic reporting."""
    # Register the marker
    config.addinivalue_line(
        "markers", "agent_test: mark test as an agent scenario test"
    )

    # Create a global reporter instance
    config._scenario_reporter = ScenarioReporter()

    # Create a patched version of Scenario.run that auto-reports
    @functools.wraps(original_run)
    async def auto_reporting_run(self, *args, **kwargs):
        result = await original_run(self, *args, **kwargs)

        # Always report to the global reporter
        config._scenario_reporter.add_result(self, result)

        return result

    # Apply the patch
    Scenario.run = auto_reporting_run


@pytest.hookimpl(trylast=True)
def pytest_unconfigure(config):
    """Clean up and print final report when pytest exits."""
    # Print the final report
    if hasattr(config, "_scenario_reporter"):
        config._scenario_reporter.print_report()

    # Restore the original method
    Scenario.run = original_run


@pytest.fixture
def scenario_reporter(request):
    """
    A pytest fixture for accessing the global scenario reporter.

    This fixture provides access to the same reporter that's used for automatic
    reporting, allowing tests to explicitly interact with the reporter if needed.
    """
    # Get the global reporter from pytest config
    reporter = request.config._scenario_reporter
    yield reporter
    # No need to print report here as it's handled by pytest_unconfigure
scenario/result.py
ADDED
@@ -0,0 +1,81 @@
"""
Result module: defines the class for scenario test results.
"""

from dataclasses import dataclass, field
from typing import List, Dict, Optional


@dataclass
class ScenarioResult:
    """
    Represents the results of a scenario test run.

    Attributes:
        success: Whether the scenario passed
        conversation: The conversation history
        reasoning: Reasoning for the result
        met_criteria: List of success criteria that were met
        unmet_criteria: List of success criteria that were not met
        triggered_failures: List of failure criteria that were triggered
    """

    success: bool
    conversation: List[Dict[str, str]]
    reasoning: Optional[str] = None
    met_criteria: List[str] = field(default_factory=list)
    unmet_criteria: List[str] = field(default_factory=list)
    triggered_failures: List[str] = field(default_factory=list)
    total_time: Optional[float] = None
    agent_time: Optional[float] = None

    def __post_init__(self) -> None:
        """Validate the result after initialization."""
        if not self.success and not self.reasoning:
            raise ValueError("Failed scenarios must have a reasoning")

    @classmethod
    def success_result(
        cls,
        conversation: List[Dict[str, str]],
        reasoning: Optional[str],
        met_criteria: List[str],
        total_time: Optional[float] = None,
        agent_time: Optional[float] = None,
    ) -> "ScenarioResult":
        """Create a successful result."""
        return cls(
            success=True,
            conversation=conversation,
            reasoning=reasoning,
            met_criteria=met_criteria,
            unmet_criteria=[],
            triggered_failures=[],
            total_time=total_time,
            agent_time=agent_time,
        )

    @classmethod
    def failure_result(
        cls,
        conversation: List[Dict[str, str]],
        reasoning: str,
        met_criteria: Optional[List[str]] = None,
        unmet_criteria: Optional[List[str]] = None,
        triggered_failures: Optional[List[str]] = None,
        total_time: Optional[float] = None,
        agent_time: Optional[float] = None,
    ) -> "ScenarioResult":
        """Create a failed result."""
        return cls(
            success=False,
            conversation=conversation,
            reasoning=reasoning,
            met_criteria=met_criteria if met_criteria is not None else [],
            unmet_criteria=unmet_criteria if unmet_criteria is not None else [],
            triggered_failures=(
                triggered_failures if triggered_failures is not None else []
            ),
            total_time=total_time,
            agent_time=agent_time,
        )
scenario/scenario.py
ADDED
@@ -0,0 +1,117 @@
"""
Scenario module: defines the core Scenario class for agent testing.
"""

from typing import Awaitable, List, Dict, Any, Optional, Callable, TypedDict, Union
import asyncio
import concurrent.futures
from functools import partial

from scenario.config import ScenarioConfig
from scenario.scenario_executor import ScenarioExecutor

from .result import ScenarioResult
from .testing_agent import TestingAgent

from openai.types.chat import ChatCompletionMessageParam

class AgentResult(TypedDict, total=False):
    message: str
    messages: List[ChatCompletionMessageParam]
    extra: Dict[str, Any]


class Scenario(ScenarioConfig):
    """
    A scenario represents a specific testing case for an agent.

    It includes:
    - A description of the scenario
    - Success criteria to determine if the agent behaved correctly
    - Failure criteria to determine if the agent failed
    - An optional strategy that guides the testing agent
    - Optional additional parameters
    """

    description: str
    agent: Union[
        Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
        Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
    ]
    success_criteria: List[str]
    failure_criteria: List[str] = []
    strategy: Optional[str] = None

    def __init__(self, description: str, **kwargs):
        """Validate scenario configuration after initialization."""
        if not description:
            raise ValueError("Scenario description cannot be empty")
        kwargs["description"] = description

        if not kwargs.get("success_criteria"):
            raise ValueError("Scenario must have at least one success criterion")

        if kwargs.get("max_turns", 0) < 1:
            raise ValueError("max_turns must be a positive integer")

        # Ensure agent is callable
        if not callable(kwargs.get("agent")):
            raise ValueError("Agent must be a callable function")

        default_config = getattr(Scenario, "default_config", None)
        if default_config:
            kwargs = {**default_config.model_dump(), **kwargs}

        super().__init__(**kwargs)


    async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
        """
        Run the scenario against the agent under test.

        Args:
            context: Optional initial context for the agent

        Returns:
            ScenarioResult containing the test outcome
        """

        # We'll use a thread pool to run the execution logic, we
        # require a separate thread because even though asyncio is
        # being used throughout, any user code on the callback can
        # be blocking, preventing them from running scenarios in parallel
        with concurrent.futures.ThreadPoolExecutor() as executor:
            def run_in_thread():
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

                try:
                    return loop.run_until_complete(ScenarioExecutor(self).run(context))
                finally:
                    loop.close()

            # Run the function in the thread pool and await its result
            # This converts the thread's execution into a Future that the current
            # event loop can await without blocking
            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(executor, run_in_thread)
            return result

    @classmethod
    def configure(
        cls,
        testing_agent: Optional[TestingAgent] = None,
        max_turns: Optional[int] = None,
        verbose: Optional[Union[bool, int]] = None,
        cache_key: Optional[str] = None,
    ) -> None:
        existing_config = getattr(cls, "default_config", ScenarioConfig())

        cls.default_config = existing_config.merge(
            ScenarioConfig(
                testing_agent=testing_agent,
                max_turns=max_turns,
                verbose=verbose,
                cache_key=cache_key,
            )
        )
scenario/scenario_executor.py
ADDED
@@ -0,0 +1,204 @@
"""
ScenarioExecutor module: holds the scenario execution logic and state, orchestrating the conversation between the testing agent and the agent under test.
"""

import json
import sys
from typing import TYPE_CHECKING, Awaitable, Dict, List, Any, Optional, Union
import time
import termcolor

from scenario.error_messages import message_return_error_message
from scenario.utils import print_openai_messages, safe_attr_or_key, safe_list_at, show_spinner
from openai.types.chat import ChatCompletionMessageParam

from .result import ScenarioResult
from .error_messages import default_config_error_message
from .cache import context_scenario

if TYPE_CHECKING:
    from scenario.scenario import Scenario



class ScenarioExecutor:
    def __init__(self, scenario: "Scenario"):
        self.scenario = scenario.model_copy()

        testing_agent = scenario.testing_agent
        if not testing_agent or not testing_agent.model:
            raise Exception(default_config_error_message)
        self.testing_agent = testing_agent

        self.conversation: List[Dict[str, Any]] = []

    async def run(
        self,
        context: Optional[Dict[str, Any]] = None,
    ) -> ScenarioResult:
        """
        Run a scenario against the agent under test.

        Args:
            context: Optional initial context for the agent

        Returns:
            ScenarioResult containing the test outcome
        """

        if self.scenario.verbose:
            print("")  # new line

        # Run the initial testing agent prompt to get started
        total_start_time = time.time()
        context_scenario.set(self.scenario)
        initial_message = self._generate_next_message(
            self.scenario, self.conversation, first_message=True
        )

        if isinstance(initial_message, ScenarioResult):
            raise Exception(
                "Unexpectedly generated a ScenarioResult for the initial message",
                initial_message.__repr__(),
            )
        elif self.scenario.verbose:
            print(self._scenario_name() + termcolor.colored("User:", "green"), initial_message)

        # Execute the conversation
        current_turn = 0
        max_turns = self.scenario.max_turns or 10
        agent_time = 0

        # Start the test with the initial message
        while current_turn < max_turns:
            # Record the testing agent's message
            self.conversation.append({"role": "user", "content": initial_message})

            # Get response from the agent under test
            start_time = time.time()

            context_scenario.set(self.scenario)
            with show_spinner(text="Agent:", color="blue", enabled=self.scenario.verbose):
                agent_response = self.scenario.agent(initial_message, context)
                if isinstance(agent_response, Awaitable):
                    agent_response = await agent_response

            has_valid_message = (
                "message" in agent_response
                and type(agent_response["message"]) is str
                and agent_response["message"] is not None
            )
            has_valid_messages = (
                "messages" in agent_response
                and isinstance(agent_response["messages"], list)
                and all(
                    "role" in msg or hasattr(msg, "role")
                    for msg in agent_response["messages"]
                )
            )
            if not has_valid_message and not has_valid_messages:
                raise Exception(message_return_error_message)

            messages: list[ChatCompletionMessageParam] = []
            if has_valid_messages:
                messages = agent_response["messages"]

                # Drop the first messages both if they are system or user messages
                if safe_attr_or_key(safe_list_at(messages, 0), "role") == "system":
                    messages = messages[1:]
                if safe_attr_or_key(safe_list_at(messages, 0), "role") == "user":
                    messages = messages[1:]

            if has_valid_message and self.scenario.verbose:
                print(self._scenario_name(), termcolor.colored("Agent:", "blue"), agent_response["message"])

            if messages and self.scenario.verbose:
                print_openai_messages(self._scenario_name(), messages)

            if (
                self.scenario.verbose
                and "extra" in agent_response
                and len(agent_response["extra"].keys()) > 0
            ):
                print(
                    termcolor.colored(
                        "Extra:" + json.dumps(agent_response["extra"]),
                        "magenta",
                    )
                )
            response_time = time.time() - start_time
            agent_time += response_time

            if messages:
                self.conversation.extend(agent_response["messages"])
            if "message" in agent_response:
                self.conversation.append(
                    {"role": "assistant", "content": agent_response["message"]}
                )
            if "extra" in agent_response:
                self.conversation.append(
                    {
                        "role": "assistant",
                        "content": json.dumps(agent_response["extra"]),
                    }
                )

            # Generate the next message OR finish the test based on the agent's evaluation
            result = self._generate_next_message(
                self.scenario,
                self.conversation,
                last_message=current_turn == max_turns - 1,
            )

            # Check if the result is a ScenarioResult (indicating test completion)
            if isinstance(result, ScenarioResult):
                result.total_time = time.time() - start_time
                result.agent_time = agent_time
                return result
            elif self.scenario.verbose:
                print(self._scenario_name() + termcolor.colored("User:", "green"), result)

            # Otherwise, it's the next message to send to the agent
            initial_message = result

            # Increment turn counter
            current_turn += 1

        # If we reached max turns without conclusion, fail the test
        return ScenarioResult.failure_result(
            conversation=self.conversation,
            reasoning=f"Reached maximum turns ({max_turns}) without conclusion",
            total_time=time.time() - total_start_time,
            agent_time=agent_time,
        )

    def _generate_next_message(
        self,
        scenario: "Scenario",
        conversation: List[Dict[str, Any]],
        first_message: bool = False,
        last_message: bool = False,
    ) -> Union[str, ScenarioResult]:
        if self.scenario.debug:
            print(f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send")
            input_message = input(self._scenario_name() + termcolor.colored('User: ', 'green'))

            # Clear the input prompt lines completely
            for _ in range(3):
                sys.stdout.write("\033[F")  # Move up to the input line
                sys.stdout.write("\033[2K")  # Clear the entire input line
            sys.stdout.flush()  # Make sure the clearing is visible

            if input_message:
                return input_message

        with show_spinner(text=f"{self._scenario_name()}User:", color="green", enabled=self.scenario.verbose):
            return self.testing_agent.generate_next_message(
                scenario, conversation, first_message, last_message
            )

    def _scenario_name(self):
        if self.scenario.verbose == 2:
            return termcolor.colored(f"[Scenario: {self.scenario.description}] ", "yellow")
        else:
            return ""
scenario/testing_agent.py
ADDED
@@ -0,0 +1,262 @@
"""
TestingAgent module: defines the testing agent that interacts with the agent under test.
"""

import json
import logging
from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
from pydantic import BaseModel

from litellm import Choices, completion
from litellm.files.main import ModelResponse

from scenario.cache import scenario_cache
from scenario.utils import safe_attr_or_key

from .result import ScenarioResult

if TYPE_CHECKING:
    from scenario.scenario import Scenario


logger = logging.getLogger("scenario")


class TestingAgent(BaseModel):
    """
    The Testing Agent that interacts with the agent under test.

    This agent is responsible for:
    1. Generating messages to send to the agent based on the scenario
    2. Evaluating the responses from the agent against the success/failure criteria
    3. Determining when to end the test and return a result
    """

    model: str
    api_key: Optional[str] = None
    temperature: float = 0.0
    max_tokens: Optional[int] = None

    # To prevent pytest from thinking this is actually a test class
    __test__ = False

    @scenario_cache(ignore=["scenario"])
    def generate_next_message(
        self,
        scenario: "Scenario",
        conversation: List[Dict[str, Any]],
        first_message: bool = False,
        last_message: bool = False,
    ) -> Union[str, ScenarioResult]:
        """
        Generate the next message in the conversation based on history OR
        return a ScenarioResult if the test should conclude.

        Returns either:
        - A string message to send to the agent (if conversation should continue)
        - A ScenarioResult (if the test should conclude)
        """

        messages = [
            {
                "role": "system",
                "content": f"""
<role>
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
</role>

<goal>
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
</goal>

<scenario>
{scenario.description}
</scenario>

<strategy>
{scenario.strategy or "Start with a first message and guide the conversation to play out the scenario."}
</strategy>

<success_criteria>
{json.dumps(scenario.success_criteria, indent=2)}
</success_criteria>

<failure_criteria>
{json.dumps(scenario.failure_criteria, indent=2)}
</failure_criteria>

<execution_flow>
1. Generate the first message to start the scenario
2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
3. If the test should end, use the finish_test tool to determine if success or failure criteria have been met
</execution_flow>

<rules>
1. Test should end immediately if a failure criteria is triggered
2. Test should continue until all success criteria have been met
3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
</rules>
""",
            },
            {"role": "assistant", "content": "Hello, how can I help you today?"},
            *conversation,
        ]

        if last_message:
            messages.append(
                {
                    "role": "user",
                    "content": """
System:

<finish_test>
This is the last message, conversation has reached the maximum number of turns, give your final verdict,
if you don't have enough information to make a verdict, say inconclusive with max turns reached.
</finish_test>
""",
                }
            )

        # User to assistant role reversal
        # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
        # super confused, and Claude 3.7 even starts throwing exceptions.
        for message in messages:
            # Can't reverse tool calls
            if not safe_attr_or_key(message, "content") or safe_attr_or_key(
                message, "tool_calls"
            ):
                continue

            if type(message) == dict:
                if message["role"] == "user":
                    message["role"] = "assistant"
                elif message["role"] == "assistant":
                    message["role"] = "user"
            else:
                if getattr(message, "role", None) == "user":
                    message.role = "assistant"
                elif getattr(message, "role", None) == "assistant":
                    message.role = "user"

        # Define the tool
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "finish_test",
                    "description": "Complete the test with a final verdict",
                    "strict": True,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "verdict": {
                                "type": "string",
                                "enum": ["success", "failure", "inconclusive"],
                                "description": "The final verdict of the test",
                            },
                            "reasoning": {
                                "type": "string",
                                "description": "Explanation of why this verdict was reached",
                            },
                            "details": {
                                "type": "object",
                                "properties": {
                                    "met_criteria": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                        "description": "List of success criteria that have been met",
                                    },
                                    "unmet_criteria": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                        "description": "List of success criteria that have not been met",
                                    },
                                    "triggered_failures": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                        "description": "List of failure criteria that have been triggered",
                                    },
                                },
                                "required": ["met_criteria", "unmet_criteria", "triggered_failures"],
                                "additionalProperties": False,
                                "description": "Detailed information about criteria evaluation",
                            },
                        },
                        "required": ["verdict", "reasoning", "details"],
                        "additionalProperties": False,
                    },
                },
            }
        ]

        response = cast(
            ModelResponse,
            completion(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                tools=tools if not first_message else None,
                tool_choice="required" if last_message else None,
            ),
        )

        # Extract the content from the response
        if hasattr(response, "choices") and len(response.choices) > 0:
            message = cast(Choices, response.choices[0]).message

            # Check if the LLM chose to use the tool
            if message.tool_calls:
                tool_call = message.tool_calls[0]
                if tool_call.function.name == "finish_test":
                    # Parse the tool call arguments
                    try:
                        args = json.loads(tool_call.function.arguments)
                        verdict = args.get("verdict", "inconclusive")
                        reasoning = args.get("reasoning", "No reasoning provided")
                        details = args.get("details", {})

                        met_criteria = details.get("met_criteria", [])
                        unmet_criteria = details.get("unmet_criteria", [])
                        triggered_failures = details.get("triggered_failures", [])

                        # Return the appropriate ScenarioResult based on the verdict
                        if verdict == "success":
                            return ScenarioResult.success_result(
                                conversation=conversation,
                                reasoning=reasoning,
                                met_criteria=met_criteria,
                            )
                        elif verdict == "failure":
                            return ScenarioResult.failure_result(
                                conversation=conversation,
                                reasoning=reasoning,
                                met_criteria=met_criteria,
                                unmet_criteria=unmet_criteria,
                                triggered_failures=triggered_failures,
                            )
                        else:  # inconclusive
                            return ScenarioResult(
                                success=False,
                                conversation=conversation,
                                reasoning=reasoning,
                                met_criteria=met_criteria,
                                unmet_criteria=unmet_criteria,
                                triggered_failures=triggered_failures,
                            )
                    except json.JSONDecodeError:
                        logger.error("Failed to parse tool call arguments")

            # If no tool call or invalid tool call, use the message content as next message
            message_content = message.content
            if message_content is None:
                raise Exception(f"No response from LLM: {response.__repr__()}")

            return message_content
        else:
            raise Exception(
                f"Unexpected response format from LLM: {response.__repr__()}"
            )
scenario/utils.py
ADDED
@@ -0,0 +1,121 @@
from contextlib import contextmanager
import sys
from typing import Optional, Union
from pydantic import BaseModel

import json

import termcolor
from textwrap import indent
from openai.types.chat import ChatCompletionMessageParam
from rich.live import Live
from rich.spinner import Spinner
from rich.console import Console
from rich.text import Text
from rich.errors import LiveError



class SerializableAndPydanticEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, BaseModel):
            return o.model_dump(exclude_unset=True)
        return super().default(o)


class SerializableWithStringFallback(SerializableAndPydanticEncoder):
    def default(self, o):
        try:
            return super().default(o)
        except:
            return str(o)


def safe_list_at(list, index, default=None):
    try:
        return list[index]
    except:
        return default


def safe_attr_or_key(obj, attr_or_key, default=None):
    return getattr(obj, attr_or_key, obj.get(attr_or_key))


def title_case(string):
    return " ".join(word.capitalize() for word in string.split("_"))


def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessageParam]):
    for msg in messages:
        role = safe_attr_or_key(msg, "role")
        content = safe_attr_or_key(msg, "content")
        if role == "assistant":
            tool_calls = safe_attr_or_key(msg, "tool_calls")
            if content:
                print(scenario_name + termcolor.colored("Agent:", "blue"), content)
            if tool_calls:
                for tool_call in tool_calls:
                    function = safe_attr_or_key(tool_call, "function")
                    name = safe_attr_or_key(function, "name")
                    args = safe_attr_or_key(function, "arguments", "{}")
                    args = _take_maybe_json_first_lines(args)
                    print(
                        scenario_name + termcolor.colored(f"ToolCall({name}):", "magenta"),
                        f"\n\n{indent(args, ' ' * 4)}\n",
                    )
        elif role == "tool":
            content = _take_maybe_json_first_lines(content or msg.__repr__())
            print(
                scenario_name + termcolor.colored(f"ToolResult:", "magenta"),
                f"\n\n{indent(content, ' ' * 4)}\n",
            )
        else:
            print(
                scenario_name + termcolor.colored(f"{title_case(role)}:", "magenta"),
                msg.__repr__(),
            )


def _take_maybe_json_first_lines(string, max_lines=5):
    content = str(string)
    try:
        content = json.dumps(json.loads(content), indent=2)
    except:
        pass
    content = content.split("\n")
    if len(content) > max_lines:
        content = content[:max_lines] + ["..."]
    return "\n".join(content)


console = Console()

class TextFirstSpinner(Spinner):
    def __init__(self, name, text: str, color: str, **kwargs):
        super().__init__(name, "", style="bold white", **kwargs)  # Initialize with empty text
        self.text_before = text
        self.color = color

    def render(self, time):
        # Get the original spinner frame
        spinner_frame = super().render(time)
        # Create a composite with text first, then spinner
        return Text(f"{self.text_before} ", style=self.color) + spinner_frame


@contextmanager
def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None):
    if not enabled:
        yield
    else:
        spinner = TextFirstSpinner("dots", text, color=color)
        try:
            with Live(spinner, console=console, refresh_per_second=20):
                yield
        # It happens when we are multi-threading, it's fine, just ignore it, you probably don't want multiple spinners at once anyway
        except LiveError:
            yield

        # Cursor up one line
        sys.stdout.write("\033[F")