langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch_scenario-0.4.0.dist-info/METADATA +363 -0
- langwatch_scenario-0.4.0.dist-info/RECORD +18 -0
- scenario/__init__.py +230 -6
- scenario/agent_adapter.py +111 -0
- scenario/cache.py +132 -8
- scenario/config.py +165 -10
- scenario/error_messages.py +75 -47
- scenario/judge_agent.py +435 -0
- scenario/pytest_plugin.py +224 -16
- scenario/scenario_executor.py +704 -150
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +269 -0
- scenario/user_simulator_agent.py +249 -0
- scenario/utils.py +398 -5
- langwatch_scenario-0.2.0.dist-info/METADATA +0 -254
- langwatch_scenario-0.2.0.dist-info/RECORD +0 -15
- scenario/result.py +0 -74
- scenario/scenario.py +0 -123
- scenario/testing_agent.py +0 -262
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/top_level.txt +0 -0
scenario/agent_adapter.py
ADDED
@@ -0,0 +1,111 @@
+"""
+Agent adapter module for integrating custom agents with the Scenario framework.
+
+This module provides the abstract base class that users must implement to integrate
+their existing agents with the Scenario testing framework. The adapter pattern allows
+any agent implementation to work with the framework regardless of its underlying
+architecture or API.
+"""
+
+from abc import ABC, abstractmethod
+from typing import ClassVar
+
+from .types import AgentInput, AgentReturnTypes, AgentRole
+
+
+class AgentAdapter(ABC):
+    """
+    Abstract base class for integrating custom agents with the Scenario framework.
+
+    This adapter pattern allows you to wrap any existing agent implementation
+    (LLM calls, agent frameworks, or complex multi-step systems) to work with
+    the Scenario testing framework. The adapter receives structured input about
+    the conversation state and returns responses in a standardized format.
+
+    Attributes:
+        role: The role this agent plays in scenarios (USER, AGENT, or JUDGE)
+
+    Example:
+        ```python
+        import scenario
+        from my_agent_library import MyCustomAgent
+
+        class MyAgentAdapter(scenario.AgentAdapter):
+            def __init__(self):
+                self.agent = MyCustomAgent()
+
+            async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+                # Get the latest user message
+                user_message = input.last_new_user_message_str()
+
+                # Call your existing agent
+                response = await self.agent.process(
+                    message=user_message,
+                    history=input.messages,
+                    thread_id=input.thread_id
+                )
+
+                # Return the response (can be string, message dict, or list of messages)
+                return response
+
+        # Use in a scenario
+        result = await scenario.run(
+            name="test my agent",
+            description="User asks for help with a coding problem",
+            agents=[
+                MyAgentAdapter(),
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Provides helpful coding advice"])
+            ]
+        )
+        ```
+
+    Note:
+        - The call method must be async
+        - Return types can be: str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], or ScenarioResult
+        - For stateful agents, use input.thread_id to maintain conversation context
+        - For stateless agents, use input.messages for the full conversation history
+    """
+    role: ClassVar[AgentRole] = AgentRole.AGENT
+
+    @abstractmethod
+    async def call(self, input: AgentInput) -> AgentReturnTypes:
+        """
+        Process the input and generate a response.
+
+        This is the main method that your agent implementation must provide.
+        It receives structured information about the current conversation state
+        and must return a response in one of the supported formats.
+
+        Args:
+            input: AgentInput containing conversation history, thread context, and scenario state
+
+        Returns:
+            AgentReturnTypes: The agent's response, which can be:
+            - str: Simple text response
+            - ChatCompletionMessageParam: Single OpenAI-format message
+            - List[ChatCompletionMessageParam]: Multiple messages for complex responses
+            - ScenarioResult: Direct test result (typically only used by judge agents)
+
+        Example:
+            ```python
+            async def call(self, input: AgentInput) -> AgentReturnTypes:
+                # Simple string response
+                user_msg = input.last_new_user_message_str()
+                return f"I understand you said: {user_msg}"
+
+                # Or structured message response
+                return {
+                    "role": "assistant",
+                    "content": "Let me help you with that...",
+                    "tool_calls": [...]  # If your agent uses tools
+                }
+
+                # Or multiple messages for complex interactions
+                return [
+                    {"role": "assistant", "content": "Let me search for that information..."},
+                    {"role": "assistant", "content": "Here's what I found: ..."}
+                ]
+            ```
+        """
+        pass
scenario/cache.py
CHANGED
@@ -1,3 +1,12 @@
+"""
+Caching module for deterministic scenario testing.
+
+This module provides caching functionality to make scenario tests deterministic
+and repeatable. It caches LLM calls and other non-deterministic operations based
+on scenario configuration and function arguments, enabling consistent test results
+across multiple runs.
+"""
+
 from contextvars import ContextVar
 import inspect
 import os
@@ -8,29 +17,99 @@ from joblib import Memory
 import json

 import wrapt
+from scenario.types import AgentInput
 from scenario.utils import SerializableWithStringFallback

 if TYPE_CHECKING:
-    from scenario.
+    from scenario.scenario_executor import ScenarioExecutor


 context_scenario = ContextVar("scenario")

+
 def get_cache() -> Memory:
-    """
+    """
+    Get a cross-platform cache directory for scenario execution.
+
+    Creates and returns a joblib Memory instance configured to use a
+    cross-platform cache directory. The cache location can be customized
+    via the SCENARIO_CACHE_DIR environment variable.
+
+    Returns:
+        Memory instance configured with the appropriate cache directory
+
+    Example:
+        ```python
+        # Default cache location: ~/.scenario/cache
+        cache = get_cache()
+
+        # Custom cache location via environment variable
+        os.environ["SCENARIO_CACHE_DIR"] = "/tmp/my_scenario_cache"
+        cache = get_cache()
+        ```
+    """
     home_dir = str(Path.home())
     cache_dir = os.path.join(home_dir, ".scenario", "cache")

     return Memory(location=os.environ.get("SCENARIO_CACHE_DIR", cache_dir), verbose=0)

+
 memory = get_cache()

+
 def scenario_cache(ignore=[]):
+    """
+    Decorator for caching function calls during scenario execution.
+
+    This decorator caches function calls based on the scenario's cache_key,
+    scenario configuration, and function arguments. It enables deterministic
+    testing by ensuring the same inputs always produce the same outputs,
+    making tests repeatable and faster on subsequent runs.
+
+    Args:
+        ignore: List of argument names to exclude from the cache key computation.
+            Commonly used to ignore 'self' for instance methods or other
+            non-deterministic arguments.
+
+    Returns:
+        Decorator function that can be applied to any function or method
+
+    Example:
+        ```python
+        import scenario
+
+        class MyAgent:
+            @scenario.cache(ignore=["self"])
+            def invoke(self, message: str, context: dict) -> str:
+                # This LLM call will be cached
+                response = llm_client.complete(
+                    model="gpt-4",
+                    messages=[{"role": "user", "content": message}]
+                )
+                return response.choices[0].message.content
+
+        # Usage in tests
+        scenario.configure(cache_key="my-test-suite-v1")
+
+        # First run: makes actual LLM calls and caches results
+        result1 = await scenario.run(...)
+
+        # Second run: uses cached results, much faster
+        result2 = await scenario.run(...)
+        # result1 and result2 will be identical
+        ```
+
+    Note:
+        - Caching only occurs when a cache_key is set in the scenario configuration
+        - The cache key is computed from scenario config, function arguments, and cache_key
+        - AgentInput objects are specially handled to exclude thread_id from caching
+        - Both sync and async functions are supported
+    """
     @wrapt.decorator
     def wrapper(wrapped: Callable, instance=None, args=[], kwargs={}):
-        scenario: "
+        scenario: "ScenarioExecutor" = context_scenario.get()

-        if not scenario.cache_key:
+        if not scenario.config.cache_key:
             return wrapped(*args, **kwargs)

         sig = inspect.signature(wrapped)
@@ -43,20 +122,65 @@ def scenario_cache(ignore=[]):
             if arg in all_args:
                 del all_args[arg]

+        for key, value in all_args.items():
+            if isinstance(value, AgentInput):
+                scenario_state = value.scenario_state.model_dump(exclude={"thread_id"})
+                all_args[key] = value.model_dump(exclude={"thread_id"})
+                all_args[key]["scenario_state"] = scenario_state
+
         cache_key = json.dumps(
             {
-                "cache_key": scenario.cache_key,
-                "scenario": scenario.model_dump(exclude={"
+                "cache_key": scenario.config.cache_key,
+                "scenario": scenario.config.model_dump(exclude={"agents"}),
                 "all_args": all_args,
             },
             cls=SerializableWithStringFallback,
         )

-
+        # if it's an async function, we need to wrap it in a sync function
+        if inspect.iscoroutinefunction(wrapped):
+            return _async_cached_call(wrapped, args, kwargs, cache_key=cache_key)
+        else:
+            return _cached_call(wrapped, args, kwargs, cache_key=cache_key)

     return wrapper


 @memory.cache(ignore=["func", "args", "kwargs"])
 def _cached_call(func: Callable, args, kwargs, cache_key):
-
+    """
+    Internal function for caching synchronous function calls.
+
+    This function is used internally by the scenario_cache decorator
+    to cache synchronous function calls using joblib.Memory.
+
+    Args:
+        func: The function to call and cache
+        args: Positional arguments for the function
+        kwargs: Keyword arguments for the function
+        cache_key: Cache key for deterministic caching
+
+    Returns:
+        The result of calling func(*args, **kwargs)
+    """
+    return func(*args, **kwargs)
+
+
+@memory.cache(ignore=["func", "args", "kwargs"])
+async def _async_cached_call(func: Callable, args, kwargs, cache_key):
+    """
+    Internal function for caching asynchronous function calls.
+
+    This function is used internally by the scenario_cache decorator
+    to cache asynchronous function calls using joblib.Memory.
+
+    Args:
+        func: The async function to call and cache
+        args: Positional arguments for the function
+        kwargs: Keyword arguments for the function
+        cache_key: Cache key for deterministic caching
+
+    Returns:
+        The result of calling await func(*args, **kwargs)
+    """
+    return await func(*args, **kwargs)
scenario/config.py
CHANGED
@@ -1,28 +1,183 @@
 """
 Configuration module for Scenario.
+
+This module provides configuration classes for customizing the behavior of the
+Scenario testing framework, including LLM model settings, execution parameters,
+and debugging options.
 """

-from typing import Optional, Union
+from typing import Optional, Union, ClassVar
 from pydantic import BaseModel

-
+class ModelConfig(BaseModel):
+    """
+    Configuration for LLM model settings.
+
+    This class encapsulates all the parameters needed to configure an LLM model
+    for use with user simulator and judge agents in the Scenario framework.
+
+    Attributes:
+        model: The model identifier (e.g., "openai/gpt-4.1-mini", "anthropic/claude-3-sonnet")
+        api_key: Optional API key for the model provider
+        temperature: Sampling temperature for response generation (0.0 = deterministic, 1.0 = creative)
+        max_tokens: Maximum number of tokens to generate in responses
+
+    Example:
+        ```python
+        model_config = ModelConfig(
+            model="openai/gpt-4.1-mini",
+            api_key="your-api-key",
+            temperature=0.1,
+            max_tokens=1000
+        )
+        ```
+    """
+    model: str
+    api_key: Optional[str] = None
+    temperature: float = 0.0
+    max_tokens: Optional[int] = None
+

 class ScenarioConfig(BaseModel):
     """
-
+    Global configuration class for the Scenario testing framework.

-    This allows users to set
-
+    This class allows users to set default behavior and parameters that apply
+    to all scenario executions, including the LLM model to use for simulator
+    and judge agents, execution limits, and debugging options.
+
+    Attributes:
+        default_model: Default LLM model configuration for agents (can be string or ModelConfig)
+        max_turns: Maximum number of conversation turns before scenario times out
+        verbose: Whether to show detailed output during execution (True/False or verbosity level)
+        cache_key: Key for caching scenario results to ensure deterministic behavior
+        debug: Whether to enable debug mode with step-by-step interaction
+
+    Example:
+        ```python
+        # Configure globally for all scenarios
+        scenario.configure(
+            default_model="openai/gpt-4.1-mini",
+            max_turns=15,
+            verbose=True,
+            cache_key="my-test-suite-v1",
+            debug=False
+        )
+
+        # Or create a specific config instance
+        config = ScenarioConfig(
+            default_model=ModelConfig(
+                model="openai/gpt-4.1-mini",
+                temperature=0.2
+            ),
+            max_turns=20
+        )
+        ```
     """

-
+    default_model: Optional[Union[str, ModelConfig]] = None
     max_turns: Optional[int] = 10
     verbose: Optional[Union[bool, int]] = True
     cache_key: Optional[str] = None
     debug: Optional[bool] = False

+    default_config: ClassVar[Optional["ScenarioConfig"]] = None
+
+    @classmethod
+    def configure(
+        cls,
+        default_model: Optional[str] = None,
+        max_turns: Optional[int] = None,
+        verbose: Optional[Union[bool, int]] = None,
+        cache_key: Optional[str] = None,
+        debug: Optional[bool] = None,
+    ) -> None:
+        """
+        Set global configuration settings for all scenario executions.
+
+        This method allows you to configure default behavior that will be applied
+        to all scenarios unless explicitly overridden in individual scenario runs.
+
+        Args:
+            default_model: Default LLM model identifier for user simulator and judge agents
+            max_turns: Maximum number of conversation turns before timeout (default: 10)
+            verbose: Enable verbose output during scenario execution
+            cache_key: Cache key for deterministic scenario behavior across runs
+            debug: Enable debug mode for step-by-step execution with user intervention
+
+        Example:
+            ```python
+            import scenario
+
+            # Set up default configuration
+            scenario.configure(
+                default_model="openai/gpt-4.1-mini",
+                max_turns=15,
+                verbose=True,
+                debug=False
+            )
+
+            # All subsequent scenario runs will use these defaults
+            result = await scenario.run(
+                name="my test",
+                description="Test scenario",
+                agents=[my_agent, scenario.UserSimulatorAgent(), scenario.JudgeAgent()]
+            )
+            ```
+        """
+        existing_config = cls.default_config or ScenarioConfig()
+
+        cls.default_config = existing_config.merge(
+            ScenarioConfig(
+                default_model=default_model,
+                max_turns=max_turns,
+                verbose=verbose,
+                cache_key=cache_key,
+                debug=debug,
+            )
+        )
+
     def merge(self, other: "ScenarioConfig") -> "ScenarioConfig":
-
-
-
-
+        """
+        Merge this configuration with another configuration.
+
+        Values from the other configuration will override values in this
+        configuration where they are not None.
+
+        Args:
+            other: Another ScenarioConfig instance to merge with
+
+        Returns:
+            A new ScenarioConfig instance with merged values
+
+        Example:
+            ```python
+            base_config = ScenarioConfig(max_turns=10, verbose=True)
+            override_config = ScenarioConfig(max_turns=20)
+
+            merged = base_config.merge(override_config)
+            # Result: max_turns=20, verbose=True
+            ```
+        """
+        return ScenarioConfig(
+            **{
+                **self.items(),
+                **other.items(),
+            }
+        )
+
+    def items(self):
+        """
+        Get configuration items as a dictionary.
+
+        Returns:
+            Dictionary of configuration key-value pairs, excluding None values
+
+        Example:
+            ```python
+            config = ScenarioConfig(max_turns=15, verbose=True)
+            items = config.items()
+            # Result: {"max_turns": 15, "verbose": True}
+            ```
+        """
+        return {k: getattr(self, k) for k in self.model_dump(exclude_none=True).keys()}
scenario/error_messages.py
CHANGED
@@ -3,74 +3,102 @@ from typing import Any
 import termcolor


-
+def agent_not_configured_error_message(class_name: str):
+    return f"""

-{termcolor.colored("->", "cyan")}
+{termcolor.colored("->", "cyan")} {class_name} was initialized without a model, please set the model when defining the testing agent, for example:

-
+    {class_name}(model="openai/gpt-4.1-mini")
+    {termcolor.colored("^" * (29 + len(class_name)), "green")}

-
-{termcolor.colored("^" * 74, "green")}
+{termcolor.colored("->", "cyan")} Alternatively, you can set the default model globally, for example:

-
-
-
-    # ...
-)
-result = scenario.run()
+    scenario.configure(default_model="openai/gpt-4.1-mini")
+    {termcolor.colored("^" * 55, "green")}
+"""

-assert result.success

+def message_return_error_message(got: Any, class_name: str):
+    got_ = repr(got)
+    if len(got_) > 100:
+        got_ = got_[:100] + "..."
+
+    return f"""
+{termcolor.colored("->", "cyan")} On the {termcolor.colored("call", "green")} method of the {class_name} agent adapter, you returned:

-
+{indent(got_, ' ' * 4)}

-
+{termcolor.colored("->", "cyan")} But the adapter should return either a string, a dict in the OpenAI messages format, or a list of messages in the OpenAI messages format so the testing agent can understand what happened. For example:

-
-
-
-    # ...
-    testing_agent=TestingAgent(model="openai/gpt-4o-mini")
-    {termcolor.colored("^" * 54, "green")}
-)
-result = scenario.run()
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)

-
-
+            return response.output_text
+            {termcolor.colored("^" * 27, "green")}

+{termcolor.colored("->", "cyan")} Alternatively, you can return a list of messages in the OpenAI messages format, which is useful for capturing tool calls and other intermediate messages before the final response:

-
-
+    class MyAgentAdapter(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)
+
+            return [
+                {{"role": "assistant", "content": response.output_text}},
+                {termcolor.colored("^" * 55, "green")}
+            ]
+"""
+
+
+def message_invalid_agent_type(got: Any):
+    got_ = repr(got)
     if len(got_) > 100:
         got_ = got_[:100] + "..."

     return f"""
-{termcolor.colored("->", "cyan")}
+{termcolor.colored("->", "cyan")} The {termcolor.colored("agent", "green")} argument of Scenario needs to receive a class that inherits from {termcolor.colored("ScenarioAgentAdapter", "green")}, but you passed:

 {indent(got_, ' ' * 4)}

-{termcolor.colored("->", "cyan")}
-
-    def my_agent_under_test(message, context):
-        response = call_my_agent(message)
+{termcolor.colored("->", "cyan")} Instead, wrap your agent in a ScenarioAgentAdapter subclass. For example:

-
-
-
-
+    class MyAgentAdapter(ScenarioAgentAdapter):
+    {termcolor.colored("^" * 43, "green")}
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+            response = call_my_agent(message)

-
+            return response.output_text

-
-        response = call_my_agent(message)
+{termcolor.colored("->", "cyan")} And then you can use that in your scenario definition:

-
-
-
-
+    @pytest.mark.agent_test
+    def test_my_agent():
+        scenario = Scenario(
+            name="first scenario",
+            description=\"\"\"
+                Example scenario description to test your agent.
+            \"\"\",
+            agent=MyAgentAdapter,
+            {termcolor.colored("^" * 20, "green")}
+            criteria=[
+                "Requirement One",
+                "Requirement Two",
             ],
-
-
-
-
-
+        )
+        result = scenario.run()
+
+        assert result.success
+"""
+
+
+def agent_response_not_awaitable(class_name: str):
+    return f"""
+{termcolor.colored("->", "cyan")} The {termcolor.colored("call", "green")} method of the {class_name} agent adapter returned a non-awaitable response, you probably forgot to add the {termcolor.colored("async", "green")} keyword to the method definition, make sure your code looks like this:
+
+    class {class_name}(ScenarioAgentAdapter):
+        async def call(self, input: AgentInput) -> AgentReturnTypes:
+        {termcolor.colored("^" * 5, "green")}
+            response = call_my_agent(message)
+
+            return response.output_text
+"""