langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch_scenario-0.4.0.dist-info/METADATA +363 -0
- langwatch_scenario-0.4.0.dist-info/RECORD +18 -0
- scenario/__init__.py +230 -6
- scenario/agent_adapter.py +111 -0
- scenario/cache.py +132 -8
- scenario/config.py +165 -10
- scenario/error_messages.py +75 -47
- scenario/judge_agent.py +435 -0
- scenario/pytest_plugin.py +224 -16
- scenario/scenario_executor.py +704 -150
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +269 -0
- scenario/user_simulator_agent.py +249 -0
- scenario/utils.py +398 -5
- langwatch_scenario-0.2.0.dist-info/METADATA +0 -254
- langwatch_scenario-0.2.0.dist-info/RECORD +0 -15
- scenario/result.py +0 -74
- scenario/scenario.py +0 -123
- scenario/testing_agent.py +0 -262
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.4.0.dist-info}/top_level.txt +0 -0
scenario/scenario_executor.py
CHANGED
@@ -1,187 +1,555 @@
|
|
1
1
|
"""
|
2
|
-
|
2
|
+
Scenario execution engine for agent testing.
|
3
|
+
|
4
|
+
This module contains the core ScenarioExecutor class that orchestrates the execution
|
5
|
+
of scenario tests, managing the interaction between user simulators, agents under test,
|
6
|
+
and judge agents to determine test success or failure.
|
3
7
|
"""
|
4
8
|
|
5
|
-
import json
|
6
9
|
import sys
|
7
|
-
from typing import
|
10
|
+
from typing import (
|
11
|
+
Awaitable,
|
12
|
+
Callable,
|
13
|
+
Dict,
|
14
|
+
List,
|
15
|
+
Any,
|
16
|
+
Optional,
|
17
|
+
Set,
|
18
|
+
Tuple,
|
19
|
+
Union,
|
20
|
+
)
|
8
21
|
import time
|
9
22
|
import termcolor
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
from
|
14
|
-
|
15
|
-
|
16
|
-
|
23
|
+
import asyncio
|
24
|
+
import concurrent.futures
|
25
|
+
|
26
|
+
from scenario.config import ScenarioConfig
|
27
|
+
from scenario.utils import (
|
28
|
+
await_if_awaitable,
|
29
|
+
check_valid_return_type,
|
30
|
+
convert_agent_return_types_to_openai_messages,
|
31
|
+
print_openai_messages,
|
32
|
+
show_spinner,
|
33
|
+
)
|
34
|
+
from openai.types.chat import (
|
35
|
+
ChatCompletionMessageParam,
|
36
|
+
ChatCompletionUserMessageParam,
|
37
|
+
)
|
38
|
+
|
39
|
+
from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
|
40
|
+
from .error_messages import agent_response_not_awaitable
|
17
41
|
from .cache import context_scenario
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
42
|
+
from .agent_adapter import AgentAdapter
|
43
|
+
from .script import proceed
|
44
|
+
from pksuid import PKSUID
|
45
|
+
from .scenario_state import ScenarioState
|
22
46
|
|
23
47
|
|
24
48
|
class ScenarioExecutor:
|
25
|
-
|
26
|
-
|
49
|
+
"""
|
50
|
+
Core orchestrator for scenario-based agent testing.
|
51
|
+
|
52
|
+
The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
|
53
|
+
- Orchestrating conversations between user simulators, agents, and judges
|
54
|
+
- Managing turn-based execution flow
|
55
|
+
- Handling script-based scenario control
|
56
|
+
- Collecting and reporting test results
|
57
|
+
- Supporting debug mode for interactive testing
|
58
|
+
|
59
|
+
This class serves as both a builder (for configuration) and an executor (for running tests).
|
60
|
+
Most users will interact with it through the high-level `scenario.run()` function rather
|
61
|
+
than instantiating it directly.
|
62
|
+
|
63
|
+
Attributes:
|
64
|
+
name: Human-readable name for the scenario
|
65
|
+
description: Detailed description of what the scenario tests
|
66
|
+
agents: List of agent adapters participating in the scenario
|
67
|
+
script: Optional list of script steps to control scenario flow
|
68
|
+
config: Configuration settings for execution behavior
|
69
|
+
|
70
|
+
Example:
|
71
|
+
```python
|
72
|
+
# Direct instantiation (less common)
|
73
|
+
executor = ScenarioExecutor(
|
74
|
+
name="weather query test",
|
75
|
+
description="User asks about weather, agent should provide helpful response",
|
76
|
+
agents=[
|
77
|
+
weather_agent,
|
78
|
+
scenario.UserSimulatorAgent(),
|
79
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
80
|
+
],
|
81
|
+
max_turns=10,
|
82
|
+
verbose=True
|
83
|
+
)
|
84
|
+
result = await executor._run()
|
85
|
+
|
86
|
+
# Preferred high-level API
|
87
|
+
result = await scenario.run(
|
88
|
+
name="weather query test",
|
89
|
+
description="User asks about weather, agent should provide helpful response",
|
90
|
+
agents=[
|
91
|
+
weather_agent,
|
92
|
+
scenario.UserSimulatorAgent(),
|
93
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
94
|
+
]
|
95
|
+
)
|
96
|
+
```
|
97
|
+
|
98
|
+
Note:
|
99
|
+
- Scenarios run in isolated thread pools to support parallel execution
|
100
|
+
- All agent interactions are cached when cache_key is configured
|
101
|
+
- Debug mode allows step-by-step execution with user intervention
|
102
|
+
- Results include detailed timing information and conversation history
|
103
|
+
"""
|
104
|
+
name: str
|
105
|
+
description: str
|
106
|
+
agents: List[AgentAdapter]
|
107
|
+
script: List[ScriptStep]
|
108
|
+
|
109
|
+
config: ScenarioConfig
|
110
|
+
|
111
|
+
_state: ScenarioState
|
112
|
+
_total_start_time: float
|
113
|
+
_pending_messages: Dict[int, List[ChatCompletionMessageParam]]
|
114
|
+
|
115
|
+
_pending_roles_on_turn: List[AgentRole] = []
|
116
|
+
_pending_agents_on_turn: Set[AgentAdapter] = set()
|
117
|
+
_agent_times: Dict[int, float] = {}
|
118
|
+
|
119
|
+
def __init__(
    self,
    name: str,
    description: str,
    agents: Optional[List[AgentAdapter]] = None,
    script: Optional[List[ScriptStep]] = None,
    # Config
    max_turns: Optional[int] = None,
    verbose: Optional[Union[bool, int]] = None,
    cache_key: Optional[str] = None,
    debug: Optional[bool] = None,
):
    """
    Initialize a scenario executor.

    Args:
        name: Human-readable name for the scenario.
        description: Description of what the scenario tests; it is stored on
            the executor and handed to the scenario state on every reset.
        agents: Agent adapters participating in the scenario. When omitted,
            a new empty list is created for this instance.
        script: Optional list of script steps controlling the scenario flow.
            When omitted or empty, a single ``proceed()`` step is used.
        max_turns: Per-scenario override for the maximum number of turns.
        verbose: Per-scenario override for verbosity (bool or integer level).
        cache_key: Per-scenario override for the cache key.
        debug: Per-scenario override for debug mode.

    The four config values are wrapped in a ``ScenarioConfig`` and merged on
    top of ``ScenarioConfig.default_config`` (or a blank ``ScenarioConfig``
    when no default is set).

    Example:
        ```python
        executor = ScenarioExecutor(
            name="customer service test",
            description="Customer has a billing question and needs help",
            agents=[
                customer_service_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent is polite and professional"]),
            ],
            max_turns=15,
            verbose=True,
            debug=False,
        )
        ```
    """
    self.name = name
    self.description = description
    # Never store a shared default list on the instance: a mutable default
    # argument would be the same object for every executor created without
    # an explicit `agents` value.
    self.agents = agents if agents is not None else []
    self.script = script or [proceed()]

    config = ScenarioConfig(
        max_turns=max_turns,
        verbose=verbose,
        cache_key=cache_key,
        debug=debug,
    )
    self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)

    self.reset()
|
34
185
|
|
186
|
+
@classmethod
async def run(
    cls,
    name: str,
    description: str,
    agents: Optional[List[AgentAdapter]] = None,
    max_turns: Optional[int] = None,
    verbose: Optional[Union[bool, int]] = None,
    cache_key: Optional[str] = None,
    debug: Optional[bool] = None,
    script: Optional[List[ScriptStep]] = None,
) -> ScenarioResult:
    """
    High-level interface for running a scenario test.

    Builds an executor from the arguments and runs its ``_run()`` coroutine
    on a dedicated event loop inside a worker thread, so that blocking user
    code in agent callbacks does not stall the caller's event loop.

    Args:
        name: Human-readable name for the scenario
        description: Detailed description of what the scenario tests
        agents: List of agent adapters (agent under test, user simulator, judge).
            When omitted, a new empty list is used.
        max_turns: Maximum conversation turns before the scenario is failed
        verbose: Show detailed output during execution
        cache_key: Cache key for deterministic behavior
        debug: Enable debug mode for step-by-step execution
        script: Optional script steps to control scenario flow

    Returns:
        The ScenarioResult produced by the executor's ``_run()``.

    Example:
        ```python
        import scenario

        result = await scenario.run(
            name="help request",
            description="User asks for help with a technical problem",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent provides helpful response"]),
            ],
        )
        print(f"Test {'PASSED' if result.success else 'FAILED'}")
        ```
    """
    scenario = cls(
        name=name,
        description=description,
        # Always hand the constructor a real list so this method does not
        # rely on how the constructor treats a missing value.
        agents=agents if agents is not None else [],
        max_turns=max_turns,
        verbose=verbose,
        cache_key=cache_key,
        debug=debug,
        script=script,
    )

    # We'll use a thread pool to run the execution logic, we
    # require a separate thread because even though asyncio is
    # being used throughout, any user code on the callback can
    # be blocking, preventing them from running scenarios in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:

        def run_in_thread():
            # The worker thread gets its own event loop, closed when done.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            try:
                return loop.run_until_complete(scenario._run())
            finally:
                loop.close()

        # We are inside a coroutine, so a loop is guaranteed to be running;
        # get_running_loop() is the supported way to obtain it.
        loop = asyncio.get_running_loop()
        # Turn the thread's execution into a future the current loop can
        # await without blocking.
        result = await loop.run_in_executor(executor, run_in_thread)
        return result
|
295
|
+
|
296
|
+
def reset(self):
    """
    Put the executor back into its initial state.

    Creates a new ScenarioState (empty message list, new thread id, turn
    counter at zero), clears the per-agent pending messages and timing
    accumulators, restarts the total-time clock, prepares the first turn and
    registers this executor in the ``context_scenario`` context variable.

    Called from ``__init__`` and again at the start of ``_run()``.
    """
    fresh_state = ScenarioState(
        description=self.description,
        messages=[],
        thread_id=str(PKSUID("thread")),
        current_turn=0,
        config=self.config,
        _executor=self,
    )
    # Pydantic doesn't actually set the _executor field from the constructor,
    # as it's private, so we need to do it manually
    fresh_state._executor = self
    self._state = fresh_state

    self._pending_messages = {}
    self._total_start_time = time.time()
    self._agent_times = {}

    # _new_turn() bumps the turn counter, so it is put back to zero afterwards.
    self._new_turn()
    self._state.current_turn = 0

    context_scenario.set(self)
|
76
335
|
|
77
|
-
|
78
|
-
|
336
|
+
def add_message(
|
337
|
+
self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
|
338
|
+
):
|
339
|
+
"""
|
340
|
+
Add a message to the conversation and broadcast to other agents.
|
79
341
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
if isinstance(agent_response, Awaitable):
|
84
|
-
agent_response = await agent_response
|
342
|
+
This method adds a message to the conversation history and makes it available
|
343
|
+
to other agents in their next call. It's used internally by the executor
|
344
|
+
and can be called from script steps to inject custom messages.
|
85
345
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
"role"
|
96
|
-
|
97
|
-
)
|
346
|
+
Args:
|
347
|
+
message: OpenAI-compatible message to add to the conversation
|
348
|
+
from_agent_idx: Index of the agent that generated this message.
|
349
|
+
Used to avoid broadcasting the message back to its creator.
|
350
|
+
|
351
|
+
Example:
|
352
|
+
```python
|
353
|
+
def inject_system_message(state: ScenarioState) -> None:
|
354
|
+
state._executor.add_message({
|
355
|
+
"role": "system",
|
356
|
+
"content": "The user is now in a hurry"
|
357
|
+
})
|
358
|
+
|
359
|
+
# Use in script
|
360
|
+
result = await scenario.run(
|
361
|
+
name="system message test",
|
362
|
+
agents=[agent, user_sim, judge],
|
363
|
+
script=[
|
364
|
+
scenario.user("Hello"),
|
365
|
+
scenario.agent(),
|
366
|
+
inject_system_message,
|
367
|
+
scenario.user(), # Will see the system message
|
368
|
+
scenario.succeed()
|
369
|
+
]
|
98
370
|
)
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
messages: list[ChatCompletionMessageParam] = []
|
103
|
-
if has_valid_messages and len(agent_response["messages"]) > 0:
|
104
|
-
messages = agent_response["messages"]
|
105
|
-
|
106
|
-
# Drop the first messages both if they are system or user messages
|
107
|
-
if safe_attr_or_key(safe_list_at(messages, 0), "role") == "system":
|
108
|
-
messages = messages[1:]
|
109
|
-
if safe_attr_or_key(safe_list_at(messages, 0), "role") == "user":
|
110
|
-
messages = messages[1:]
|
111
|
-
|
112
|
-
if has_valid_message and self.scenario.verbose:
|
113
|
-
print(self._scenario_name() + termcolor.colored("Agent:", "blue"), agent_response["message"])
|
114
|
-
|
115
|
-
if messages and self.scenario.verbose:
|
116
|
-
print_openai_messages(self._scenario_name(), messages)
|
117
|
-
|
118
|
-
if (
|
119
|
-
self.scenario.verbose
|
120
|
-
and "extra" in agent_response
|
121
|
-
and len(agent_response["extra"].keys()) > 0
|
122
|
-
):
|
123
|
-
print(
|
124
|
-
termcolor.colored(
|
125
|
-
"Extra:" + json.dumps(agent_response["extra"]),
|
126
|
-
"magenta",
|
127
|
-
)
|
128
|
-
)
|
129
|
-
response_time = time.time() - start_time
|
130
|
-
agent_time += response_time
|
131
|
-
|
132
|
-
if messages:
|
133
|
-
self.conversation.extend(agent_response["messages"])
|
134
|
-
if "message" in agent_response:
|
135
|
-
self.conversation.append(
|
136
|
-
{"role": "assistant", "content": agent_response["message"]}
|
137
|
-
)
|
138
|
-
if "extra" in agent_response:
|
139
|
-
self.conversation.append(
|
140
|
-
{
|
141
|
-
"role": "assistant",
|
142
|
-
"content": json.dumps(agent_response["extra"]),
|
143
|
-
}
|
144
|
-
)
|
371
|
+
```
|
372
|
+
"""
|
373
|
+
self._state.messages.append(message)
|
145
374
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
375
|
+
# Broadcast the message to other agents
|
376
|
+
for idx, _ in enumerate(self.agents):
|
377
|
+
if idx == from_agent_idx:
|
378
|
+
continue
|
379
|
+
if idx not in self._pending_messages:
|
380
|
+
self._pending_messages[idx] = []
|
381
|
+
self._pending_messages[idx].append(message)
|
152
382
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
383
|
+
def add_messages(
|
384
|
+
self,
|
385
|
+
messages: List[ChatCompletionMessageParam],
|
386
|
+
from_agent_idx: Optional[int] = None,
|
387
|
+
):
|
388
|
+
"""
|
389
|
+
Add multiple messages to the conversation.
|
160
390
|
|
161
|
-
|
162
|
-
|
391
|
+
Convenience method for adding multiple messages at once. Each message
|
392
|
+
is added individually using add_message().
|
163
393
|
|
164
|
-
|
165
|
-
|
394
|
+
Args:
|
395
|
+
messages: List of OpenAI-compatible messages to add
|
396
|
+
from_agent_idx: Index of the agent that generated these messages
|
397
|
+
|
398
|
+
Example:
|
399
|
+
```python
|
400
|
+
# Agent returns multiple messages for a complex interaction
|
401
|
+
messages = [
|
402
|
+
{"role": "assistant", "content": "Let me search for that..."},
|
403
|
+
{"role": "assistant", "content": "Here's what I found: ..."}
|
404
|
+
]
|
405
|
+
executor.add_messages(messages, from_agent_idx=0)
|
406
|
+
```
|
407
|
+
"""
|
408
|
+
for message in messages:
|
409
|
+
self.add_message(message, from_agent_idx)
|
410
|
+
|
411
|
+
def _new_turn(self):
    """
    Begin a new turn.

    Marks every agent as pending again, restores the role order for the turn
    (user, then agent, then judge) and increments the state's turn counter.
    """
    self._pending_agents_on_turn = set(self.agents)
    self._pending_roles_on_turn = [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE]
    self._state.current_turn += 1
|
419
|
+
|
420
|
+
async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
    """
    Execute a single step in the scenario.

    Delegates to ``_step()`` with its default arguments and insists on a
    concrete outcome.

    Returns:
        Whatever ``_step()`` produced: either a list of messages or a
        ScenarioResult.

    Raises:
        ValueError: If ``_step()`` returned ``None``.
    """
    outcome = await self._step()
    if outcome is not None:
        return outcome
    raise ValueError("No result from step")
|
443
|
+
|
444
|
+
async def _step(
    self,
    go_to_next_turn=True,
    on_turn: Optional[
        Union[
            Callable[["ScenarioState"], None],
            Callable[["ScenarioState"], Awaitable[None]],
        ]
    ] = None,
) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
    """
    Call the next pending agent of the current turn.

    Roles are tried in the order stored in ``_pending_roles_on_turn``. A role
    with no pending agent is dropped and the next one is tried. When no roles
    are left, a new turn is started (if allowed).

    Args:
        go_to_next_turn: Whether a new turn may be started once the current
            turn has no roles left. When False, ``None`` is returned instead.
        on_turn: Optional sync or async callback invoked with the scenario
            state right after a new turn is started.

    Returns:
        The result of ``_call_agent()`` for the selected agent, the failure
        result from ``_reached_max_turns()`` when the turn limit
        (``config.max_turns`` or 10) is reached on a new turn, or ``None``
        when the turn is exhausted and ``go_to_next_turn`` is False.
    """
    # Iterative rather than self-recursive: with no matching agents the
    # search spans every remaining turn, and recursion depth would then grow
    # with max_turns.
    while True:
        if len(self._pending_roles_on_turn) == 0:
            if not go_to_next_turn:
                return None

            self._new_turn()

            if on_turn:
                await await_if_awaitable(on_turn(self._state))

            if self._state.current_turn >= (self.config.max_turns or 10):
                return self._reached_max_turns()

        current_role = self._pending_roles_on_turn[0]
        idx, next_agent = self._next_agent_for_role(current_role)
        if next_agent:
            self._pending_agents_on_turn.remove(next_agent)
            return await self._call_agent(idx, role=current_role)

        # Nobody left to speak for this role on this turn; try the next role.
        self._pending_roles_on_turn.pop(0)
|
474
|
+
|
475
|
+
def _next_agent_for_role(
|
476
|
+
self, role: AgentRole
|
477
|
+
) -> Tuple[int, Optional[AgentAdapter]]:
|
478
|
+
for idx, agent in enumerate(self.agents):
|
479
|
+
if role == agent.role and agent in self._pending_agents_on_turn:
|
480
|
+
return idx, agent
|
481
|
+
return -1, None
|
482
|
+
|
483
|
+
def _reached_max_turns(self, error_message: Optional[str] = None) -> ScenarioResult:
    """
    Build the failing result used when the scenario ends without a verdict.

    Args:
        error_message: Reasoning text to report. When empty or omitted, a
            default message mentioning the turn limit is used.

    Returns:
        A ScenarioResult with ``success=False``, the messages so far, the
        elapsed time since the last reset, and the accumulated call time of
        the agents whose role is ``AgentRole.AGENT``.
    """
    # Only agents under test count towards agent_time; agents that were never
    # called have no timing entry and are skipped.
    agent_time = sum(
        self._agent_times[idx]
        for idx, agent in enumerate(self.agents)
        if agent.role == AgentRole.AGENT and idx in self._agent_times
    )
    reasoning = (
        error_message
        or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion"
    )

    return ScenarioResult(
        success=False,
        messages=self._state.messages,
        reasoning=reasoning,
        total_time=time.time() - self._total_start_time,
        agent_time=agent_time,
    )
|
174
505
|
|
175
|
-
def
|
176
|
-
|
177
|
-
scenario
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
506
|
+
async def _run(self) -> ScenarioResult:
    """
    Run the scenario script from a freshly reset state.

    Each script step is called with the scenario state; if the call returns
    an awaitable it is awaited. The first step that yields a ScenarioResult
    ends the run.

    Returns:
        The ScenarioResult produced by a script step, or a failing result
        explaining that the script ended without a conclusion.
    """

    if self.config.verbose:
        print("")  # new line

    self.reset()

    for script_step in self.script:
        # Steps may be plain functions or coroutine functions.
        step_outcome = script_step(self._state)
        if isinstance(step_outcome, Awaitable):
            step_outcome = await step_outcome

        if isinstance(step_outcome, ScenarioResult):
            return step_outcome

    return self._reached_max_turns(
        """Reached end of script without conclusion, add one of the following to the end of the script:

- `scenario.proceed()` to let the simulation continue to play out
- `scenario.judge()` to force criteria judgement
- `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
"""
    )
|
540
|
+
|
541
|
+
async def _call_agent(
|
542
|
+
self, idx: int, role: AgentRole, request_judgment: bool = False
|
543
|
+
) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
|
544
|
+
agent = self.agents[idx]
|
545
|
+
|
546
|
+
if role == AgentRole.USER and self.config.debug:
|
547
|
+
print(
|
548
|
+
f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
|
549
|
+
)
|
550
|
+
input_message = input(
|
551
|
+
self._scenario_name() + termcolor.colored("User: ", "green")
|
552
|
+
)
|
185
553
|
|
186
554
|
# Clear the input prompt lines completely
|
187
555
|
for _ in range(3):
|
@@ -190,15 +558,201 @@ class ScenarioExecutor:
|
|
190
558
|
sys.stdout.flush() # Make sure the clearing is visible
|
191
559
|
|
192
560
|
if input_message:
|
193
|
-
return
|
561
|
+
return [
|
562
|
+
ChatCompletionUserMessageParam(role="user", content=input_message)
|
563
|
+
]
|
564
|
+
|
565
|
+
with show_spinner(
|
566
|
+
text=(
|
567
|
+
"Judging..."
|
568
|
+
if role == AgentRole.JUDGE
|
569
|
+
else f"{role.value if isinstance(role, AgentRole) else role}:"
|
570
|
+
),
|
571
|
+
color=(
|
572
|
+
"blue"
|
573
|
+
if role == AgentRole.AGENT
|
574
|
+
else "green" if role == AgentRole.USER else "yellow"
|
575
|
+
),
|
576
|
+
enabled=self.config.verbose,
|
577
|
+
):
|
578
|
+
start_time = time.time()
|
194
579
|
|
195
|
-
|
196
|
-
|
197
|
-
|
580
|
+
agent_response = agent.call(
|
581
|
+
AgentInput(
|
582
|
+
# TODO: test thread_id
|
583
|
+
thread_id=self._state.thread_id,
|
584
|
+
messages=self._state.messages,
|
585
|
+
new_messages=self._pending_messages.get(idx, []),
|
586
|
+
judgment_request=request_judgment,
|
587
|
+
scenario_state=self._state,
|
588
|
+
)
|
198
589
|
)
|
590
|
+
if not isinstance(agent_response, Awaitable):
|
591
|
+
raise Exception(
|
592
|
+
agent_response_not_awaitable(agent.__class__.__name__),
|
593
|
+
)
|
594
|
+
|
595
|
+
agent_response = await agent_response
|
596
|
+
|
597
|
+
if idx not in self._agent_times:
|
598
|
+
self._agent_times[idx] = 0
|
599
|
+
self._agent_times[idx] += time.time() - start_time
|
600
|
+
|
601
|
+
self._pending_messages[idx] = []
|
602
|
+
check_valid_return_type(agent_response, agent.__class__.__name__)
|
603
|
+
|
604
|
+
messages = []
|
605
|
+
if isinstance(agent_response, ScenarioResult):
|
606
|
+
# TODO: should be an event
|
607
|
+
return agent_response
|
608
|
+
else:
|
609
|
+
messages = convert_agent_return_types_to_openai_messages(
|
610
|
+
agent_response,
|
611
|
+
role="user" if role == AgentRole.USER else "assistant",
|
612
|
+
)
|
613
|
+
|
614
|
+
self.add_messages(messages, from_agent_idx=idx)
|
615
|
+
|
616
|
+
if messages and self.config.verbose:
|
617
|
+
print_openai_messages(
|
618
|
+
self._scenario_name(),
|
619
|
+
[m for m in messages if m["role"] != "system"],
|
620
|
+
)
|
621
|
+
|
622
|
+
return messages
|
199
623
|
|
200
624
|
def _scenario_name(self):
|
201
|
-
if self.
|
202
|
-
return termcolor.colored(f"[Scenario: {self.
|
625
|
+
if self.config.verbose == 2:
|
626
|
+
return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")
|
203
627
|
else:
|
204
628
|
return ""
|
629
|
+
|
630
|
+
# Scripting utils
|
631
|
+
|
632
|
+
async def message(self, message: ChatCompletionMessageParam) -> None:
    """
    Script helper: insert a fixed message into the conversation.

    Messages with role ``"user"`` or ``"assistant"`` are routed through
    ``_script_call_agent()`` for the user or agent role respectively; any
    other role is added to the conversation directly.

    Args:
        message: OpenAI-compatible message to insert.
    """
    sender = message["role"]
    if sender == "user":
        await self._script_call_agent(AgentRole.USER, message)
        return
    if sender == "assistant":
        await self._script_call_agent(AgentRole.AGENT, message)
        return
    self.add_message(message)
|
639
|
+
|
640
|
+
async def user(
    self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
) -> None:
    """
    Script helper: take the user's step.

    Args:
        content: Optional fixed text or message to use; when omitted the
            user-role agent is called to produce one.
    """
    await self._script_call_agent(role=AgentRole.USER, content=content)
|
644
|
+
|
645
|
+
async def agent(
    self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
) -> None:
    """
    Script helper: take the step of the agent under test.

    Args:
        content: Optional fixed text or message to use; when omitted the
            agent-role agent is called to produce one.
    """
    await self._script_call_agent(role=AgentRole.AGENT, content=content)
|
649
|
+
|
650
|
+
async def judge(
    self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
) -> Optional[ScenarioResult]:
    """
    Script helper: call the judge with an explicit judgment request.

    Args:
        content: Optional fixed text or message to insert instead of calling
            the judge agent.

    Returns:
        The ScenarioResult returned by the judge call, or ``None`` when no
        result was produced.
    """
    verdict = await self._script_call_agent(
        AgentRole.JUDGE, content, request_judgment=True
    )
    return verdict
|
656
|
+
|
657
|
+
async def proceed(
    self,
    turns: Optional[int] = None,
    on_turn: Optional[
        Union[
            Callable[["ScenarioState"], None],
            Callable[["ScenarioState"], Awaitable[None]],
        ]
    ] = None,
    on_step: Optional[
        Union[
            Callable[["ScenarioState"], None],
            Callable[["ScenarioState"], Awaitable[None]],
        ]
    ] = None,
) -> Optional[ScenarioResult]:
    """
    Script helper: let the scenario play out step by step.

    Repeatedly calls ``_step()`` until it yields a ScenarioResult or reports
    that nothing is left to do.

    Args:
        turns: Optional limit on how many turns to play. With a limit, a new
            turn is only started while ``current_turn + 1`` is below the turn
            recorded after the first step plus ``turns``.
        on_turn: Optional sync or async callback, forwarded to ``_step()``.
        on_step: Optional sync or async callback invoked with the scenario
            state after every step that produced something.

    Returns:
        The ScenarioResult that ended the scenario, or ``None`` when the
        allotted turns ran out without a result.
    """
    initial_turn: Optional[int] = None
    while True:
        # The first step may always open a new turn; after that, a turn limit
        # (if any) decides.
        may_advance = (
            turns is None
            or initial_turn is None
            or (self._state.current_turn + 1 < initial_turn + turns)
        )
        step_outcome = await self._step(
            on_turn=on_turn,
            go_to_next_turn=may_advance,
        )

        if initial_turn is None:
            initial_turn = self._state.current_turn

        if step_outcome is None:
            return None

        if on_step:
            await await_if_awaitable(on_step(self._state))

        if isinstance(step_outcome, ScenarioResult):
            return step_outcome
|
695
|
+
|
696
|
+
async def succeed(self, reasoning: Optional[str] = None) -> ScenarioResult:
    """
    Script helper: end the scenario with a passing result.

    Args:
        reasoning: Optional explanation; a default text is used when empty.

    Returns:
        A ScenarioResult with ``success=True`` and the messages so far.
    """
    explanation = reasoning or "Scenario marked as successful with scenario.succeed()"
    return ScenarioResult(
        success=True,
        messages=self._state.messages,
        reasoning=explanation,
    )
|
703
|
+
|
704
|
+
async def fail(self, reasoning: Optional[str] = None) -> ScenarioResult:
    """
    Script helper: end the scenario with a failing result.

    Args:
        reasoning: Optional explanation; a default text is used when empty.

    Returns:
        A ScenarioResult with ``success=False`` and the messages so far.
    """
    explanation = reasoning or "Scenario marked as failed with scenario.fail()"
    return ScenarioResult(
        success=False,
        messages=self._state.messages,
        reasoning=explanation,
    )
|
710
|
+
|
711
|
+
async def _script_call_agent(
    self,
    role: AgentRole,
    content: Optional[Union[str, ChatCompletionMessageParam]] = None,
    request_judgment: bool = False,
) -> Optional[ScenarioResult]:
    """
    Perform one scripted step for the given role.

    Picks the next pending agent with that role, starting a new turn if none
    is pending in the current one. With ``content``, the content is added to
    the conversation as that role's message and the agent is not called.
    Without it, the agent is called.

    Args:
        role: Role whose step is being taken.
        content: Optional fixed message. A string is wrapped as a user
            message for the user role and as an assistant message otherwise;
            a message object is used as given.
        request_judgment: Forwarded to ``_call_agent()``.

    Returns:
        The ScenarioResult returned by the agent call, if any; otherwise
        ``None``.

    Raises:
        ValueError: If no agent with the requested role exists.
    """
    # Function-scope import: only needed for scripted non-user text.
    from openai.types.chat import ChatCompletionAssistantMessageParam

    idx, next_agent = self._next_agent_for_role(role)
    if not next_agent:
        # Everyone with this role already acted this turn; try a fresh turn.
        self._new_turn()
        idx, next_agent = self._next_agent_for_role(role)

        if not next_agent:
            role_class = (
                "a scenario.UserSimulatorAgent()"
                if role == AgentRole.USER
                else (
                    "a scenario.JudgeAgent()"
                    if role == AgentRole.JUDGE
                    else "your agent"
                )
            )
            if content:
                raise ValueError(
                    f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
                )
            raise ValueError(
                f"Cannot generate a message for role `{role.value}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
            )

    self._pending_agents_on_turn.remove(next_agent)
    # The role may already have been consumed this turn (e.g. a second agent
    # sharing the role); list.remove() would raise in that case.
    if role in self._pending_roles_on_turn:
        self._pending_roles_on_turn.remove(role)

    if content:
        if isinstance(content, str):
            # Attribute scripted text to the role it was scripted for, not
            # unconditionally to the user.
            if role == AgentRole.USER:
                message = ChatCompletionUserMessageParam(role="user", content=content)
            else:
                message = ChatCompletionAssistantMessageParam(
                    role="assistant", content=content
                )
        else:
            message = content

        self.add_message(message)
        if self.config.verbose:
            print_openai_messages(self._scenario_name(), [message])
        return None

    result = await self._call_agent(
        idx, role=role, request_judgment=request_judgment
    )
    if isinstance(result, ScenarioResult):
        return result
    return None
|