langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch_scenario-0.6.0.dist-info/METADATA +385 -0
- langwatch_scenario-0.6.0.dist-info/RECORD +27 -0
- scenario/__init__.py +128 -17
- scenario/{error_messages.py → _error_messages.py} +8 -38
- scenario/_utils/__init__.py +32 -0
- scenario/_utils/ids.py +58 -0
- scenario/_utils/message_conversion.py +103 -0
- scenario/_utils/utils.py +425 -0
- scenario/agent_adapter.py +115 -0
- scenario/cache.py +134 -9
- scenario/config.py +156 -10
- scenario/events/__init__.py +66 -0
- scenario/events/event_bus.py +175 -0
- scenario/events/event_reporter.py +83 -0
- scenario/events/events.py +169 -0
- scenario/events/messages.py +84 -0
- scenario/events/utils.py +86 -0
- scenario/judge_agent.py +414 -0
- scenario/pytest_plugin.py +177 -14
- scenario/scenario_executor.py +630 -154
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +197 -20
- scenario/user_simulator_agent.py +242 -0
- langwatch_scenario-0.3.0.dist-info/METADATA +0 -302
- langwatch_scenario-0.3.0.dist-info/RECORD +0 -16
- scenario/scenario.py +0 -238
- scenario/scenario_agent_adapter.py +0 -16
- scenario/testing_agent.py +0 -279
- scenario/utils.py +0 -264
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.6.0.dist-info}/top_level.txt +0 -0
scenario/scenario_executor.py
CHANGED
@@ -1,134 +1,456 @@
|
|
1
1
|
"""
|
2
|
-
|
2
|
+
Scenario execution engine for agent testing.
|
3
|
+
|
4
|
+
This module contains the core ScenarioExecutor class that orchestrates the execution
|
5
|
+
of scenario tests, managing the interaction between user simulators, agents under test,
|
6
|
+
and judge agents to determine test success or failure.
|
3
7
|
"""
|
4
8
|
|
5
9
|
import sys
|
6
10
|
from typing import (
|
7
|
-
TYPE_CHECKING,
|
8
11
|
Awaitable,
|
9
12
|
Callable,
|
10
13
|
Dict,
|
11
14
|
List,
|
12
|
-
Any,
|
13
15
|
Optional,
|
14
16
|
Set,
|
15
17
|
Tuple,
|
16
18
|
Union,
|
19
|
+
TypedDict,
|
17
20
|
)
|
18
21
|
import time
|
19
22
|
import termcolor
|
23
|
+
import asyncio
|
24
|
+
import concurrent.futures
|
20
25
|
|
21
|
-
from scenario.
|
22
|
-
|
26
|
+
from scenario.config import ScenarioConfig
|
27
|
+
from scenario._utils import (
|
23
28
|
check_valid_return_type,
|
24
29
|
convert_agent_return_types_to_openai_messages,
|
25
30
|
print_openai_messages,
|
26
31
|
show_spinner,
|
32
|
+
await_if_awaitable,
|
33
|
+
get_or_create_batch_run_id,
|
34
|
+
generate_scenario_run_id,
|
27
35
|
)
|
28
36
|
from openai.types.chat import (
|
29
37
|
ChatCompletionMessageParam,
|
30
38
|
ChatCompletionUserMessageParam,
|
31
|
-
|
39
|
+
ChatCompletionAssistantMessageParam,
|
32
40
|
)
|
33
41
|
|
34
|
-
from .types import AgentInput,
|
35
|
-
from .
|
42
|
+
from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
|
43
|
+
from ._error_messages import agent_response_not_awaitable
|
36
44
|
from .cache import context_scenario
|
37
|
-
from .
|
45
|
+
from .agent_adapter import AgentAdapter
|
46
|
+
from .script import proceed
|
38
47
|
from pksuid import PKSUID
|
39
|
-
|
40
|
-
|
41
|
-
|
48
|
+
from .scenario_state import ScenarioState
|
49
|
+
from .events import (
|
50
|
+
ScenarioEventBus,
|
51
|
+
ScenarioRunStartedEvent,
|
52
|
+
ScenarioMessageSnapshotEvent,
|
53
|
+
ScenarioRunFinishedEvent,
|
54
|
+
ScenarioRunStartedEventMetadata,
|
55
|
+
ScenarioRunFinishedEventResults,
|
56
|
+
ScenarioRunFinishedEventVerdict,
|
57
|
+
ScenarioRunFinishedEventStatus,
|
58
|
+
convert_messages_to_ag_ui_messages,
|
59
|
+
)
|
42
60
|
|
43
61
|
|
44
62
|
class ScenarioExecutor:
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
63
|
+
"""
|
64
|
+
Core orchestrator for scenario-based agent testing.
|
65
|
+
|
66
|
+
The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
|
67
|
+
- Orchestrating conversations between user simulators, agents, and judges
|
68
|
+
- Managing turn-based execution flow
|
69
|
+
- Handling script-based scenario control
|
70
|
+
- Collecting and reporting test results
|
71
|
+
- Supporting debug mode for interactive testing
|
72
|
+
|
73
|
+
This class serves as both a builder (for configuration) and an executor (for running tests).
|
74
|
+
Most users will interact with it through the high-level `scenario.run()` function rather
|
75
|
+
than instantiating it directly.
|
76
|
+
|
77
|
+
Attributes:
|
78
|
+
name: Human-readable name for the scenario
|
79
|
+
description: Detailed description of what the scenario tests
|
80
|
+
agents: List of agent adapters participating in the scenario
|
81
|
+
script: Optional list of script steps to control scenario flow
|
82
|
+
config: Configuration settings for execution behavior
|
83
|
+
|
84
|
+
Example:
|
85
|
+
```
|
86
|
+
# Direct instantiation (less common)
|
87
|
+
executor = ScenarioExecutor(
|
88
|
+
name="weather query test",
|
89
|
+
description="User asks about weather, agent should provide helpful response",
|
90
|
+
agents=[
|
91
|
+
weather_agent,
|
92
|
+
scenario.UserSimulatorAgent(),
|
93
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
94
|
+
],
|
95
|
+
max_turns=10,
|
96
|
+
verbose=True
|
97
|
+
)
|
98
|
+
result = await executor._run()
|
99
|
+
|
100
|
+
# Preferred high-level API
|
101
|
+
result = await scenario.run(
|
102
|
+
name="weather query test",
|
103
|
+
description="User asks about weather, agent should provide helpful response",
|
104
|
+
agents=[
|
105
|
+
weather_agent,
|
106
|
+
scenario.UserSimulatorAgent(),
|
107
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
108
|
+
]
|
109
|
+
)
|
110
|
+
```
|
111
|
+
|
112
|
+
Note:
|
113
|
+
- Scenarios run in isolated thread pools to support parallel execution
|
114
|
+
- All agent interactions are cached when cache_key is configured
|
115
|
+
- Debug mode allows step-by-step execution with user intervention
|
116
|
+
- Results include detailed timing information and conversation history
|
117
|
+
"""
|
118
|
+
|
119
|
+
name: str
|
120
|
+
description: str
|
121
|
+
agents: List[AgentAdapter]
|
122
|
+
script: List[ScriptStep]
|
123
|
+
|
124
|
+
config: ScenarioConfig
|
125
|
+
|
126
|
+
_state: ScenarioState
|
53
127
|
_total_start_time: float
|
54
128
|
_pending_messages: Dict[int, List[ChatCompletionMessageParam]]
|
55
129
|
|
56
|
-
_pending_roles_on_turn: List[
|
57
|
-
_pending_agents_on_turn: Set[
|
130
|
+
_pending_roles_on_turn: List[AgentRole] = []
|
131
|
+
_pending_agents_on_turn: Set[AgentAdapter] = set()
|
58
132
|
_agent_times: Dict[int, float] = {}
|
59
133
|
|
134
|
+
event_bus: ScenarioEventBus
|
135
|
+
|
136
|
+
batch_run_id: str
|
137
|
+
|
60
138
|
def __init__(
|
61
139
|
self,
|
62
|
-
|
63
|
-
|
140
|
+
name: str,
|
141
|
+
description: str,
|
142
|
+
agents: List[AgentAdapter] = [],
|
64
143
|
script: Optional[List[ScriptStep]] = None,
|
144
|
+
# Config
|
145
|
+
max_turns: Optional[int] = None,
|
146
|
+
verbose: Optional[Union[bool, int]] = None,
|
147
|
+
cache_key: Optional[str] = None,
|
148
|
+
debug: Optional[bool] = None,
|
149
|
+
event_bus: Optional[ScenarioEventBus] = None,
|
65
150
|
):
|
66
|
-
|
151
|
+
"""
|
152
|
+
Initialize a scenario executor.
|
153
|
+
|
154
|
+
Args:
|
155
|
+
name: Human-readable name for the scenario (used in reports and logs)
|
156
|
+
description: Detailed description of what the scenario tests.
|
157
|
+
This guides the user simulator's behavior and provides context.
|
158
|
+
agents: List of agent adapters participating in the scenario.
|
159
|
+
Typically includes: agent under test, user simulator, and judge.
|
160
|
+
script: Optional list of script steps to control scenario flow.
|
161
|
+
If not provided, defaults to automatic proceeding.
|
162
|
+
max_turns: Maximum number of conversation turns before timeout.
|
163
|
+
Overrides global configuration for this scenario.
|
164
|
+
verbose: Whether to show detailed output during execution.
|
165
|
+
Can be True/False or integer level (2 for extra details).
|
166
|
+
cache_key: Cache key for deterministic behavior across runs.
|
167
|
+
Overrides global configuration for this scenario.
|
168
|
+
debug: Whether to enable debug mode with step-by-step execution.
|
169
|
+
Overrides global configuration for this scenario.
|
170
|
+
event_bus: Optional event bus for the scenario; a new ScenarioEventBus is created when omitted
|
171
|
+
|
172
|
+
Example:
|
173
|
+
```python
|
174
|
+
executor = ScenarioExecutor(
|
175
|
+
name="customer service test",
|
176
|
+
description="Customer has a billing question and needs help",
|
177
|
+
agents=[
|
178
|
+
customer_service_agent,
|
179
|
+
scenario.UserSimulatorAgent(),
|
180
|
+
scenario.JudgeAgent(criteria=[
|
181
|
+
"Agent is polite and professional",
|
182
|
+
"Agent addresses the billing question",
|
183
|
+
"Agent provides clear next steps"
|
184
|
+
])
|
185
|
+
],
|
186
|
+
max_turns=15,
|
187
|
+
verbose=True,
|
188
|
+
debug=False
|
189
|
+
)
|
190
|
+
```
|
191
|
+
"""
|
192
|
+
self.name = name
|
193
|
+
self.description = description
|
194
|
+
self.agents = agents
|
195
|
+
self.script = script or [proceed()]
|
196
|
+
|
197
|
+
config = ScenarioConfig(
|
198
|
+
max_turns=max_turns,
|
199
|
+
verbose=verbose,
|
200
|
+
cache_key=cache_key,
|
201
|
+
debug=debug,
|
202
|
+
)
|
203
|
+
self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)
|
67
204
|
|
68
|
-
self.scenario = scenario.model_copy()
|
69
|
-
self._context = context
|
70
|
-
self._script = script or [scenario.proceed()]
|
71
|
-
self.current_turn = 0
|
72
205
|
self.reset()
|
73
206
|
|
207
|
+
self.event_bus = event_bus or ScenarioEventBus()
|
208
|
+
|
209
|
+
self.batch_run_id = get_or_create_batch_run_id()
|
210
|
+
|
211
|
+
@classmethod
|
212
|
+
async def run(
|
213
|
+
cls,
|
214
|
+
name: str,
|
215
|
+
description: str,
|
216
|
+
agents: List[AgentAdapter] = [],
|
217
|
+
max_turns: Optional[int] = None,
|
218
|
+
verbose: Optional[Union[bool, int]] = None,
|
219
|
+
cache_key: Optional[str] = None,
|
220
|
+
debug: Optional[bool] = None,
|
221
|
+
script: Optional[List[ScriptStep]] = None,
|
222
|
+
) -> ScenarioResult:
|
223
|
+
"""
|
224
|
+
High-level interface for running a scenario test.
|
225
|
+
|
226
|
+
This is the main entry point for executing scenario tests. It creates a
|
227
|
+
ScenarioExecutor instance and runs it in an isolated thread pool to support
|
228
|
+
parallel execution and prevent blocking.
|
229
|
+
|
230
|
+
Args:
|
231
|
+
name: Human-readable name for the scenario
|
232
|
+
description: Detailed description of what the scenario tests
|
233
|
+
agents: List of agent adapters (agent under test, user simulator, judge)
|
234
|
+
max_turns: Maximum conversation turns before timeout (default: 10)
|
235
|
+
verbose: Show detailed output during execution
|
236
|
+
cache_key: Cache key for deterministic behavior
|
237
|
+
debug: Enable debug mode for step-by-step execution
|
238
|
+
script: Optional script steps to control scenario flow
|
239
|
+
|
240
|
+
Returns:
|
241
|
+
ScenarioResult containing the test outcome, conversation history,
|
242
|
+
success/failure status, and detailed reasoning
|
243
|
+
|
244
|
+
Example:
|
245
|
+
```
|
246
|
+
import scenario
|
247
|
+
|
248
|
+
# Simple scenario with automatic flow
|
249
|
+
result = await scenario.run(
|
250
|
+
name="help request",
|
251
|
+
description="User asks for help with a technical problem",
|
252
|
+
agents=[
|
253
|
+
my_agent,
|
254
|
+
scenario.UserSimulatorAgent(),
|
255
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
256
|
+
]
|
257
|
+
)
|
258
|
+
|
259
|
+
# Scripted scenario with custom evaluations
|
260
|
+
result = await scenario.run(
|
261
|
+
name="custom interaction",
|
262
|
+
description="Test specific conversation flow",
|
263
|
+
agents=[
|
264
|
+
my_agent,
|
265
|
+
scenario.UserSimulatorAgent(),
|
266
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
267
|
+
],
|
268
|
+
script=[
|
269
|
+
scenario.user("Hello"),
|
270
|
+
scenario.agent(),
|
271
|
+
custom_eval,
|
272
|
+
scenario.succeed()
|
273
|
+
]
|
274
|
+
)
|
275
|
+
|
276
|
+
# Results analysis
|
277
|
+
print(f"Test {'PASSED' if result.success else 'FAILED'}")
|
278
|
+
print(f"Reasoning: {result.reasoning}")
|
279
|
+
print(f"Conversation had {len(result.messages)} messages")
|
280
|
+
```
|
281
|
+
|
282
|
+
Note:
|
283
|
+
- Runs in isolated thread pool to support parallel execution
|
284
|
+
- Awaiting the call suspends the caller, without blocking its event loop, until the scenario finishes
|
285
|
+
- All agent calls are automatically cached when cache_key is set
|
286
|
+
- The event bus is drained and the worker event loop is closed even if the scenario raises
|
287
|
+
"""
|
288
|
+
scenario = cls(
|
289
|
+
name=name,
|
290
|
+
description=description,
|
291
|
+
agents=agents,
|
292
|
+
max_turns=max_turns,
|
293
|
+
verbose=verbose,
|
294
|
+
cache_key=cache_key,
|
295
|
+
debug=debug,
|
296
|
+
script=script,
|
297
|
+
)
|
298
|
+
|
299
|
+
# We use a thread pool to run the execution logic. A separate
|
300
|
+
# thread is required because, even though asyncio is used
|
301
|
+
# throughout, user code in a callback may block, which would
|
302
|
+
# otherwise prevent scenarios from running in parallel.
|
303
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
304
|
+
|
305
|
+
def run_in_thread():
|
306
|
+
loop = asyncio.new_event_loop()
|
307
|
+
asyncio.set_event_loop(loop)
|
308
|
+
|
309
|
+
try:
|
310
|
+
return loop.run_until_complete(scenario._run())
|
311
|
+
finally:
|
312
|
+
loop.run_until_complete(scenario.event_bus.drain())
|
313
|
+
loop.close()
|
314
|
+
|
315
|
+
# Run the function in the thread pool and await its result
|
316
|
+
# This converts the thread's execution into a Future that the current
|
317
|
+
# event loop can await without blocking
|
318
|
+
loop = asyncio.get_event_loop()
|
319
|
+
result = await loop.run_in_executor(executor, run_in_thread)
|
320
|
+
return result
|
321
|
+
|
74
322
|
def reset(self):
|
75
|
-
|
76
|
-
|
323
|
+
"""
|
324
|
+
Reset the scenario executor to initial state.
|
325
|
+
|
326
|
+
This method reinitializes all internal state for a fresh scenario run,
|
327
|
+
including conversation history, turn counters, and agent timing information.
|
328
|
+
Called automatically during initialization and can be used to rerun scenarios.
|
329
|
+
"""
|
330
|
+
self._state = ScenarioState(
|
331
|
+
description=self.description,
|
332
|
+
messages=[],
|
333
|
+
thread_id=str(PKSUID("thread")),
|
334
|
+
current_turn=0,
|
335
|
+
config=self.config,
|
336
|
+
_executor=self,
|
337
|
+
)
|
338
|
+
# Pydantic doesn't actually set the _executor field from the constructor, as it's private, so we need to do it manually
|
339
|
+
self._state._executor = self
|
340
|
+
|
77
341
|
self._pending_messages = {}
|
78
|
-
self.thread_id = str(PKSUID("thread"))
|
79
342
|
self._total_start_time = time.time()
|
80
343
|
self._agent_times = {}
|
81
344
|
|
82
|
-
for AgentClass in self.scenario.agents:
|
83
|
-
self._agents.append(
|
84
|
-
AgentClass(
|
85
|
-
input=AgentInput(
|
86
|
-
thread_id=self.thread_id,
|
87
|
-
messages=[],
|
88
|
-
new_messages=[],
|
89
|
-
context=self._context or {},
|
90
|
-
requested_role=list(AgentClass.roles)[0],
|
91
|
-
scenario_state=self,
|
92
|
-
)
|
93
|
-
)
|
94
|
-
)
|
95
|
-
|
96
345
|
self._new_turn()
|
97
|
-
self.current_turn = 0
|
346
|
+
self._state.current_turn = 0
|
98
347
|
|
99
|
-
context_scenario.set(self
|
348
|
+
context_scenario.set(self)
|
100
349
|
|
101
350
|
def add_message(
|
102
351
|
self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
|
103
352
|
):
|
104
|
-
|
353
|
+
"""
|
354
|
+
Add a message to the conversation and broadcast to other agents.
|
355
|
+
|
356
|
+
This method adds a message to the conversation history and makes it available
|
357
|
+
to other agents in their next call. It's used internally by the executor
|
358
|
+
and can be called from script steps to inject custom messages.
|
359
|
+
|
360
|
+
Args:
|
361
|
+
message: OpenAI-compatible message to add to the conversation
|
362
|
+
from_agent_idx: Index of the agent that generated this message.
|
363
|
+
Used to avoid broadcasting the message back to its creator.
|
364
|
+
|
365
|
+
Example:
|
366
|
+
```
|
367
|
+
def inject_system_message(state: ScenarioState) -> None:
|
368
|
+
state.add_message({
|
369
|
+
"role": "system",
|
370
|
+
"content": "The user is now in a hurry"
|
371
|
+
})
|
372
|
+
|
373
|
+
# Use in script
|
374
|
+
result = await scenario.run(
|
375
|
+
name="system message test",
|
376
|
+
agents=[agent, user_sim, judge],
|
377
|
+
script=[
|
378
|
+
scenario.user("Hello"),
|
379
|
+
scenario.agent(),
|
380
|
+
inject_system_message,
|
381
|
+
scenario.user(), # Will see the system message
|
382
|
+
scenario.succeed()
|
383
|
+
]
|
384
|
+
)
|
385
|
+
```
|
386
|
+
"""
|
387
|
+
self._state.messages.append(message)
|
105
388
|
|
106
389
|
# Broadcast the message to other agents
|
107
|
-
for idx, _ in enumerate(self.
|
390
|
+
for idx, _ in enumerate(self.agents):
|
108
391
|
if idx == from_agent_idx:
|
109
392
|
continue
|
110
393
|
if idx not in self._pending_messages:
|
111
394
|
self._pending_messages[idx] = []
|
112
395
|
self._pending_messages[idx].append(message)
|
113
396
|
|
397
|
+
|
114
398
|
def add_messages(
|
115
399
|
self,
|
116
400
|
messages: List[ChatCompletionMessageParam],
|
117
401
|
from_agent_idx: Optional[int] = None,
|
118
402
|
):
|
403
|
+
"""
|
404
|
+
Add multiple messages to the conversation.
|
405
|
+
|
406
|
+
Convenience method for adding multiple messages at once. Each message
|
407
|
+
is added individually using add_message().
|
408
|
+
|
409
|
+
Args:
|
410
|
+
messages: List of OpenAI-compatible messages to add
|
411
|
+
from_agent_idx: Index of the agent that generated these messages
|
412
|
+
|
413
|
+
Example:
|
414
|
+
```
|
415
|
+
# Agent returns multiple messages for a complex interaction
|
416
|
+
messages = [
|
417
|
+
{"role": "assistant", "content": "Let me search for that..."},
|
418
|
+
{"role": "assistant", "content": "Here's what I found: ..."}
|
419
|
+
]
|
420
|
+
executor.add_messages(messages, from_agent_idx=0)
|
421
|
+
```
|
422
|
+
"""
|
119
423
|
for message in messages:
|
120
424
|
self.add_message(message, from_agent_idx)
|
121
425
|
|
122
426
|
def _new_turn(self):
|
123
|
-
self._pending_agents_on_turn = set(self.
|
427
|
+
self._pending_agents_on_turn = set(self.agents)
|
124
428
|
self._pending_roles_on_turn = [
|
125
|
-
|
126
|
-
|
127
|
-
|
429
|
+
AgentRole.USER,
|
430
|
+
AgentRole.AGENT,
|
431
|
+
AgentRole.JUDGE,
|
128
432
|
]
|
129
|
-
self.current_turn += 1
|
433
|
+
self._state.current_turn += 1
|
130
434
|
|
131
435
|
async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
|
436
|
+
"""
|
437
|
+
Execute a single step in the scenario.
|
438
|
+
|
439
|
+
A step consists of calling the next agent in the current turn's sequence
|
440
|
+
and processing their response. This method is used internally by the
|
441
|
+
scenario execution flow.
|
442
|
+
|
443
|
+
Returns:
|
444
|
+
Either a list of messages (if the scenario continues) or a
|
445
|
+
ScenarioResult (if the scenario should end)
|
446
|
+
|
447
|
+
Raises:
|
448
|
+
ValueError: If no result is returned from the internal step method
|
449
|
+
|
450
|
+
Note:
|
451
|
+
This is primarily an internal method. Most users should use the
|
452
|
+
high-level run() method or script DSL functions instead.
|
453
|
+
"""
|
132
454
|
result = await self._step()
|
133
455
|
if result is None:
|
134
456
|
raise ValueError("No result from step")
|
@@ -139,8 +461,8 @@ class ScenarioExecutor:
|
|
139
461
|
go_to_next_turn=True,
|
140
462
|
on_turn: Optional[
|
141
463
|
Union[
|
142
|
-
Callable[["
|
143
|
-
Callable[["
|
464
|
+
Callable[["ScenarioState"], None],
|
465
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
144
466
|
]
|
145
467
|
] = None,
|
146
468
|
) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
|
@@ -151,9 +473,9 @@ class ScenarioExecutor:
|
|
151
473
|
self._new_turn()
|
152
474
|
|
153
475
|
if on_turn:
|
154
|
-
await await_if_awaitable(on_turn(self))
|
476
|
+
await await_if_awaitable(on_turn(self._state))
|
155
477
|
|
156
|
-
if self.current_turn >= (self.
|
478
|
+
if self._state.current_turn >= (self.config.max_turns or 10):
|
157
479
|
return self._reached_max_turns()
|
158
480
|
|
159
481
|
current_role = self._pending_roles_on_turn[0]
|
@@ -166,10 +488,14 @@ class ScenarioExecutor:
|
|
166
488
|
return await self._call_agent(idx, role=current_role)
|
167
489
|
|
168
490
|
def _next_agent_for_role(
|
169
|
-
self, role:
|
170
|
-
) -> Tuple[int, Optional[
|
171
|
-
for idx, agent in enumerate(self.
|
172
|
-
if
|
491
|
+
self, role: AgentRole
|
492
|
+
) -> Tuple[int, Optional[AgentAdapter]]:
|
493
|
+
for idx, agent in enumerate(self.agents):
|
494
|
+
if (
|
495
|
+
role == agent.role
|
496
|
+
and agent in self._pending_agents_on_turn
|
497
|
+
and agent.role in self._pending_roles_on_turn
|
498
|
+
):
|
173
499
|
return idx, agent
|
174
500
|
return -1, None
|
175
501
|
|
@@ -177,8 +503,8 @@ class ScenarioExecutor:
|
|
177
503
|
# If we reached max turns without conclusion, fail the test
|
178
504
|
agent_roles_agents_idx = [
|
179
505
|
idx
|
180
|
-
for idx, agent in enumerate(self.
|
181
|
-
if
|
506
|
+
for idx, agent in enumerate(self.agents)
|
507
|
+
if agent.role == AgentRole.AGENT
|
182
508
|
]
|
183
509
|
agent_times = [
|
184
510
|
self._agent_times[idx]
|
@@ -189,14 +515,14 @@ class ScenarioExecutor:
|
|
189
515
|
|
190
516
|
return ScenarioResult(
|
191
517
|
success=False,
|
192
|
-
messages=self.messages,
|
518
|
+
messages=self._state.messages,
|
193
519
|
reasoning=error_message
|
194
|
-
or f"Reached maximum turns ({self.
|
520
|
+
or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion",
|
195
521
|
total_time=time.time() - self._total_start_time,
|
196
522
|
agent_time=agent_time,
|
197
523
|
)
|
198
524
|
|
199
|
-
async def
|
525
|
+
async def _run(self) -> ScenarioResult:
|
200
526
|
"""
|
201
527
|
Run a scenario against the agent under test.
|
202
528
|
|
@@ -206,37 +532,61 @@ class ScenarioExecutor:
|
|
206
532
|
Returns:
|
207
533
|
ScenarioResult containing the test outcome
|
208
534
|
"""
|
535
|
+
scenario_run_id = generate_scenario_run_id()
|
209
536
|
|
210
|
-
|
211
|
-
|
537
|
+
try:
|
538
|
+
await self.event_bus.listen()
|
539
|
+
self._emit_run_started_event(scenario_run_id)
|
212
540
|
|
213
|
-
|
541
|
+
if self.config.verbose:
|
542
|
+
print("") # new line
|
214
543
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
544
|
+
self.reset()
|
545
|
+
|
546
|
+
for script_step in self.script:
|
547
|
+
callable = script_step(self._state)
|
548
|
+
if isinstance(callable, Awaitable):
|
549
|
+
result = await callable
|
550
|
+
else:
|
551
|
+
result = callable
|
552
|
+
self._emit_message_snapshot_event(scenario_run_id)
|
221
553
|
|
222
|
-
|
223
|
-
|
554
|
+
if isinstance(result, ScenarioResult):
|
555
|
+
status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
|
556
|
+
self._emit_run_finished_event(scenario_run_id, result, status)
|
557
|
+
return result
|
224
558
|
|
225
|
-
|
226
|
-
|
559
|
+
result = self._reached_max_turns(
|
560
|
+
"""Reached end of script without conclusion, add one of the following to the end of the script:
|
227
561
|
|
228
562
|
- `scenario.proceed()` to let the simulation continue to play out
|
229
563
|
- `scenario.judge()` to force criteria judgement
|
230
564
|
- `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
|
231
|
-
|
232
|
-
|
565
|
+
"""
|
566
|
+
)
|
567
|
+
|
568
|
+
status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
|
569
|
+
self._emit_run_finished_event(scenario_run_id, result, status)
|
570
|
+
return result
|
571
|
+
|
572
|
+
except Exception as e:
|
573
|
+
# Publish failure event before propagating the error
|
574
|
+
error_result = ScenarioResult(
|
575
|
+
success=False,
|
576
|
+
messages=self._state.messages,
|
577
|
+
reasoning=f"Scenario failed with error: {str(e)}",
|
578
|
+
total_time=time.time() - self._total_start_time,
|
579
|
+
agent_time=0,
|
580
|
+
)
|
581
|
+
self._emit_run_finished_event(scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR)
|
582
|
+
raise # Re-raise the exception after cleanup
|
233
583
|
|
234
584
|
async def _call_agent(
|
235
|
-
self, idx: int, role:
|
585
|
+
self, idx: int, role: AgentRole, request_judgment: bool = False
|
236
586
|
) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
|
237
|
-
agent = self.
|
587
|
+
agent = self.agents[idx]
|
238
588
|
|
239
|
-
if role ==
|
589
|
+
if role == AgentRole.USER and self.config.debug:
|
240
590
|
print(
|
241
591
|
f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
|
242
592
|
)
|
@@ -258,28 +608,26 @@ class ScenarioExecutor:
|
|
258
608
|
with show_spinner(
|
259
609
|
text=(
|
260
610
|
"Judging..."
|
261
|
-
if role ==
|
262
|
-
else f"{role.value if isinstance(role,
|
611
|
+
if role == AgentRole.JUDGE
|
612
|
+
else f"{role.value if isinstance(role, AgentRole) else role}:"
|
263
613
|
),
|
264
614
|
color=(
|
265
615
|
"blue"
|
266
|
-
if role ==
|
267
|
-
else "green" if role ==
|
616
|
+
if role == AgentRole.AGENT
|
617
|
+
else "green" if role == AgentRole.USER else "yellow"
|
268
618
|
),
|
269
|
-
enabled=self.
|
619
|
+
enabled=self.config.verbose,
|
270
620
|
):
|
271
621
|
start_time = time.time()
|
272
622
|
|
273
623
|
agent_response = agent.call(
|
274
624
|
AgentInput(
|
275
625
|
# TODO: test thread_id
|
276
|
-
thread_id=self.thread_id,
|
277
|
-
messages=self.messages,
|
626
|
+
thread_id=self._state.thread_id,
|
627
|
+
messages=self._state.messages,
|
278
628
|
new_messages=self._pending_messages.get(idx, []),
|
279
|
-
|
280
|
-
|
281
|
-
requested_role=role,
|
282
|
-
scenario_state=self,
|
629
|
+
judgment_request=request_judgment,
|
630
|
+
scenario_state=self._state,
|
283
631
|
)
|
284
632
|
)
|
285
633
|
if not isinstance(agent_response, Awaitable):
|
@@ -303,12 +651,12 @@ class ScenarioExecutor:
|
|
303
651
|
else:
|
304
652
|
messages = convert_agent_return_types_to_openai_messages(
|
305
653
|
agent_response,
|
306
|
-
role="user" if role ==
|
654
|
+
role="user" if role == AgentRole.USER else "assistant",
|
307
655
|
)
|
308
656
|
|
309
657
|
self.add_messages(messages, from_agent_idx=idx)
|
310
658
|
|
311
|
-
if messages and self.
|
659
|
+
if messages and self.config.verbose:
|
312
660
|
print_openai_messages(
|
313
661
|
self._scenario_name(),
|
314
662
|
[m for m in messages if m["role"] != "system"],
|
@@ -317,75 +665,51 @@ class ScenarioExecutor:
|
|
317
665
|
return messages
|
318
666
|
|
319
667
|
def _scenario_name(self):
|
320
|
-
if self.
|
321
|
-
return termcolor.colored(f"[Scenario: {self.
|
668
|
+
if self.config.verbose == 2:
|
669
|
+
return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")
|
322
670
|
else:
|
323
671
|
return ""
|
324
672
|
|
325
|
-
# State access utils
|
326
|
-
|
327
|
-
def last_message(self) -> ChatCompletionMessageParam:
|
328
|
-
if len(self.messages) == 0:
|
329
|
-
raise ValueError("No messages found")
|
330
|
-
return self.messages[-1]
|
331
|
-
|
332
|
-
def last_user_message(self) -> ChatCompletionUserMessageParam:
|
333
|
-
user_messages = [m for m in self.messages if m["role"] == "user"]
|
334
|
-
if not user_messages:
|
335
|
-
raise ValueError("No user messages found")
|
336
|
-
return user_messages[-1]
|
337
|
-
|
338
|
-
def last_tool_call(
|
339
|
-
self, tool_name: str
|
340
|
-
) -> Optional[ChatCompletionMessageToolCallParam]:
|
341
|
-
for message in reversed(self.messages):
|
342
|
-
if message["role"] == "assistant" and "tool_calls" in message:
|
343
|
-
for tool_call in message["tool_calls"]:
|
344
|
-
if tool_call["function"]["name"] == tool_name:
|
345
|
-
return tool_call
|
346
|
-
return None
|
347
|
-
|
348
|
-
def has_tool_call(self, tool_name: str) -> bool:
|
349
|
-
return self.last_tool_call(tool_name) is not None
|
350
|
-
|
351
673
|
# Scripting utils
|
352
674
|
|
353
675
|
async def message(self, message: ChatCompletionMessageParam) -> None:
|
354
676
|
if message["role"] == "user":
|
355
|
-
await self._script_call_agent(
|
677
|
+
await self._script_call_agent(AgentRole.USER, message)
|
356
678
|
elif message["role"] == "assistant":
|
357
|
-
await self._script_call_agent(
|
679
|
+
await self._script_call_agent(AgentRole.AGENT, message)
|
358
680
|
else:
|
359
681
|
self.add_message(message)
|
360
682
|
|
361
683
|
async def user(
|
362
684
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
363
685
|
) -> None:
|
364
|
-
await self._script_call_agent(
|
686
|
+
await self._script_call_agent(AgentRole.USER, content)
|
365
687
|
|
366
688
|
async def agent(
|
367
689
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
368
690
|
) -> None:
|
369
|
-
await self._script_call_agent(
|
691
|
+
await self._script_call_agent(AgentRole.AGENT, content)
|
370
692
|
|
371
693
|
async def judge(
|
372
694
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
373
695
|
) -> Optional[ScenarioResult]:
|
374
|
-
return await self._script_call_agent(
|
696
|
+
return await self._script_call_agent(
|
697
|
+
AgentRole.JUDGE, content, request_judgment=True
|
698
|
+
)
|
375
699
|
|
376
700
|
async def proceed(
|
377
701
|
self,
|
378
702
|
turns: Optional[int] = None,
|
379
703
|
on_turn: Optional[
|
380
704
|
Union[
|
381
|
-
Callable[["
|
382
|
-
Callable[["
|
705
|
+
Callable[["ScenarioState"], None],
|
706
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
383
707
|
]
|
384
708
|
] = None,
|
385
709
|
on_step: Optional[
|
386
710
|
Union[
|
387
|
-
Callable[["
|
388
|
-
Callable[["
|
711
|
+
Callable[["ScenarioState"], None],
|
712
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
389
713
|
]
|
390
714
|
] = None,
|
391
715
|
) -> Optional[ScenarioResult]:
|
@@ -396,71 +720,223 @@ class ScenarioExecutor:
|
|
396
720
|
go_to_next_turn=(
|
397
721
|
turns is None
|
398
722
|
or initial_turn is None
|
399
|
-
or (self.current_turn + 1 < initial_turn + turns)
|
723
|
+
or (self._state.current_turn + 1 < initial_turn + turns)
|
400
724
|
),
|
401
725
|
)
|
402
726
|
|
403
727
|
if initial_turn is None:
|
404
|
-
initial_turn = self.current_turn
|
728
|
+
initial_turn = self._state.current_turn
|
405
729
|
|
406
730
|
if next_message is None:
|
407
731
|
break
|
408
732
|
|
409
733
|
if on_step:
|
410
|
-
await await_if_awaitable(on_step(self))
|
734
|
+
await await_if_awaitable(on_step(self._state))
|
411
735
|
|
412
736
|
if isinstance(next_message, ScenarioResult):
|
413
737
|
return next_message
|
414
738
|
|
415
|
-
async def succeed(self) -> ScenarioResult:
|
739
|
+
async def succeed(self, reasoning: Optional[str] = None) -> ScenarioResult:
|
416
740
|
return ScenarioResult(
|
417
741
|
success=True,
|
418
|
-
messages=self.messages,
|
419
|
-
reasoning=
|
420
|
-
|
742
|
+
messages=self._state.messages,
|
743
|
+
reasoning=reasoning
|
744
|
+
or "Scenario marked as successful with scenario.succeed()",
|
421
745
|
)
|
422
746
|
|
423
|
-
async def fail(self) -> ScenarioResult:
|
747
|
+
async def fail(self, reasoning: Optional[str] = None) -> ScenarioResult:
    """
    Build a failing result for the scenario.

    Args:
        reasoning: Optional explanation to attach to the result. When it is
            missing or empty, a default message naming `scenario.fail()` is
            used instead.

    Returns:
        A ScenarioResult with `success=False` carrying the messages currently
        held in the executor state.
    """
    if reasoning:
        explanation = reasoning
    else:
        explanation = "Scenario marked as failed with scenario.fail()"

    return ScenarioResult(
        success=False,
        messages=self._state.messages,
        reasoning=explanation,
    )
|
430
753
|
|
754
|
+
def _consume_until_role(self, role: AgentRole) -> None:
    """
    Drop pending roles from the front of the current turn until `role` is next.

    Every entry of `self._pending_roles_on_turn` that precedes the first
    occurrence of `role` is removed. If `role` is not pending at all, the
    list is emptied. The list is modified in place, so any other reference
    to it observes the change.

    Args:
        role: The role that should end up at the head of the pending list.
    """
    pending = self._pending_roles_on_turn
    try:
        first_match = pending.index(role)
    except ValueError:
        # The role is not pending on this turn: everything gets skipped.
        pending.clear()
    else:
        # One slice deletion instead of repeated O(n) pop(0) calls.
        del pending[:first_match]
|
760
|
+
|
431
761
|
async def _script_call_agent(
    self,
    role: AgentRole,
    content: Optional[Union[str, ChatCompletionMessageParam]] = None,
    request_judgment: bool = False,
) -> Optional[ScenarioResult]:
    """
    Execute one scripted step on behalf of the given role.

    Pending roles ahead of `role` are skipped first. If no agent for `role`
    is then available, a new turn is started and the lookup is retried once.

    When `content` is given (and truthy) it is recorded as the message for
    this step and the agent is not called: a string becomes a user message
    for `AgentRole.USER` and an assistant message for any other role, while
    a non-string value is added unchanged. Without content, the agent is
    called through `_call_agent`.

    Args:
        role: Role whose agent should act on this step.
        content: Optional fixed message to use instead of calling the agent.
        request_judgment: Forwarded to `_call_agent`.

    Returns:
        The ScenarioResult produced by the agent call, if it returned one;
        otherwise None.

    Raises:
        ValueError: If no agent with `role` is found even after starting a
            new turn.
    """
    self._consume_until_role(role)
    idx, next_agent = self._next_agent_for_role(role)

    if not next_agent:
        # Nothing left for this role on the current turn: open a new turn
        # and look once more.
        self._new_turn()
        self._consume_until_role(role)
        idx, next_agent = self._next_agent_for_role(role)

    if not next_agent:
        if role == AgentRole.USER:
            role_class = "a scenario.UserSimulatorAgent()"
        elif role == AgentRole.JUDGE:
            role_class = "a scenario.JudgeAgent()"
        else:
            role_class = "your agent"

        if content:
            raise ValueError(
                f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
            )
        raise ValueError(
            f"Cannot generate a message for role `{role.value}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
        )

    self._pending_agents_on_turn.remove(next_agent)

    if not content:
        # No scripted content: let the agent produce the step itself.
        outcome = await self._call_agent(
            idx, role=role, request_judgment=request_judgment
        )
        return outcome if isinstance(outcome, ScenarioResult) else None

    if not isinstance(content, str):
        # Already a message object; use it as provided.
        message = content
    elif role == AgentRole.USER:
        message = ChatCompletionUserMessageParam(role="user", content=content)
    else:
        message = ChatCompletionAssistantMessageParam(
            role="assistant", content=content
        )

    self.add_message(message)
    if self.config.verbose:
        print_openai_messages(self._scenario_name(), [message])
    return None
|
816
|
+
|
817
|
+
# Event handling methods
|
818
|
+
|
819
|
+
class _CommonEventFields(TypedDict):
    """
    Common fields shared across all scenario events.

    These fields provide consistent identification and timing information
    for all events emitted during scenario execution. Instances are built by
    `_create_common_event_fields` and unpacked into each event constructor.

    Attributes:
        batch_run_id: Unique identifier for the batch of scenario runs
        scenario_run_id: Unique identifier for this specific scenario run
        scenario_id: Human-readable name/identifier for the scenario
        timestamp: Unix timestamp in milliseconds when the event occurred
    """

    # Identifier shared by every scenario run in the same batch.
    batch_run_id: str
    # Identifier of the individual scenario run the event belongs to.
    scenario_run_id: str
    # Scenario name, used as its identifier.
    scenario_id: str
    # Milliseconds since the Unix epoch.
    timestamp: int
|
836
|
+
|
837
|
+
def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
    """
    Assemble the fields that every scenario event carries.

    Args:
        scenario_run_id: Unique identifier for the current scenario run

    Returns:
        A mapping with `batch_run_id` (from the executor), the given
        `scenario_run_id`, `scenario_id` (the executor's name), and
        `timestamp`, the current time in whole milliseconds since the epoch.
    """
    batch_id = self.batch_run_id
    scenario_name = self.name
    # time.time() is in seconds; events carry integer milliseconds.
    now_ms = int(time.time() * 1000)
    return {
        "batch_run_id": batch_id,
        "scenario_run_id": scenario_run_id,
        "scenario_id": scenario_name,
        "timestamp": now_ms,
    }
|
856
|
+
|
857
|
+
def _emit_run_started_event(self, scenario_run_id: str) -> None:
    """
    Publish a "scenario run started" event on the event bus.

    The event combines the common event fields for this run with metadata
    holding the scenario's name and description.

    Args:
        scenario_run_id: Unique identifier for the current scenario run
    """
    started_event = ScenarioRunStartedEvent(
        **self._create_common_event_fields(scenario_run_id),
        metadata=ScenarioRunStartedEventMetadata(
            name=self.name,
            description=self.description,
        ),
    )
    self.event_bus.publish(started_event)
|
883
|
+
|
884
|
+
def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
    """
    Publish a snapshot of the conversation so far on the event bus.

    The messages currently held in the executor state are converted with
    `convert_messages_to_ag_ui_messages` and sent together with the common
    event fields for this run.

    Args:
        scenario_run_id: Unique identifier for the current scenario run
    """
    shared_fields = self._create_common_event_fields(scenario_run_id)
    ag_ui_messages = convert_messages_to_ag_ui_messages(self._state.messages)
    snapshot_event = ScenarioMessageSnapshotEvent(
        **shared_fields,
        messages=ag_ui_messages,
    )
    self.event_bus.publish(snapshot_event)
|
904
|
+
|
905
|
+
def _emit_run_finished_event(
    self,
    scenario_run_id: str,
    result: ScenarioResult,
    status: ScenarioRunFinishedEventStatus,
) -> None:
    """
    Publish a "scenario run finished" event on the event bus.

    The verdict is SUCCESS when `result.success` is truthy and FAILURE
    otherwise. The result's reasoning (an empty string when absent) and its
    passed and failed criteria are reported as met and unmet criteria.

    Args:
        scenario_run_id: Unique identifier for the current scenario run
        result: The final scenario result containing success/failure status
        status: The execution status to report on the event
    """
    shared_fields = self._create_common_event_fields(scenario_run_id)

    if result.success:
        verdict = ScenarioRunFinishedEventVerdict.SUCCESS
    else:
        verdict = ScenarioRunFinishedEventVerdict.FAILURE

    outcome = ScenarioRunFinishedEventResults(
        verdict=verdict,
        reasoning=result.reasoning or "",
        met_criteria=result.passed_criteria,
        unmet_criteria=result.failed_criteria,
    )

    finished_event = ScenarioRunFinishedEvent(
        **shared_fields,
        status=status,
        results=outcome,
    )
    self.event_bus.publish(finished_event)
|