langwatch-scenario 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/METADATA +140 -79
- langwatch_scenario-0.4.0.dist-info/RECORD +18 -0
- scenario/__init__.py +223 -9
- scenario/agent_adapter.py +111 -0
- scenario/cache.py +132 -8
- scenario/config.py +154 -10
- scenario/error_messages.py +8 -38
- scenario/judge_agent.py +435 -0
- scenario/pytest_plugin.py +223 -15
- scenario/scenario_executor.py +428 -136
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +193 -20
- scenario/user_simulator_agent.py +249 -0
- scenario/utils.py +252 -2
- langwatch_scenario-0.3.0.dist-info/RECORD +0 -16
- scenario/scenario.py +0 -238
- scenario/scenario_agent_adapter.py +0 -16
- scenario/testing_agent.py +0 -279
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/top_level.txt +0 -0
scenario/scenario_executor.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1
1
|
"""
|
2
|
-
|
2
|
+
Scenario execution engine for agent testing.
|
3
|
+
|
4
|
+
This module contains the core ScenarioExecutor class that orchestrates the execution
|
5
|
+
of scenario tests, managing the interaction between user simulators, agents under test,
|
6
|
+
and judge agents to determine test success or failure.
|
3
7
|
"""
|
4
8
|
|
5
9
|
import sys
|
6
10
|
from typing import (
|
7
|
-
TYPE_CHECKING,
|
8
11
|
Awaitable,
|
9
12
|
Callable,
|
10
13
|
Dict,
|
@@ -17,7 +20,10 @@ from typing import (
|
|
17
20
|
)
|
18
21
|
import time
|
19
22
|
import termcolor
|
23
|
+
import asyncio
|
24
|
+
import concurrent.futures
|
20
25
|
|
26
|
+
from scenario.config import ScenarioConfig
|
21
27
|
from scenario.utils import (
|
22
28
|
await_if_awaitable,
|
23
29
|
check_valid_return_type,
|
@@ -28,83 +34,346 @@ from scenario.utils import (
|
|
28
34
|
from openai.types.chat import (
|
29
35
|
ChatCompletionMessageParam,
|
30
36
|
ChatCompletionUserMessageParam,
|
31
|
-
ChatCompletionMessageToolCallParam,
|
32
37
|
)
|
33
38
|
|
34
|
-
from .types import AgentInput,
|
39
|
+
from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
|
35
40
|
from .error_messages import agent_response_not_awaitable
|
36
41
|
from .cache import context_scenario
|
37
|
-
from .
|
42
|
+
from .agent_adapter import AgentAdapter
|
43
|
+
from .script import proceed
|
38
44
|
from pksuid import PKSUID
|
39
|
-
|
40
|
-
if TYPE_CHECKING:
|
41
|
-
from scenario.scenario import Scenario
|
45
|
+
from .scenario_state import ScenarioState
|
42
46
|
|
43
47
|
|
44
48
|
class ScenarioExecutor:
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
49
|
+
"""
|
50
|
+
Core orchestrator for scenario-based agent testing.
|
51
|
+
|
52
|
+
The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
|
53
|
+
- Orchestrating conversations between user simulators, agents, and judges
|
54
|
+
- Managing turn-based execution flow
|
55
|
+
- Handling script-based scenario control
|
56
|
+
- Collecting and reporting test results
|
57
|
+
- Supporting debug mode for interactive testing
|
58
|
+
|
59
|
+
This class serves as both a builder (for configuration) and an executor (for running tests).
|
60
|
+
Most users will interact with it through the high-level `scenario.run()` function rather
|
61
|
+
than instantiating it directly.
|
62
|
+
|
63
|
+
Attributes:
|
64
|
+
name: Human-readable name for the scenario
|
65
|
+
description: Detailed description of what the scenario tests
|
66
|
+
agents: List of agent adapters participating in the scenario
|
67
|
+
script: Optional list of script steps to control scenario flow
|
68
|
+
config: Configuration settings for execution behavior
|
69
|
+
|
70
|
+
Example:
|
71
|
+
```python
|
72
|
+
# Direct instantiation (less common)
|
73
|
+
executor = ScenarioExecutor(
|
74
|
+
name="weather query test",
|
75
|
+
description="User asks about weather, agent should provide helpful response",
|
76
|
+
agents=[
|
77
|
+
weather_agent,
|
78
|
+
scenario.UserSimulatorAgent(),
|
79
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
80
|
+
],
|
81
|
+
max_turns=10,
|
82
|
+
verbose=True
|
83
|
+
)
|
84
|
+
result = await executor._run()
|
85
|
+
|
86
|
+
# Preferred high-level API
|
87
|
+
result = await scenario.run(
|
88
|
+
name="weather query test",
|
89
|
+
description="User asks about weather, agent should provide helpful response",
|
90
|
+
agents=[
|
91
|
+
weather_agent,
|
92
|
+
scenario.UserSimulatorAgent(),
|
93
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
94
|
+
]
|
95
|
+
)
|
96
|
+
```
|
97
|
+
|
98
|
+
Note:
|
99
|
+
- Scenarios run in isolated thread pools to support parallel execution
|
100
|
+
- All agent interactions are cached when cache_key is configured
|
101
|
+
- Debug mode allows step-by-step execution with user intervention
|
102
|
+
- Results include detailed timing information and conversation history
|
103
|
+
"""
|
104
|
+
name: str
|
105
|
+
description: str
|
106
|
+
agents: List[AgentAdapter]
|
107
|
+
script: List[ScriptStep]
|
108
|
+
|
109
|
+
config: ScenarioConfig
|
110
|
+
|
111
|
+
_state: ScenarioState
|
53
112
|
_total_start_time: float
|
54
113
|
_pending_messages: Dict[int, List[ChatCompletionMessageParam]]
|
55
114
|
|
56
|
-
_pending_roles_on_turn: List[
|
57
|
-
_pending_agents_on_turn: Set[
|
115
|
+
_pending_roles_on_turn: List[AgentRole] = []
|
116
|
+
_pending_agents_on_turn: Set[AgentAdapter] = set()
|
58
117
|
_agent_times: Dict[int, float] = {}
|
59
118
|
|
60
119
|
def __init__(
|
61
120
|
self,
|
62
|
-
|
63
|
-
|
121
|
+
name: str,
|
122
|
+
description: str,
|
123
|
+
agents: List[AgentAdapter] = [],
|
64
124
|
script: Optional[List[ScriptStep]] = None,
|
125
|
+
# Config
|
126
|
+
max_turns: Optional[int] = None,
|
127
|
+
verbose: Optional[Union[bool, int]] = None,
|
128
|
+
cache_key: Optional[str] = None,
|
129
|
+
debug: Optional[bool] = None,
|
65
130
|
):
|
66
|
-
|
131
|
+
"""
|
132
|
+
Initialize a scenario executor.
|
133
|
+
|
134
|
+
Args:
|
135
|
+
name: Human-readable name for the scenario (used in reports and logs)
|
136
|
+
description: Detailed description of what the scenario tests.
|
137
|
+
This guides the user simulator's behavior and provides context.
|
138
|
+
agents: List of agent adapters participating in the scenario.
|
139
|
+
Typically includes: agent under test, user simulator, and judge.
|
140
|
+
script: Optional list of script steps to control scenario flow.
|
141
|
+
If not provided, defaults to automatic proceeding.
|
142
|
+
max_turns: Maximum number of conversation turns before timeout.
|
143
|
+
Overrides global configuration for this scenario.
|
144
|
+
verbose: Whether to show detailed output during execution.
|
145
|
+
Can be True/False or integer level (2 for extra details).
|
146
|
+
cache_key: Cache key for deterministic behavior across runs.
|
147
|
+
Overrides global configuration for this scenario.
|
148
|
+
debug: Whether to enable debug mode with step-by-step execution.
|
149
|
+
Overrides global configuration for this scenario.
|
150
|
+
|
151
|
+
Example:
|
152
|
+
```python
|
153
|
+
executor = ScenarioExecutor(
|
154
|
+
name="customer service test",
|
155
|
+
description="Customer has a billing question and needs help",
|
156
|
+
agents=[
|
157
|
+
customer_service_agent,
|
158
|
+
scenario.UserSimulatorAgent(),
|
159
|
+
scenario.JudgeAgent(criteria=[
|
160
|
+
"Agent is polite and professional",
|
161
|
+
"Agent addresses the billing question",
|
162
|
+
"Agent provides clear next steps"
|
163
|
+
])
|
164
|
+
],
|
165
|
+
max_turns=15,
|
166
|
+
verbose=True,
|
167
|
+
debug=False
|
168
|
+
)
|
169
|
+
```
|
170
|
+
"""
|
171
|
+
self.name = name
|
172
|
+
self.description = description
|
173
|
+
self.agents = agents
|
174
|
+
self.script = script or [proceed()]
|
175
|
+
|
176
|
+
config = ScenarioConfig(
|
177
|
+
max_turns=max_turns,
|
178
|
+
verbose=verbose,
|
179
|
+
cache_key=cache_key,
|
180
|
+
debug=debug,
|
181
|
+
)
|
182
|
+
self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)
|
67
183
|
|
68
|
-
self.scenario = scenario.model_copy()
|
69
|
-
self._context = context
|
70
|
-
self._script = script or [scenario.proceed()]
|
71
|
-
self.current_turn = 0
|
72
184
|
self.reset()
|
73
185
|
|
186
|
+
@classmethod
|
187
|
+
async def run(
|
188
|
+
cls,
|
189
|
+
name: str,
|
190
|
+
description: str,
|
191
|
+
agents: List[AgentAdapter] = [],
|
192
|
+
max_turns: Optional[int] = None,
|
193
|
+
verbose: Optional[Union[bool, int]] = None,
|
194
|
+
cache_key: Optional[str] = None,
|
195
|
+
debug: Optional[bool] = None,
|
196
|
+
script: Optional[List[ScriptStep]] = None,
|
197
|
+
) -> ScenarioResult:
|
198
|
+
"""
|
199
|
+
High-level interface for running a scenario test.
|
200
|
+
|
201
|
+
This is the main entry point for executing scenario tests. It creates a
|
202
|
+
ScenarioExecutor instance and runs it in an isolated thread pool to support
|
203
|
+
parallel execution and prevent blocking.
|
204
|
+
|
205
|
+
Args:
|
206
|
+
name: Human-readable name for the scenario
|
207
|
+
description: Detailed description of what the scenario tests
|
208
|
+
agents: List of agent adapters (agent under test, user simulator, judge)
|
209
|
+
max_turns: Maximum conversation turns before timeout (default: 10)
|
210
|
+
verbose: Show detailed output during execution
|
211
|
+
cache_key: Cache key for deterministic behavior
|
212
|
+
debug: Enable debug mode for step-by-step execution
|
213
|
+
script: Optional script steps to control scenario flow
|
214
|
+
|
215
|
+
Returns:
|
216
|
+
ScenarioResult containing the test outcome, conversation history,
|
217
|
+
success/failure status, and detailed reasoning
|
218
|
+
|
219
|
+
Example:
|
220
|
+
```python
|
221
|
+
import scenario
|
222
|
+
|
223
|
+
# Simple scenario with automatic flow
|
224
|
+
result = await scenario.run(
|
225
|
+
name="help request",
|
226
|
+
description="User asks for help with a technical problem",
|
227
|
+
agents=[
|
228
|
+
my_agent,
|
229
|
+
scenario.UserSimulatorAgent(),
|
230
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
231
|
+
]
|
232
|
+
)
|
233
|
+
|
234
|
+
# Scripted scenario with custom evaluations
|
235
|
+
result = await scenario.run(
|
236
|
+
name="custom interaction",
|
237
|
+
description="Test specific conversation flow",
|
238
|
+
agents=[
|
239
|
+
my_agent,
|
240
|
+
scenario.UserSimulatorAgent(),
|
241
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
242
|
+
],
|
243
|
+
script=[
|
244
|
+
scenario.user("Hello"),
|
245
|
+
scenario.agent(),
|
246
|
+
custom_eval,
|
247
|
+
scenario.succeed()
|
248
|
+
]
|
249
|
+
)
|
250
|
+
|
251
|
+
# Results analysis
|
252
|
+
print(f"Test {'PASSED' if result.success else 'FAILED'}")
|
253
|
+
print(f"Reasoning: {result.reasoning}")
|
254
|
+
print(f"Conversation had {len(result.messages)} messages")
|
255
|
+
```
|
256
|
+
|
257
|
+
Note:
|
258
|
+
- Runs in isolated thread pool to support parallel execution
|
259
|
+
- Blocks until scenario completes or times out
|
260
|
+
- All agent calls are automatically cached when cache_key is set
|
261
|
+
- Exception handling ensures clean resource cleanup
|
262
|
+
"""
|
263
|
+
scenario = cls(
|
264
|
+
name=name,
|
265
|
+
description=description,
|
266
|
+
agents=agents,
|
267
|
+
max_turns=max_turns,
|
268
|
+
verbose=verbose,
|
269
|
+
cache_key=cache_key,
|
270
|
+
debug=debug,
|
271
|
+
script=script,
|
272
|
+
)
|
273
|
+
|
274
|
+
# We'll use a thread pool to run the execution logic, we
|
275
|
+
# require a separate thread because even though asyncio is
|
276
|
+
# being used throughout, any user code on the callback can
|
277
|
+
# be blocking, preventing them from running scenarios in parallel
|
278
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
279
|
+
|
280
|
+
def run_in_thread():
|
281
|
+
loop = asyncio.new_event_loop()
|
282
|
+
asyncio.set_event_loop(loop)
|
283
|
+
|
284
|
+
try:
|
285
|
+
return loop.run_until_complete(scenario._run())
|
286
|
+
finally:
|
287
|
+
loop.close()
|
288
|
+
|
289
|
+
# Run the function in the thread pool and await its result
|
290
|
+
# This converts the thread's execution into a Future that the current
|
291
|
+
# event loop can await without blocking
|
292
|
+
loop = asyncio.get_event_loop()
|
293
|
+
result = await loop.run_in_executor(executor, run_in_thread)
|
294
|
+
return result
|
295
|
+
|
74
296
|
def reset(self):
|
75
|
-
|
76
|
-
|
297
|
+
"""
|
298
|
+
Reset the scenario executor to initial state.
|
299
|
+
|
300
|
+
This method reinitializes all internal state for a fresh scenario run,
|
301
|
+
including conversation history, turn counters, and agent timing information.
|
302
|
+
Called automatically during initialization and can be used to rerun scenarios.
|
303
|
+
|
304
|
+
Example:
|
305
|
+
```python
|
306
|
+
executor = ScenarioExecutor(...)
|
307
|
+
|
308
|
+
# Run first test
|
309
|
+
result1 = await executor._run()
|
310
|
+
|
311
|
+
# Reset and run again
|
312
|
+
executor.reset()
|
313
|
+
result2 = await executor._run()
|
314
|
+
```
|
315
|
+
"""
|
316
|
+
self._state = ScenarioState(
|
317
|
+
description=self.description,
|
318
|
+
messages=[],
|
319
|
+
thread_id=str(PKSUID("thread")),
|
320
|
+
current_turn=0,
|
321
|
+
config=self.config,
|
322
|
+
_executor=self,
|
323
|
+
)
|
324
|
+
# Pydantic doesn't actually set the _executor field from the constructor, as it's private, so we need to do it manually
|
325
|
+
self._state._executor = self
|
326
|
+
|
77
327
|
self._pending_messages = {}
|
78
|
-
self.thread_id = str(PKSUID("thread"))
|
79
328
|
self._total_start_time = time.time()
|
80
329
|
self._agent_times = {}
|
81
330
|
|
82
|
-
for AgentClass in self.scenario.agents:
|
83
|
-
self._agents.append(
|
84
|
-
AgentClass(
|
85
|
-
input=AgentInput(
|
86
|
-
thread_id=self.thread_id,
|
87
|
-
messages=[],
|
88
|
-
new_messages=[],
|
89
|
-
context=self._context or {},
|
90
|
-
requested_role=list(AgentClass.roles)[0],
|
91
|
-
scenario_state=self,
|
92
|
-
)
|
93
|
-
)
|
94
|
-
)
|
95
|
-
|
96
331
|
self._new_turn()
|
97
|
-
self.current_turn = 0
|
332
|
+
self._state.current_turn = 0
|
98
333
|
|
99
|
-
context_scenario.set(self
|
334
|
+
context_scenario.set(self)
|
100
335
|
|
101
336
|
def add_message(
|
102
337
|
self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
|
103
338
|
):
|
104
|
-
|
339
|
+
"""
|
340
|
+
Add a message to the conversation and broadcast to other agents.
|
341
|
+
|
342
|
+
This method adds a message to the conversation history and makes it available
|
343
|
+
to other agents in their next call. It's used internally by the executor
|
344
|
+
and can be called from script steps to inject custom messages.
|
345
|
+
|
346
|
+
Args:
|
347
|
+
message: OpenAI-compatible message to add to the conversation
|
348
|
+
from_agent_idx: Index of the agent that generated this message.
|
349
|
+
Used to avoid broadcasting the message back to its creator.
|
350
|
+
|
351
|
+
Example:
|
352
|
+
```python
|
353
|
+
def inject_system_message(state: ScenarioState) -> None:
|
354
|
+
state._executor.add_message({
|
355
|
+
"role": "system",
|
356
|
+
"content": "The user is now in a hurry"
|
357
|
+
})
|
358
|
+
|
359
|
+
# Use in script
|
360
|
+
result = await scenario.run(
|
361
|
+
name="system message test",
|
362
|
+
agents=[agent, user_sim, judge],
|
363
|
+
script=[
|
364
|
+
scenario.user("Hello"),
|
365
|
+
scenario.agent(),
|
366
|
+
inject_system_message,
|
367
|
+
scenario.user(), # Will see the system message
|
368
|
+
scenario.succeed()
|
369
|
+
]
|
370
|
+
)
|
371
|
+
```
|
372
|
+
"""
|
373
|
+
self._state.messages.append(message)
|
105
374
|
|
106
375
|
# Broadcast the message to other agents
|
107
|
-
for idx, _ in enumerate(self.
|
376
|
+
for idx, _ in enumerate(self.agents):
|
108
377
|
if idx == from_agent_idx:
|
109
378
|
continue
|
110
379
|
if idx not in self._pending_messages:
|
@@ -116,19 +385,57 @@ class ScenarioExecutor:
|
|
116
385
|
messages: List[ChatCompletionMessageParam],
|
117
386
|
from_agent_idx: Optional[int] = None,
|
118
387
|
):
|
388
|
+
"""
|
389
|
+
Add multiple messages to the conversation.
|
390
|
+
|
391
|
+
Convenience method for adding multiple messages at once. Each message
|
392
|
+
is added individually using add_message().
|
393
|
+
|
394
|
+
Args:
|
395
|
+
messages: List of OpenAI-compatible messages to add
|
396
|
+
from_agent_idx: Index of the agent that generated these messages
|
397
|
+
|
398
|
+
Example:
|
399
|
+
```python
|
400
|
+
# Agent returns multiple messages for a complex interaction
|
401
|
+
messages = [
|
402
|
+
{"role": "assistant", "content": "Let me search for that..."},
|
403
|
+
{"role": "assistant", "content": "Here's what I found: ..."}
|
404
|
+
]
|
405
|
+
executor.add_messages(messages, from_agent_idx=0)
|
406
|
+
```
|
407
|
+
"""
|
119
408
|
for message in messages:
|
120
409
|
self.add_message(message, from_agent_idx)
|
121
410
|
|
122
411
|
def _new_turn(self):
|
123
|
-
self._pending_agents_on_turn = set(self.
|
412
|
+
self._pending_agents_on_turn = set(self.agents)
|
124
413
|
self._pending_roles_on_turn = [
|
125
|
-
|
126
|
-
|
127
|
-
|
414
|
+
AgentRole.USER,
|
415
|
+
AgentRole.AGENT,
|
416
|
+
AgentRole.JUDGE,
|
128
417
|
]
|
129
|
-
self.current_turn += 1
|
418
|
+
self._state.current_turn += 1
|
130
419
|
|
131
420
|
async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
|
421
|
+
"""
|
422
|
+
Execute a single step in the scenario.
|
423
|
+
|
424
|
+
A step consists of calling the next agent in the current turn's sequence
|
425
|
+
and processing their response. This method is used internally by the
|
426
|
+
scenario execution flow.
|
427
|
+
|
428
|
+
Returns:
|
429
|
+
Either a list of messages (if the scenario continues) or a
|
430
|
+
ScenarioResult (if the scenario should end)
|
431
|
+
|
432
|
+
Raises:
|
433
|
+
ValueError: If no result is returned from the internal step method
|
434
|
+
|
435
|
+
Note:
|
436
|
+
This is primarily an internal method. Most users should use the
|
437
|
+
high-level run() method or script DSL functions instead.
|
438
|
+
"""
|
132
439
|
result = await self._step()
|
133
440
|
if result is None:
|
134
441
|
raise ValueError("No result from step")
|
@@ -139,8 +446,8 @@ class ScenarioExecutor:
|
|
139
446
|
go_to_next_turn=True,
|
140
447
|
on_turn: Optional[
|
141
448
|
Union[
|
142
|
-
Callable[["
|
143
|
-
Callable[["
|
449
|
+
Callable[["ScenarioState"], None],
|
450
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
144
451
|
]
|
145
452
|
] = None,
|
146
453
|
) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
|
@@ -151,9 +458,9 @@ class ScenarioExecutor:
|
|
151
458
|
self._new_turn()
|
152
459
|
|
153
460
|
if on_turn:
|
154
|
-
await await_if_awaitable(on_turn(self))
|
461
|
+
await await_if_awaitable(on_turn(self._state))
|
155
462
|
|
156
|
-
if self.current_turn >= (self.
|
463
|
+
if self._state.current_turn >= (self.config.max_turns or 10):
|
157
464
|
return self._reached_max_turns()
|
158
465
|
|
159
466
|
current_role = self._pending_roles_on_turn[0]
|
@@ -166,10 +473,10 @@ class ScenarioExecutor:
|
|
166
473
|
return await self._call_agent(idx, role=current_role)
|
167
474
|
|
168
475
|
def _next_agent_for_role(
|
169
|
-
self, role:
|
170
|
-
) -> Tuple[int, Optional[
|
171
|
-
for idx, agent in enumerate(self.
|
172
|
-
if role
|
476
|
+
self, role: AgentRole
|
477
|
+
) -> Tuple[int, Optional[AgentAdapter]]:
|
478
|
+
for idx, agent in enumerate(self.agents):
|
479
|
+
if role == agent.role and agent in self._pending_agents_on_turn:
|
173
480
|
return idx, agent
|
174
481
|
return -1, None
|
175
482
|
|
@@ -177,8 +484,8 @@ class ScenarioExecutor:
|
|
177
484
|
# If we reached max turns without conclusion, fail the test
|
178
485
|
agent_roles_agents_idx = [
|
179
486
|
idx
|
180
|
-
for idx, agent in enumerate(self.
|
181
|
-
if
|
487
|
+
for idx, agent in enumerate(self.agents)
|
488
|
+
if agent.role == AgentRole.AGENT
|
182
489
|
]
|
183
490
|
agent_times = [
|
184
491
|
self._agent_times[idx]
|
@@ -189,14 +496,14 @@ class ScenarioExecutor:
|
|
189
496
|
|
190
497
|
return ScenarioResult(
|
191
498
|
success=False,
|
192
|
-
messages=self.messages,
|
499
|
+
messages=self._state.messages,
|
193
500
|
reasoning=error_message
|
194
|
-
or f"Reached maximum turns ({self.
|
501
|
+
or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion",
|
195
502
|
total_time=time.time() - self._total_start_time,
|
196
503
|
agent_time=agent_time,
|
197
504
|
)
|
198
505
|
|
199
|
-
async def
|
506
|
+
async def _run(self) -> ScenarioResult:
|
200
507
|
"""
|
201
508
|
Run a scenario against the agent under test.
|
202
509
|
|
@@ -207,13 +514,13 @@ class ScenarioExecutor:
|
|
207
514
|
ScenarioResult containing the test outcome
|
208
515
|
"""
|
209
516
|
|
210
|
-
if self.
|
517
|
+
if self.config.verbose:
|
211
518
|
print("") # new line
|
212
519
|
|
213
520
|
self.reset()
|
214
521
|
|
215
|
-
for script_step in self.
|
216
|
-
callable = script_step(self)
|
522
|
+
for script_step in self.script:
|
523
|
+
callable = script_step(self._state)
|
217
524
|
if isinstance(callable, Awaitable):
|
218
525
|
result = await callable
|
219
526
|
else:
|
@@ -232,11 +539,11 @@ class ScenarioExecutor:
|
|
232
539
|
)
|
233
540
|
|
234
541
|
async def _call_agent(
|
235
|
-
self, idx: int, role:
|
542
|
+
self, idx: int, role: AgentRole, request_judgment: bool = False
|
236
543
|
) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
|
237
|
-
agent = self.
|
544
|
+
agent = self.agents[idx]
|
238
545
|
|
239
|
-
if role ==
|
546
|
+
if role == AgentRole.USER and self.config.debug:
|
240
547
|
print(
|
241
548
|
f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
|
242
549
|
)
|
@@ -258,28 +565,26 @@ class ScenarioExecutor:
|
|
258
565
|
with show_spinner(
|
259
566
|
text=(
|
260
567
|
"Judging..."
|
261
|
-
if role ==
|
262
|
-
else f"{role.value if isinstance(role,
|
568
|
+
if role == AgentRole.JUDGE
|
569
|
+
else f"{role.value if isinstance(role, AgentRole) else role}:"
|
263
570
|
),
|
264
571
|
color=(
|
265
572
|
"blue"
|
266
|
-
if role ==
|
267
|
-
else "green" if role ==
|
573
|
+
if role == AgentRole.AGENT
|
574
|
+
else "green" if role == AgentRole.USER else "yellow"
|
268
575
|
),
|
269
|
-
enabled=self.
|
576
|
+
enabled=self.config.verbose,
|
270
577
|
):
|
271
578
|
start_time = time.time()
|
272
579
|
|
273
580
|
agent_response = agent.call(
|
274
581
|
AgentInput(
|
275
582
|
# TODO: test thread_id
|
276
|
-
thread_id=self.thread_id,
|
277
|
-
messages=self.messages,
|
583
|
+
thread_id=self._state.thread_id,
|
584
|
+
messages=self._state.messages,
|
278
585
|
new_messages=self._pending_messages.get(idx, []),
|
279
|
-
|
280
|
-
|
281
|
-
requested_role=role,
|
282
|
-
scenario_state=self,
|
586
|
+
judgment_request=request_judgment,
|
587
|
+
scenario_state=self._state,
|
283
588
|
)
|
284
589
|
)
|
285
590
|
if not isinstance(agent_response, Awaitable):
|
@@ -303,12 +608,12 @@ class ScenarioExecutor:
|
|
303
608
|
else:
|
304
609
|
messages = convert_agent_return_types_to_openai_messages(
|
305
610
|
agent_response,
|
306
|
-
role="user" if role ==
|
611
|
+
role="user" if role == AgentRole.USER else "assistant",
|
307
612
|
)
|
308
613
|
|
309
614
|
self.add_messages(messages, from_agent_idx=idx)
|
310
615
|
|
311
|
-
if messages and self.
|
616
|
+
if messages and self.config.verbose:
|
312
617
|
print_openai_messages(
|
313
618
|
self._scenario_name(),
|
314
619
|
[m for m in messages if m["role"] != "system"],
|
@@ -317,75 +622,51 @@ class ScenarioExecutor:
|
|
317
622
|
return messages
|
318
623
|
|
319
624
|
def _scenario_name(self):
|
320
|
-
if self.
|
321
|
-
return termcolor.colored(f"[Scenario: {self.
|
625
|
+
if self.config.verbose == 2:
|
626
|
+
return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")
|
322
627
|
else:
|
323
628
|
return ""
|
324
629
|
|
325
|
-
# State access utils
|
326
|
-
|
327
|
-
def last_message(self) -> ChatCompletionMessageParam:
|
328
|
-
if len(self.messages) == 0:
|
329
|
-
raise ValueError("No messages found")
|
330
|
-
return self.messages[-1]
|
331
|
-
|
332
|
-
def last_user_message(self) -> ChatCompletionUserMessageParam:
|
333
|
-
user_messages = [m for m in self.messages if m["role"] == "user"]
|
334
|
-
if not user_messages:
|
335
|
-
raise ValueError("No user messages found")
|
336
|
-
return user_messages[-1]
|
337
|
-
|
338
|
-
def last_tool_call(
|
339
|
-
self, tool_name: str
|
340
|
-
) -> Optional[ChatCompletionMessageToolCallParam]:
|
341
|
-
for message in reversed(self.messages):
|
342
|
-
if message["role"] == "assistant" and "tool_calls" in message:
|
343
|
-
for tool_call in message["tool_calls"]:
|
344
|
-
if tool_call["function"]["name"] == tool_name:
|
345
|
-
return tool_call
|
346
|
-
return None
|
347
|
-
|
348
|
-
def has_tool_call(self, tool_name: str) -> bool:
|
349
|
-
return self.last_tool_call(tool_name) is not None
|
350
|
-
|
351
630
|
# Scripting utils
|
352
631
|
|
353
632
|
async def message(self, message: ChatCompletionMessageParam) -> None:
|
354
633
|
if message["role"] == "user":
|
355
|
-
await self._script_call_agent(
|
634
|
+
await self._script_call_agent(AgentRole.USER, message)
|
356
635
|
elif message["role"] == "assistant":
|
357
|
-
await self._script_call_agent(
|
636
|
+
await self._script_call_agent(AgentRole.AGENT, message)
|
358
637
|
else:
|
359
638
|
self.add_message(message)
|
360
639
|
|
361
640
|
async def user(
|
362
641
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
363
642
|
) -> None:
|
364
|
-
await self._script_call_agent(
|
643
|
+
await self._script_call_agent(AgentRole.USER, content)
|
365
644
|
|
366
645
|
async def agent(
|
367
646
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
368
647
|
) -> None:
|
369
|
-
await self._script_call_agent(
|
648
|
+
await self._script_call_agent(AgentRole.AGENT, content)
|
370
649
|
|
371
650
|
async def judge(
|
372
651
|
self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
|
373
652
|
) -> Optional[ScenarioResult]:
|
374
|
-
return await self._script_call_agent(
|
653
|
+
return await self._script_call_agent(
|
654
|
+
AgentRole.JUDGE, content, request_judgment=True
|
655
|
+
)
|
375
656
|
|
376
657
|
async def proceed(
|
377
658
|
self,
|
378
659
|
turns: Optional[int] = None,
|
379
660
|
on_turn: Optional[
|
380
661
|
Union[
|
381
|
-
Callable[["
|
382
|
-
Callable[["
|
662
|
+
Callable[["ScenarioState"], None],
|
663
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
383
664
|
]
|
384
665
|
] = None,
|
385
666
|
on_step: Optional[
|
386
667
|
Union[
|
387
|
-
Callable[["
|
388
|
-
Callable[["
|
668
|
+
Callable[["ScenarioState"], None],
|
669
|
+
Callable[["ScenarioState"], Awaitable[None]],
|
389
670
|
]
|
390
671
|
] = None,
|
391
672
|
) -> Optional[ScenarioResult]:
|
@@ -396,42 +677,42 @@ class ScenarioExecutor:
|
|
396
677
|
go_to_next_turn=(
|
397
678
|
turns is None
|
398
679
|
or initial_turn is None
|
399
|
-
or (self.current_turn + 1 < initial_turn + turns)
|
680
|
+
or (self._state.current_turn + 1 < initial_turn + turns)
|
400
681
|
),
|
401
682
|
)
|
402
683
|
|
403
684
|
if initial_turn is None:
|
404
|
-
initial_turn = self.current_turn
|
685
|
+
initial_turn = self._state.current_turn
|
405
686
|
|
406
687
|
if next_message is None:
|
407
688
|
break
|
408
689
|
|
409
690
|
if on_step:
|
410
|
-
await await_if_awaitable(on_step(self))
|
691
|
+
await await_if_awaitable(on_step(self._state))
|
411
692
|
|
412
693
|
if isinstance(next_message, ScenarioResult):
|
413
694
|
return next_message
|
414
695
|
|
415
|
-
async def succeed(self) -> ScenarioResult:
|
696
|
+
async def succeed(self, reasoning: Optional[str] = None) -> ScenarioResult:
|
416
697
|
return ScenarioResult(
|
417
698
|
success=True,
|
418
|
-
messages=self.messages,
|
419
|
-
reasoning=
|
420
|
-
|
699
|
+
messages=self._state.messages,
|
700
|
+
reasoning=reasoning
|
701
|
+
or "Scenario marked as successful with scenario.succeed()",
|
421
702
|
)
|
422
703
|
|
423
|
-
async def fail(self) -> ScenarioResult:
|
704
|
+
async def fail(self, reasoning: Optional[str] = None) -> ScenarioResult:
|
424
705
|
return ScenarioResult(
|
425
706
|
success=False,
|
426
|
-
messages=self.messages,
|
427
|
-
reasoning="Scenario marked as failed with scenario.fail()",
|
428
|
-
passed_criteria=self.scenario.criteria,
|
707
|
+
messages=self._state.messages,
|
708
|
+
reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
|
429
709
|
)
|
430
710
|
|
431
711
|
async def _script_call_agent(
|
432
712
|
self,
|
433
|
-
role:
|
713
|
+
role: AgentRole,
|
434
714
|
content: Optional[Union[str, ChatCompletionMessageParam]] = None,
|
715
|
+
request_judgment: bool = False,
|
435
716
|
) -> Optional[ScenarioResult]:
|
436
717
|
idx, next_agent = self._next_agent_for_role(role)
|
437
718
|
if not next_agent:
|
@@ -439,12 +720,21 @@ class ScenarioExecutor:
|
|
439
720
|
idx, next_agent = self._next_agent_for_role(role)
|
440
721
|
|
441
722
|
if not next_agent:
|
723
|
+
role_class = (
|
724
|
+
"a scenario.UserSimulatorAgent()"
|
725
|
+
if role == AgentRole.USER
|
726
|
+
else (
|
727
|
+
"a scenario.JudgeAgent()"
|
728
|
+
if role == AgentRole.JUDGE
|
729
|
+
else "your agent"
|
730
|
+
)
|
731
|
+
)
|
442
732
|
if content:
|
443
733
|
raise ValueError(
|
444
|
-
f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found"
|
734
|
+
f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
|
445
735
|
)
|
446
736
|
raise ValueError(
|
447
|
-
f"Cannot generate a message for role `{role.value}` because no agent with this role was found"
|
737
|
+
f"Cannot generate a message for role `{role.value}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
|
448
738
|
)
|
449
739
|
|
450
740
|
self._pending_agents_on_turn.remove(next_agent)
|
@@ -457,10 +747,12 @@ class ScenarioExecutor:
|
|
457
747
|
message = content
|
458
748
|
|
459
749
|
self.add_message(message)
|
460
|
-
if self.
|
750
|
+
if self.config.verbose:
|
461
751
|
print_openai_messages(self._scenario_name(), [message])
|
462
752
|
return
|
463
753
|
|
464
|
-
result = await self._call_agent(
|
754
|
+
result = await self._call_agent(
|
755
|
+
idx, role=role, request_judgment=request_judgment
|
756
|
+
)
|
465
757
|
if isinstance(result, ScenarioResult):
|
466
758
|
return result
|