langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,187 +1,555 @@
  """
- ScenarioExecutor module: holds the scenario execution logic and state, orchestrating the conversation between the testing agent and the agent under test.
+ Scenario execution engine for agent testing.
+
+ This module contains the core ScenarioExecutor class that orchestrates the execution
+ of scenario tests, managing the interaction between user simulators, agents under test,
+ and judge agents to determine test success or failure.
  """
 
- import json
  import sys
- from typing import TYPE_CHECKING, Awaitable, Dict, List, Any, Optional, Union
+ from typing import (
+     Awaitable,
+     Callable,
+     Dict,
+     List,
+     Any,
+     Optional,
+     Set,
+     Tuple,
+     Union,
+ )
  import time
  import termcolor
-
- from scenario.error_messages import message_return_error_message
- from scenario.utils import print_openai_messages, safe_attr_or_key, safe_list_at, show_spinner
- from openai.types.chat import ChatCompletionMessageParam
-
- from .result import ScenarioResult
- from .error_messages import default_config_error_message
+ import asyncio
+ import concurrent.futures
+
+ from scenario.config import ScenarioConfig
+ from scenario.utils import (
+     await_if_awaitable,
+     check_valid_return_type,
+     convert_agent_return_types_to_openai_messages,
+     print_openai_messages,
+     show_spinner,
+ )
+ from openai.types.chat import (
+     ChatCompletionMessageParam,
+     ChatCompletionUserMessageParam,
+ )
+
+ from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
+ from .error_messages import agent_response_not_awaitable
  from .cache import context_scenario
-
- if TYPE_CHECKING:
-     from scenario.scenario import Scenario
-
+ from .agent_adapter import AgentAdapter
+ from .script import proceed
+ from pksuid import PKSUID
+ from .scenario_state import ScenarioState
 
 
  class ScenarioExecutor:
-     def __init__(self, scenario: "Scenario"):
-         self.scenario = scenario.model_copy()
+     """
+     Core orchestrator for scenario-based agent testing.
+
+     The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
+     - Orchestrating conversations between user simulators, agents, and judges
+     - Managing turn-based execution flow
+     - Handling script-based scenario control
+     - Collecting and reporting test results
+     - Supporting debug mode for interactive testing
+
+     This class serves as both a builder (for configuration) and an executor (for running tests).
+     Most users will interact with it through the high-level `scenario.run()` function rather
+     than instantiating it directly.
+
+     Attributes:
+         name: Human-readable name for the scenario
+         description: Detailed description of what the scenario tests
+         agents: List of agent adapters participating in the scenario
+         script: Optional list of script steps to control scenario flow
+         config: Configuration settings for execution behavior
+
+     Example:
+         ```python
+         # Direct instantiation (less common)
+         executor = ScenarioExecutor(
+             name="weather query test",
+             description="User asks about weather, agent should provide helpful response",
+             agents=[
+                 weather_agent,
+                 scenario.UserSimulatorAgent(),
+                 scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+             ],
+             max_turns=10,
+             verbose=True
+         )
+         result = await executor._run()
+
+         # Preferred high-level API
+         result = await scenario.run(
+             name="weather query test",
+             description="User asks about weather, agent should provide helpful response",
+             agents=[
+                 weather_agent,
+                 scenario.UserSimulatorAgent(),
+                 scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+             ]
+         )
+         ```
+
+     Note:
+         - Scenarios run in isolated thread pools to support parallel execution
+         - All agent interactions are cached when cache_key is configured
+         - Debug mode allows step-by-step execution with user intervention
+         - Results include detailed timing information and conversation history
+     """
+     name: str
+     description: str
+     agents: List[AgentAdapter]
+     script: List[ScriptStep]
+
+     config: ScenarioConfig
+
+     _state: ScenarioState
+     _total_start_time: float
+     _pending_messages: Dict[int, List[ChatCompletionMessageParam]]
+
+     _pending_roles_on_turn: List[AgentRole] = []
+     _pending_agents_on_turn: Set[AgentAdapter] = set()
+     _agent_times: Dict[int, float] = {}
+
+     def __init__(
+         self,
+         name: str,
+         description: str,
+         agents: List[AgentAdapter] = [],
+         script: Optional[List[ScriptStep]] = None,
+         # Config
+         max_turns: Optional[int] = None,
+         verbose: Optional[Union[bool, int]] = None,
+         cache_key: Optional[str] = None,
+         debug: Optional[bool] = None,
+     ):
+         """
+         Initialize a scenario executor.
 
-         testing_agent = scenario.testing_agent
-         if not testing_agent or not testing_agent.model:
-             raise Exception(default_config_error_message)
-         self.testing_agent = testing_agent
+         Args:
+             name: Human-readable name for the scenario (used in reports and logs)
+             description: Detailed description of what the scenario tests.
+                 This guides the user simulator's behavior and provides context.
+             agents: List of agent adapters participating in the scenario.
+                 Typically includes: agent under test, user simulator, and judge.
+             script: Optional list of script steps to control scenario flow.
+                 If not provided, defaults to automatic proceeding.
+             max_turns: Maximum number of conversation turns before timeout.
+                 Overrides global configuration for this scenario.
+             verbose: Whether to show detailed output during execution.
+                 Can be True/False or integer level (2 for extra details).
+             cache_key: Cache key for deterministic behavior across runs.
+                 Overrides global configuration for this scenario.
+             debug: Whether to enable debug mode with step-by-step execution.
+                 Overrides global configuration for this scenario.
+
+         Example:
+             ```python
+             executor = ScenarioExecutor(
+                 name="customer service test",
+                 description="Customer has a billing question and needs help",
+                 agents=[
+                     customer_service_agent,
+                     scenario.UserSimulatorAgent(),
+                     scenario.JudgeAgent(criteria=[
+                         "Agent is polite and professional",
+                         "Agent addresses the billing question",
+                         "Agent provides clear next steps"
+                     ])
+                 ],
+                 max_turns=15,
+                 verbose=True,
+                 debug=False
+             )
+             ```
+         """
+         self.name = name
+         self.description = description
+         self.agents = agents
+         self.script = script or [proceed()]
+
+         config = ScenarioConfig(
+             max_turns=max_turns,
+             verbose=verbose,
+             cache_key=cache_key,
+             debug=debug,
+         )
+         self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)
 
-         self.conversation: List[Dict[str, Any]] = []
+         self.reset()
 
+     @classmethod
      async def run(
-         self,
-         context: Optional[Dict[str, Any]] = None,
+         cls,
+         name: str,
+         description: str,
+         agents: List[AgentAdapter] = [],
+         max_turns: Optional[int] = None,
+         verbose: Optional[Union[bool, int]] = None,
+         cache_key: Optional[str] = None,
+         debug: Optional[bool] = None,
+         script: Optional[List[ScriptStep]] = None,
      ) -> ScenarioResult:
          """
-         Run a scenario against the agent under test.
+         High-level interface for running a scenario test.
+
+         This is the main entry point for executing scenario tests. It creates a
+         ScenarioExecutor instance and runs it in an isolated thread pool to support
+         parallel execution and prevent blocking.
 
          Args:
-             context: Optional initial context for the agent
+             name: Human-readable name for the scenario
+             description: Detailed description of what the scenario tests
+             agents: List of agent adapters (agent under test, user simulator, judge)
+             max_turns: Maximum conversation turns before timeout (default: 10)
+             verbose: Show detailed output during execution
+             cache_key: Cache key for deterministic behavior
+             debug: Enable debug mode for step-by-step execution
+             script: Optional script steps to control scenario flow
 
          Returns:
-             ScenarioResult containing the test outcome
+             ScenarioResult containing the test outcome, conversation history,
+             success/failure status, and detailed reasoning
+
+         Example:
+             ```python
+             import scenario
+
+             # Simple scenario with automatic flow
+             result = await scenario.run(
+                 name="help request",
+                 description="User asks for help with a technical problem",
+                 agents=[
+                     my_agent,
+                     scenario.UserSimulatorAgent(),
+                     scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                 ]
+             )
+
+             # Scripted scenario with custom evaluations
+             result = await scenario.run(
+                 name="custom interaction",
+                 description="Test specific conversation flow",
+                 agents=[
+                     my_agent,
+                     scenario.UserSimulatorAgent(),
+                     scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                 ],
+                 script=[
+                     scenario.user("Hello"),
+                     scenario.agent(),
+                     custom_eval,
+                     scenario.succeed()
+                 ]
+             )
+
+             # Results analysis
+             print(f"Test {'PASSED' if result.success else 'FAILED'}")
+             print(f"Reasoning: {result.reasoning}")
+             print(f"Conversation had {len(result.messages)} messages")
+             ```
+
+         Note:
+             - Runs in isolated thread pool to support parallel execution
+             - Blocks until scenario completes or times out
+             - All agent calls are automatically cached when cache_key is set
+             - Exception handling ensures clean resource cleanup
          """
+         scenario = cls(
+             name=name,
+             description=description,
+             agents=agents,
+             max_turns=max_turns,
+             verbose=verbose,
+             cache_key=cache_key,
+             debug=debug,
+             script=script,
+         )
 
-         if self.scenario.verbose:
-             print("") # new line
+         # We run the execution logic in a separate thread from a thread pool
+         # because, even though asyncio is used throughout, any user code in the
+         # agent callbacks can be blocking, which would prevent scenarios from
+         # running in parallel
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+
+             def run_in_thread():
+                 loop = asyncio.new_event_loop()
+                 asyncio.set_event_loop(loop)
+
+                 try:
+                     return loop.run_until_complete(scenario._run())
+                 finally:
+                     loop.close()
+
+             # Run the function in the thread pool and await its result
+             # This converts the thread's execution into a Future that the current
+             # event loop can await without blocking
+             loop = asyncio.get_event_loop()
+             result = await loop.run_in_executor(executor, run_in_thread)
+             return result
+
+     def reset(self):
+         """
+         Reset the scenario executor to initial state.
+
+         This method reinitializes all internal state for a fresh scenario run,
+         including conversation history, turn counters, and agent timing information.
+         Called automatically during initialization and can be used to rerun scenarios.
+
+         Example:
+             ```python
+             executor = ScenarioExecutor(...)
 
-         # Run the initial testing agent prompt to get started
-         total_start_time = time.time()
-         context_scenario.set(self.scenario)
-         next_message = self._generate_next_message(
-             self.scenario, self.conversation, first_message=True
+             # Run first test
+             result1 = await executor._run()
+
+             # Reset and run again
+             executor.reset()
+             result2 = await executor._run()
+             ```
+         """
+         self._state = ScenarioState(
+             description=self.description,
+             messages=[],
+             thread_id=str(PKSUID("thread")),
+             current_turn=0,
+             config=self.config,
+             _executor=self,
          )
+         # Pydantic doesn't actually set the _executor field from the constructor, as it's private, so we need to do it manually
+         self._state._executor = self
 
-         if isinstance(next_message, ScenarioResult):
-             raise Exception(
-                 "Unexpectedly generated a ScenarioResult for the initial message",
-                 next_message.__repr__(),
-             )
-         elif self.scenario.verbose:
-             print(self._scenario_name() + termcolor.colored("User:", "green"), next_message)
+         self._pending_messages = {}
+         self._total_start_time = time.time()
+         self._agent_times = {}
 
-         # Execute the conversation
-         current_turn = 0
-         max_turns = self.scenario.max_turns or 10
-         agent_time = 0
+         self._new_turn()
+         self._state.current_turn = 0
 
-         # Start the test with the initial message
-         while current_turn < max_turns:
-             # Record the testing agent's message
-             self.conversation.append({"role": "user", "content": next_message})
+         context_scenario.set(self)
 
-             # Get response from the agent under test
-             start_time = time.time()
+     def add_message(
+         self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
+     ):
+         """
+         Add a message to the conversation and broadcast to other agents.
 
-             context_scenario.set(self.scenario)
-             with show_spinner(text="Agent:", color="blue", enabled=self.scenario.verbose):
-                 agent_response = self.scenario.agent(next_message, context)
-                 if isinstance(agent_response, Awaitable):
-                     agent_response = await agent_response
+         This method adds a message to the conversation history and makes it available
+         to other agents in their next call. It's used internally by the executor
+         and can be called from script steps to inject custom messages.
 
-             has_valid_message = (
-                 "message" in agent_response
-                 and type(agent_response["message"]) is str
-                 and agent_response["message"] is not None
-             )
-             has_valid_messages = (
-                 "messages" in agent_response
-                 and isinstance(agent_response["messages"], list)
-                 and all(
-                     "role" in msg or hasattr(msg, "role")
-                     for msg in agent_response["messages"]
-                 )
+         Args:
+             message: OpenAI-compatible message to add to the conversation
+             from_agent_idx: Index of the agent that generated this message.
+                 Used to avoid broadcasting the message back to its creator.
+
+         Example:
+             ```python
+             def inject_system_message(state: ScenarioState) -> None:
+                 state._executor.add_message({
+                     "role": "system",
+                     "content": "The user is now in a hurry"
+                 })
+
+             # Use in script
+             result = await scenario.run(
+                 name="system message test",
+                 agents=[agent, user_sim, judge],
+                 script=[
+                     scenario.user("Hello"),
+                     scenario.agent(),
+                     inject_system_message,
+                     scenario.user(),  # Will see the system message
+                     scenario.succeed()
+                 ]
              )
-             if not has_valid_message and not has_valid_messages:
-                 raise Exception(message_return_error_message(agent_response))
-
-             messages: list[ChatCompletionMessageParam] = []
-             if has_valid_messages and len(agent_response["messages"]) > 0:
-                 messages = agent_response["messages"]
-
-             # Drop the first messages both if they are system or user messages
-             if safe_attr_or_key(safe_list_at(messages, 0), "role") == "system":
-                 messages = messages[1:]
-             if safe_attr_or_key(safe_list_at(messages, 0), "role") == "user":
-                 messages = messages[1:]
-
-             if has_valid_message and self.scenario.verbose:
-                 print(self._scenario_name() + termcolor.colored("Agent:", "blue"), agent_response["message"])
-
-             if messages and self.scenario.verbose:
-                 print_openai_messages(self._scenario_name(), messages)
-
-             if (
-                 self.scenario.verbose
-                 and "extra" in agent_response
-                 and len(agent_response["extra"].keys()) > 0
-             ):
-                 print(
-                     termcolor.colored(
-                         "Extra:" + json.dumps(agent_response["extra"]),
-                         "magenta",
-                     )
-                 )
-             response_time = time.time() - start_time
-             agent_time += response_time
-
-             if messages:
-                 self.conversation.extend(agent_response["messages"])
-             if "message" in agent_response:
-                 self.conversation.append(
-                     {"role": "assistant", "content": agent_response["message"]}
-                 )
-             if "extra" in agent_response:
-                 self.conversation.append(
-                     {
-                         "role": "assistant",
-                         "content": json.dumps(agent_response["extra"]),
-                     }
-                 )
+             ```
+         """
+         self._state.messages.append(message)
 
-             # Generate the next message OR finish the test based on the agent's evaluation
-             result = self._generate_next_message(
-                 self.scenario,
-                 self.conversation,
-                 last_message=current_turn == max_turns - 1,
-             )
+         # Broadcast the message to other agents
+         for idx, _ in enumerate(self.agents):
+             if idx == from_agent_idx:
+                 continue
+             if idx not in self._pending_messages:
+                 self._pending_messages[idx] = []
+             self._pending_messages[idx].append(message)
 
-             # Check if the result is a ScenarioResult (indicating test completion)
-             if isinstance(result, ScenarioResult):
-                 result.total_time = time.time() - start_time
-                 result.agent_time = agent_time
-                 return result
-             elif self.scenario.verbose:
-                 print(self._scenario_name() + termcolor.colored("User:", "green"), result)
+     def add_messages(
+         self,
+         messages: List[ChatCompletionMessageParam],
+         from_agent_idx: Optional[int] = None,
+     ):
+         """
+         Add multiple messages to the conversation.
 
-             # Otherwise, it's the next message to send to the agent
-             next_message = result
+         Convenience method for adding multiple messages at once. Each message
+         is added individually using add_message().
 
-             # Increment turn counter
-             current_turn += 1
+         Args:
+             messages: List of OpenAI-compatible messages to add
+             from_agent_idx: Index of the agent that generated these messages
+
+         Example:
+             ```python
+             # Agent returns multiple messages for a complex interaction
+             messages = [
+                 {"role": "assistant", "content": "Let me search for that..."},
+                 {"role": "assistant", "content": "Here's what I found: ..."}
+             ]
+             executor.add_messages(messages, from_agent_idx=0)
+             ```
+         """
+         for message in messages:
+             self.add_message(message, from_agent_idx)
+
+     def _new_turn(self):
+         self._pending_agents_on_turn = set(self.agents)
+         self._pending_roles_on_turn = [
+             AgentRole.USER,
+             AgentRole.AGENT,
+             AgentRole.JUDGE,
+         ]
+         self._state.current_turn += 1
+
+     async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
+         """
+         Execute a single step in the scenario.
+
+         A step consists of calling the next agent in the current turn's sequence
+         and processing their response. This method is used internally by the
+         scenario execution flow.
+
+         Returns:
+             Either a list of messages (if the scenario continues) or a
+             ScenarioResult (if the scenario should end)
+
+         Raises:
+             ValueError: If no result is returned from the internal step method
 
+         Note:
+             This is primarily an internal method. Most users should use the
+             high-level run() method or script DSL functions instead.
+         """
+         result = await self._step()
+         if result is None:
+             raise ValueError("No result from step")
+         return result
+
+     async def _step(
+         self,
+         go_to_next_turn=True,
+         on_turn: Optional[
+             Union[
+                 Callable[["ScenarioState"], None],
+                 Callable[["ScenarioState"], Awaitable[None]],
+             ]
+         ] = None,
+     ) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
+         if len(self._pending_roles_on_turn) == 0:
+             if not go_to_next_turn:
+                 return None
+
+             self._new_turn()
+
+             if on_turn:
+                 await await_if_awaitable(on_turn(self._state))
+
+             if self._state.current_turn >= (self.config.max_turns or 10):
+                 return self._reached_max_turns()
+
+         current_role = self._pending_roles_on_turn[0]
+         idx, next_agent = self._next_agent_for_role(current_role)
+         if not next_agent:
+             self._pending_roles_on_turn.pop(0)
+             return await self._step(go_to_next_turn=go_to_next_turn, on_turn=on_turn)
+
+         self._pending_agents_on_turn.remove(next_agent)
+         return await self._call_agent(idx, role=current_role)
+
+     def _next_agent_for_role(
+         self, role: AgentRole
+     ) -> Tuple[int, Optional[AgentAdapter]]:
+         for idx, agent in enumerate(self.agents):
+             if role == agent.role and agent in self._pending_agents_on_turn:
+                 return idx, agent
+         return -1, None
+
+     def _reached_max_turns(self, error_message: Optional[str] = None) -> ScenarioResult:
          # If we reached max turns without conclusion, fail the test
-         return ScenarioResult.failure_result(
-             conversation=self.conversation,
-             reasoning=f"Reached maximum turns ({max_turns}) without conclusion",
-             total_time=time.time() - total_start_time,
+         agent_roles_agents_idx = [
+             idx
+             for idx, agent in enumerate(self.agents)
+             if agent.role == AgentRole.AGENT
+         ]
+         agent_times = [
+             self._agent_times[idx]
+             for idx in agent_roles_agents_idx
+             if idx in self._agent_times
+         ]
+         agent_time = sum(agent_times)
+
+         return ScenarioResult(
+             success=False,
+             messages=self._state.messages,
+             reasoning=error_message
+             or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion",
+             total_time=time.time() - self._total_start_time,
              agent_time=agent_time,
          )
 
-     def _generate_next_message(
-         self,
-         scenario: "Scenario",
-         conversation: List[Dict[str, Any]],
-         first_message: bool = False,
-         last_message: bool = False,
-     ) -> Union[str, ScenarioResult]:
-         if self.scenario.debug:
-             print(f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send")
-             input_message = input(self._scenario_name() + termcolor.colored('User: ', 'green'))
+     async def _run(self) -> ScenarioResult:
+         """
+         Run the scenario against the agent under test.
+
+         Returns:
+             ScenarioResult containing the test outcome
+         """
+
+         if self.config.verbose:
+             print("") # new line
+
+         self.reset()
+
+         for script_step in self.script:
+             callable = script_step(self._state)
+             if isinstance(callable, Awaitable):
+                 result = await callable
+             else:
+                 result = callable
+
+             if isinstance(result, ScenarioResult):
+                 return result
+
+         return self._reached_max_turns(
+             """Reached end of script without conclusion; add one of the following to the end of the script:
+
+ - `scenario.proceed()` to let the simulation continue to play out
+ - `scenario.judge()` to force criteria judgment
+ - `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
+ """
+         )
+
+     async def _call_agent(
+         self, idx: int, role: AgentRole, request_judgment: bool = False
+     ) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
+         agent = self.agents[idx]
+
+         if role == AgentRole.USER and self.config.debug:
+             print(
+                 f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
+             )
+             input_message = input(
+                 self._scenario_name() + termcolor.colored("User: ", "green")
+             )
 
              # Clear the input prompt lines completely
              for _ in range(3):
@@ -190,15 +558,201 @@ class ScenarioExecutor:
              sys.stdout.flush() # Make sure the clearing is visible
 
              if input_message:
-                 return input_message
+                 return [
+                     ChatCompletionUserMessageParam(role="user", content=input_message)
+                 ]
+
+         with show_spinner(
+             text=(
+                 "Judging..."
+                 if role == AgentRole.JUDGE
+                 else f"{role.value if isinstance(role, AgentRole) else role}:"
+             ),
+             color=(
+                 "blue"
+                 if role == AgentRole.AGENT
+                 else "green" if role == AgentRole.USER else "yellow"
+             ),
+             enabled=self.config.verbose,
+         ):
+             start_time = time.time()
 
-         with show_spinner(text=f"{self._scenario_name()}User:", color="green", enabled=self.scenario.verbose):
-             return self.testing_agent.generate_next_message(
-                 scenario, conversation, first_message, last_message
+             agent_response = agent.call(
+                 AgentInput(
+                     # TODO: test thread_id
+                     thread_id=self._state.thread_id,
+                     messages=self._state.messages,
+                     new_messages=self._pending_messages.get(idx, []),
+                     judgment_request=request_judgment,
+                     scenario_state=self._state,
+                 )
              )
+             if not isinstance(agent_response, Awaitable):
+                 raise Exception(
+                     agent_response_not_awaitable(agent.__class__.__name__),
+                 )
+
+             agent_response = await agent_response
+
+             if idx not in self._agent_times:
+                 self._agent_times[idx] = 0
+             self._agent_times[idx] += time.time() - start_time
+
+         self._pending_messages[idx] = []
+         check_valid_return_type(agent_response, agent.__class__.__name__)
+
+         messages = []
+         if isinstance(agent_response, ScenarioResult):
+             # TODO: should be an event
+             return agent_response
+         else:
+             messages = convert_agent_return_types_to_openai_messages(
+                 agent_response,
+                 role="user" if role == AgentRole.USER else "assistant",
+             )
+
+         self.add_messages(messages, from_agent_idx=idx)
+
+         if messages and self.config.verbose:
+             print_openai_messages(
+                 self._scenario_name(),
+                 [m for m in messages if m["role"] != "system"],
+             )
+
+         return messages
 
      def _scenario_name(self):
-         if self.scenario.verbose == 2:
-             return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
+         if self.config.verbose == 2:
+             return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")
          else:
              return ""
+
+     # Scripting utils
+
+     async def message(self, message: ChatCompletionMessageParam) -> None:
+         if message["role"] == "user":
+             await self._script_call_agent(AgentRole.USER, message)
+         elif message["role"] == "assistant":
+             await self._script_call_agent(AgentRole.AGENT, message)
+         else:
+             self.add_message(message)
+
+     async def user(
+         self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+     ) -> None:
+         await self._script_call_agent(AgentRole.USER, content)
+
+     async def agent(
+         self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+     ) -> None:
+         await self._script_call_agent(AgentRole.AGENT, content)
+
+     async def judge(
+         self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
+     ) -> Optional[ScenarioResult]:
+         return await self._script_call_agent(
+             AgentRole.JUDGE, content, request_judgment=True
+         )
+
+     async def proceed(
+         self,
+         turns: Optional[int] = None,
+         on_turn: Optional[
+             Union[
+                 Callable[["ScenarioState"], None],
+                 Callable[["ScenarioState"], Awaitable[None]],
+             ]
+         ] = None,
+         on_step: Optional[
+             Union[
+                 Callable[["ScenarioState"], None],
+                 Callable[["ScenarioState"], Awaitable[None]],
+             ]
+         ] = None,
+     ) -> Optional[ScenarioResult]:
+         initial_turn: Optional[int] = None
+         while True:
+             next_message = await self._step(
+                 on_turn=on_turn,
+                 go_to_next_turn=(
+                     turns is None
+                     or initial_turn is None
+                     or (self._state.current_turn + 1 < initial_turn + turns)
+                 ),
+             )
+
+             if initial_turn is None:
+                 initial_turn = self._state.current_turn
+
+             if next_message is None:
+                 break
+
+             if on_step:
+                 await await_if_awaitable(on_step(self._state))
+
+             if isinstance(next_message, ScenarioResult):
+                 return next_message
+
+     async def succeed(self, reasoning: Optional[str] = None) -> ScenarioResult:
+         return ScenarioResult(
+             success=True,
+             messages=self._state.messages,
+             reasoning=reasoning
+             or "Scenario marked as successful with scenario.succeed()",
+         )
+
+     async def fail(self, reasoning: Optional[str] = None) -> ScenarioResult:
+         return ScenarioResult(
+             success=False,
+             messages=self._state.messages,
+             reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
+         )
+
+     async def _script_call_agent(
+         self,
+         role: AgentRole,
+         content: Optional[Union[str, ChatCompletionMessageParam]] = None,
+         request_judgment: bool = False,
+     ) -> Optional[ScenarioResult]:
+         idx, next_agent = self._next_agent_for_role(role)
+         if not next_agent:
+             self._new_turn()
+             idx, next_agent = self._next_agent_for_role(role)
+
+         if not next_agent:
+             role_class = (
+                 "a scenario.UserSimulatorAgent()"
+                 if role == AgentRole.USER
+                 else (
+                     "a scenario.JudgeAgent()"
+                     if role == AgentRole.JUDGE
+                     else "your agent"
+                 )
+             )
+             if content:
+                 raise ValueError(
+                     f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found; please add {role_class} to the scenario `agents` list"
+                 )
+             raise ValueError(
+                 f"Cannot generate a message for role `{role.value}` because no agent with this role was found; please add {role_class} to the scenario `agents` list"
+             )
+
+         self._pending_agents_on_turn.remove(next_agent)
+         self._pending_roles_on_turn.remove(role)
+
+         if content:
+             if isinstance(content, str):
+                 message = ChatCompletionUserMessageParam(role="user", content=content)
+             else:
+                 message = content
+
+             self.add_message(message)
+             if self.config.verbose:
+                 print_openai_messages(self._scenario_name(), [message])
+             return
+
+         result = await self._call_agent(
+             idx, role=role, request_judgment=request_judgment
+         )
+         if isinstance(result, ScenarioResult):
+             return result
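
Taken together, 0.4.0 replaces the single `testing_agent` driving loop with a role-based turn engine (user simulator, agent under test, judge) plus an optional script. The sketch below shows how the pieces fit, based only on the docstrings in this diff; it is a minimal sketch, assuming that the `scenario` package exposes `run`, `user`, `agent`, `judge`, and `proceed` as module-level script helpers mirroring the executor methods of the same names (only `proceed` is visibly imported in this file), that pytest-asyncio is available, and that `my_agent` is a hypothetical `AgentAdapter` for the system under test:

```python
import pytest
import scenario

# Sketch only: `my_agent` is a hypothetical AgentAdapter for the system under
# test, and the scenario.* script helpers are assumed to mirror the executor
# methods of the same names shown in this diff.
@pytest.mark.asyncio
async def test_billing_question():
    result = await scenario.run(
        name="billing question",
        description="Customer asks why they were charged twice this month",
        agents=[
            my_agent,                       # agent under test (hypothetical)
            scenario.UserSimulatorAgent(),  # generates the customer's messages
            scenario.JudgeAgent(criteria=["Agent explains the duplicate charge"]),
        ],
        script=[
            scenario.user("Why was I charged twice?"),  # fixed opening message
            scenario.agent(),           # agent under test responds
            scenario.proceed(turns=2),  # let the simulation play out briefly
            scenario.judge(),           # force a judgment to conclude the test
        ],
    )
    assert result.success, result.reasoning
```

Note the explicit conclusion step: as `_run()` above enforces, a script that ends without reaching `judge()`, `succeed()`, or `fail()` is failed via `_reached_max_turns()`.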