langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,134 +1,456 @@
1
1
  """
2
- ScenarioExecutor module: holds the scenario execution logic and state, orchestrating the conversation between the testing agent and the agent under test.
2
+ Scenario execution engine for agent testing.
3
+
4
+ This module contains the core ScenarioExecutor class that orchestrates the execution
5
+ of scenario tests, managing the interaction between user simulators, agents under test,
6
+ and judge agents to determine test success or failure.
3
7
  """
4
8
 
5
9
  import sys
6
10
  from typing import (
7
- TYPE_CHECKING,
8
11
  Awaitable,
9
12
  Callable,
10
13
  Dict,
11
14
  List,
12
- Any,
13
15
  Optional,
14
16
  Set,
15
17
  Tuple,
16
18
  Union,
19
+ TypedDict,
17
20
  )
18
21
  import time
19
22
  import termcolor
23
+ import asyncio
24
+ import concurrent.futures
20
25
 
21
- from scenario.utils import (
22
- await_if_awaitable,
26
+ from scenario.config import ScenarioConfig
27
+ from scenario._utils import (
23
28
  check_valid_return_type,
24
29
  convert_agent_return_types_to_openai_messages,
25
30
  print_openai_messages,
26
31
  show_spinner,
32
+ await_if_awaitable,
33
+ get_or_create_batch_run_id,
34
+ generate_scenario_run_id,
27
35
  )
28
36
  from openai.types.chat import (
29
37
  ChatCompletionMessageParam,
30
38
  ChatCompletionUserMessageParam,
31
- ChatCompletionMessageToolCallParam,
39
+ ChatCompletionAssistantMessageParam,
32
40
  )
33
41
 
34
- from .types import AgentInput, ScenarioAgentRole, ScenarioResult, ScriptStep
35
- from .error_messages import agent_response_not_awaitable
42
+ from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
43
+ from ._error_messages import agent_response_not_awaitable
36
44
  from .cache import context_scenario
37
- from .scenario_agent_adapter import ScenarioAgentAdapter
45
+ from .agent_adapter import AgentAdapter
46
+ from .script import proceed
38
47
  from pksuid import PKSUID
39
-
40
- if TYPE_CHECKING:
41
- from scenario.scenario import Scenario
48
+ from .scenario_state import ScenarioState
49
+ from .events import (
50
+ ScenarioEventBus,
51
+ ScenarioRunStartedEvent,
52
+ ScenarioMessageSnapshotEvent,
53
+ ScenarioRunFinishedEvent,
54
+ ScenarioRunStartedEventMetadata,
55
+ ScenarioRunFinishedEventResults,
56
+ ScenarioRunFinishedEventVerdict,
57
+ ScenarioRunFinishedEventStatus,
58
+ convert_messages_to_ag_ui_messages,
59
+ )
42
60
 
43
61
 
44
62
  class ScenarioExecutor:
45
- scenario: "Scenario"
46
- messages: List[ChatCompletionMessageParam]
47
- thread_id: str
48
- current_turn: int
49
-
50
- _context: Optional[Dict[str, Any]]
51
- _script: List[ScriptStep]
52
- _agents: List[ScenarioAgentAdapter]
63
+ """
64
+ Core orchestrator for scenario-based agent testing.
65
+
66
+ The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
67
+ - Orchestrating conversations between user simulators, agents, and judges
68
+ - Managing turn-based execution flow
69
+ - Handling script-based scenario control
70
+ - Collecting and reporting test results
71
+ - Supporting debug mode for interactive testing
72
+
73
+ This class serves as both a builder (for configuration) and an executor (for running tests).
74
+ Most users will interact with it through the high-level `scenario.run()` function rather
75
+ than instantiating it directly.
76
+
77
+ Attributes:
78
+ name: Human-readable name for the scenario
79
+ description: Detailed description of what the scenario tests
80
+ agents: List of agent adapters participating in the scenario
81
+ script: Optional list of script steps to control scenario flow
82
+ config: Configuration settings for execution behavior
83
+
84
+ Example:
85
+ ```
86
+ # Direct instantiation (less common)
87
+ executor = ScenarioExecutor(
88
+ name="weather query test",
89
+ description="User asks about weather, agent should provide helpful response",
90
+ agents=[
91
+ weather_agent,
92
+ scenario.UserSimulatorAgent(),
93
+ scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
94
+ ],
95
+ max_turns=10,
96
+ verbose=True
97
+ )
98
+ result = await executor._run()
99
+
100
+ # Preferred high-level API
101
+ result = await scenario.run(
102
+ name="weather query test",
103
+ description="User asks about weather, agent should provide helpful response",
104
+ agents=[
105
+ weather_agent,
106
+ scenario.UserSimulatorAgent(),
107
+ scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
108
+ ]
109
+ )
110
+ ```
111
+
112
+ Note:
113
+ - Scenarios run in isolated thread pools to support parallel execution
114
+ - All agent interactions are cached when cache_key is configured
115
+ - Debug mode allows step-by-step execution with user intervention
116
+ - Results include detailed timing information and conversation history
117
+ """
118
+
119
+ name: str
120
+ description: str
121
+ agents: List[AgentAdapter]
122
+ script: List[ScriptStep]
123
+
124
+ config: ScenarioConfig
125
+
126
+ _state: ScenarioState
53
127
  _total_start_time: float
54
128
  _pending_messages: Dict[int, List[ChatCompletionMessageParam]]
55
129
 
56
- _pending_roles_on_turn: List[ScenarioAgentRole] = []
57
- _pending_agents_on_turn: Set[ScenarioAgentAdapter] = set()
130
+ _pending_roles_on_turn: List[AgentRole] = []
131
+ _pending_agents_on_turn: Set[AgentAdapter] = set()
58
132
  _agent_times: Dict[int, float] = {}
59
133
 
134
+ event_bus: ScenarioEventBus
135
+
136
+ batch_run_id: str
137
+
60
138
  def __init__(
61
139
  self,
62
- scenario: "Scenario",
63
- context: Optional[Dict[str, Any]] = None,
140
+ name: str,
141
+ description: str,
142
+ agents: List[AgentAdapter] = [],
64
143
  script: Optional[List[ScriptStep]] = None,
144
+ # Config
145
+ max_turns: Optional[int] = None,
146
+ verbose: Optional[Union[bool, int]] = None,
147
+ cache_key: Optional[str] = None,
148
+ debug: Optional[bool] = None,
149
+ event_bus: Optional[ScenarioEventBus] = None,
65
150
  ):
66
- super().__init__()
151
+ """
152
+ Initialize a scenario executor.
153
+
154
+ Args:
155
+ name: Human-readable name for the scenario (used in reports and logs)
156
+ description: Detailed description of what the scenario tests.
157
+ This guides the user simulator's behavior and provides context.
158
+ agents: List of agent adapters participating in the scenario.
159
+ Typically includes: agent under test, user simulator, and judge.
160
+ script: Optional list of script steps to control scenario flow.
161
+ If not provided, defaults to automatic proceeding.
162
+ max_turns: Maximum number of conversation turns before timeout.
163
+ Overrides global configuration for this scenario.
164
+ verbose: Whether to show detailed output during execution.
165
+ Can be True/False or integer level (2 for extra details).
166
+ cache_key: Cache key for deterministic behavior across runs.
167
+ Overrides global configuration for this scenario.
168
+ debug: Whether to enable debug mode with step-by-step execution.
169
+ Overrides global configuration for this scenario.
170
+ event_bus: Optional event bus for the scenario; a new ScenarioEventBus is created when omitted
171
+
172
+ Example:
173
+ ```python
174
+ executor = ScenarioExecutor(
175
+ name="customer service test",
176
+ description="Customer has a billing question and needs help",
177
+ agents=[
178
+ customer_service_agent,
179
+ scenario.UserSimulatorAgent(),
180
+ scenario.JudgeAgent(criteria=[
181
+ "Agent is polite and professional",
182
+ "Agent addresses the billing question",
183
+ "Agent provides clear next steps"
184
+ ])
185
+ ],
186
+ max_turns=15,
187
+ verbose=True,
188
+ debug=False
189
+ )
190
+ ```
191
+ """
192
+ self.name = name
193
+ self.description = description
194
+ self.agents = agents
195
+ self.script = script or [proceed()]
196
+
197
+ config = ScenarioConfig(
198
+ max_turns=max_turns,
199
+ verbose=verbose,
200
+ cache_key=cache_key,
201
+ debug=debug,
202
+ )
203
+ self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)
67
204
 
68
- self.scenario = scenario.model_copy()
69
- self._context = context
70
- self._script = script or [scenario.proceed()]
71
- self.current_turn = 0
72
205
  self.reset()
73
206
 
207
+ self.event_bus = event_bus or ScenarioEventBus()
208
+
209
+ self.batch_run_id = get_or_create_batch_run_id()
210
+
211
+ @classmethod
212
+ async def run(
213
+ cls,
214
+ name: str,
215
+ description: str,
216
+ agents: List[AgentAdapter] = [],
217
+ max_turns: Optional[int] = None,
218
+ verbose: Optional[Union[bool, int]] = None,
219
+ cache_key: Optional[str] = None,
220
+ debug: Optional[bool] = None,
221
+ script: Optional[List[ScriptStep]] = None,
222
+ ) -> ScenarioResult:
223
+ """
224
+ High-level interface for running a scenario test.
225
+
226
+ This is the main entry point for executing scenario tests. It creates a
227
+ ScenarioExecutor instance and runs it in an isolated thread pool to support
228
+ parallel execution and prevent blocking.
229
+
230
+ Args:
231
+ name: Human-readable name for the scenario
232
+ description: Detailed description of what the scenario tests
233
+ agents: List of agent adapters (agent under test, user simulator, judge)
234
+ max_turns: Maximum conversation turns before timeout (default: 10)
235
+ verbose: Show detailed output during execution
236
+ cache_key: Cache key for deterministic behavior
237
+ debug: Enable debug mode for step-by-step execution
238
+ script: Optional script steps to control scenario flow
239
+
240
+ Returns:
241
+ ScenarioResult containing the test outcome, conversation history,
242
+ success/failure status, and detailed reasoning
243
+
244
+ Example:
245
+ ```
246
+ import scenario
247
+
248
+ # Simple scenario with automatic flow
249
+ result = await scenario.run(
250
+ name="help request",
251
+ description="User asks for help with a technical problem",
252
+ agents=[
253
+ my_agent,
254
+ scenario.UserSimulatorAgent(),
255
+ scenario.JudgeAgent(criteria=["Agent provides helpful response"])
256
+ ]
257
+ )
258
+
259
+ # Scripted scenario with custom evaluations
260
+ result = await scenario.run(
261
+ name="custom interaction",
262
+ description="Test specific conversation flow",
263
+ agents=[
264
+ my_agent,
265
+ scenario.UserSimulatorAgent(),
266
+ scenario.JudgeAgent(criteria=["Agent provides helpful response"])
267
+ ],
268
+ script=[
269
+ scenario.user("Hello"),
270
+ scenario.agent(),
271
+ custom_eval,
272
+ scenario.succeed()
273
+ ]
274
+ )
275
+
276
+ # Results analysis
277
+ print(f"Test {'PASSED' if result.success else 'FAILED'}")
278
+ print(f"Reasoning: {result.reasoning}")
279
+ print(f"Conversation had {len(result.messages)} messages")
280
+ ```
281
+
282
+ Note:
283
+ - Runs in isolated thread pool to support parallel execution
284
+ - Blocks until scenario completes or times out
285
+ - All agent calls are automatically cached when cache_key is set
286
+ - Exception handling ensures clean resource cleanup
287
+ """
288
+ scenario = cls(
289
+ name=name,
290
+ description=description,
291
+ agents=agents,
292
+ max_turns=max_turns,
293
+ verbose=verbose,
294
+ cache_key=cache_key,
295
+ debug=debug,
296
+ script=script,
297
+ )
298
+
299
+ # We use a thread pool to run the execution logic. A separate
300
+ # thread is required because, even though asyncio is used
301
+ # throughout, any user code in the callbacks may be blocking,
302
+ # which would prevent scenarios from running in parallel
303
+ with concurrent.futures.ThreadPoolExecutor() as executor:
304
+
305
+ def run_in_thread():
306
+ loop = asyncio.new_event_loop()
307
+ asyncio.set_event_loop(loop)
308
+
309
+ try:
310
+ return loop.run_until_complete(scenario._run())
311
+ finally:
312
+ loop.run_until_complete(scenario.event_bus.drain())
313
+ loop.close()
314
+
315
+ # Run the function in the thread pool and await its result
316
+ # This converts the thread's execution into a Future that the current
317
+ # event loop can await without blocking
318
+ loop = asyncio.get_event_loop()
319
+ result = await loop.run_in_executor(executor, run_in_thread)
320
+ return result
321
+
74
322
  def reset(self):
75
- self.messages = []
76
- self._agents = []
323
+ """
324
+ Reset the scenario executor to initial state.
325
+
326
+ This method reinitializes all internal state for a fresh scenario run,
327
+ including conversation history, turn counters, and agent timing information.
328
+ Called automatically during initialization and can be used to rerun scenarios.
329
+ """
330
+ self._state = ScenarioState(
331
+ description=self.description,
332
+ messages=[],
333
+ thread_id=str(PKSUID("thread")),
334
+ current_turn=0,
335
+ config=self.config,
336
+ _executor=self,
337
+ )
338
+ # Pydantic doesn't actually set the _executor field from the constructor, as it's private, so we need to do it manually
339
+ self._state._executor = self
340
+
77
341
  self._pending_messages = {}
78
- self.thread_id = str(PKSUID("thread"))
79
342
  self._total_start_time = time.time()
80
343
  self._agent_times = {}
81
344
 
82
- for AgentClass in self.scenario.agents:
83
- self._agents.append(
84
- AgentClass(
85
- input=AgentInput(
86
- thread_id=self.thread_id,
87
- messages=[],
88
- new_messages=[],
89
- context=self._context or {},
90
- requested_role=list(AgentClass.roles)[0],
91
- scenario_state=self,
92
- )
93
- )
94
- )
95
-
96
345
  self._new_turn()
97
- self.current_turn = 0
346
+ self._state.current_turn = 0
98
347
 
99
- context_scenario.set(self.scenario)
348
+ context_scenario.set(self)
100
349
 
101
350
  def add_message(
102
351
  self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
103
352
  ):
104
- self.messages.append(message)
353
+ """
354
+ Add a message to the conversation and broadcast to other agents.
355
+
356
+ This method adds a message to the conversation history and makes it available
357
+ to other agents in their next call. It's used internally by the executor
358
+ and can be called from script steps to inject custom messages.
359
+
360
+ Args:
361
+ message: OpenAI-compatible message to add to the conversation
362
+ from_agent_idx: Index of the agent that generated this message.
363
+ Used to avoid broadcasting the message back to its creator.
364
+
365
+ Example:
366
+ ```
367
+ def inject_system_message(state: ScenarioState) -> None:
368
+ state.add_message({
369
+ "role": "system",
370
+ "content": "The user is now in a hurry"
371
+ })
372
+
373
+ # Use in script
374
+ result = await scenario.run(
375
+ name="system message test",
376
+ agents=[agent, user_sim, judge],
377
+ script=[
378
+ scenario.user("Hello"),
379
+ scenario.agent(),
380
+ inject_system_message,
381
+ scenario.user(), # Will see the system message
382
+ scenario.succeed()
383
+ ]
384
+ )
385
+ ```
386
+ """
387
+ self._state.messages.append(message)
105
388
 
106
389
  # Broadcast the message to other agents
107
- for idx, _ in enumerate(self._agents):
390
+ for idx, _ in enumerate(self.agents):
108
391
  if idx == from_agent_idx:
109
392
  continue
110
393
  if idx not in self._pending_messages:
111
394
  self._pending_messages[idx] = []
112
395
  self._pending_messages[idx].append(message)
113
396
 
397
+
114
398
  def add_messages(
115
399
  self,
116
400
  messages: List[ChatCompletionMessageParam],
117
401
  from_agent_idx: Optional[int] = None,
118
402
  ):
403
+ """
404
+ Add multiple messages to the conversation.
405
+
406
+ Convenience method for adding multiple messages at once. Each message
407
+ is added individually using add_message().
408
+
409
+ Args:
410
+ messages: List of OpenAI-compatible messages to add
411
+ from_agent_idx: Index of the agent that generated these messages
412
+
413
+ Example:
414
+ ```
415
+ # Agent returns multiple messages for a complex interaction
416
+ messages = [
417
+ {"role": "assistant", "content": "Let me search for that..."},
418
+ {"role": "assistant", "content": "Here's what I found: ..."}
419
+ ]
420
+ executor.add_messages(messages, from_agent_idx=0)
421
+ ```
422
+ """
119
423
  for message in messages:
120
424
  self.add_message(message, from_agent_idx)
121
425
 
122
426
  def _new_turn(self):
123
- self._pending_agents_on_turn = set(self._agents)
427
+ self._pending_agents_on_turn = set(self.agents)
124
428
  self._pending_roles_on_turn = [
125
- ScenarioAgentRole.USER,
126
- ScenarioAgentRole.AGENT,
127
- ScenarioAgentRole.JUDGE,
429
+ AgentRole.USER,
430
+ AgentRole.AGENT,
431
+ AgentRole.JUDGE,
128
432
  ]
129
- self.current_turn += 1
433
+ self._state.current_turn += 1
130
434
 
131
435
  async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
436
+ """
437
+ Execute a single step in the scenario.
438
+
439
+ A step consists of calling the next agent in the current turn's sequence
440
+ and processing their response. This method is used internally by the
441
+ scenario execution flow.
442
+
443
+ Returns:
444
+ Either a list of messages (if the scenario continues) or a
445
+ ScenarioResult (if the scenario should end)
446
+
447
+ Raises:
448
+ ValueError: If no result is returned from the internal step method
449
+
450
+ Note:
451
+ This is primarily an internal method. Most users should use the
452
+ high-level run() method or script DSL functions instead.
453
+ """
132
454
  result = await self._step()
133
455
  if result is None:
134
456
  raise ValueError("No result from step")
@@ -139,8 +461,8 @@ class ScenarioExecutor:
139
461
  go_to_next_turn=True,
140
462
  on_turn: Optional[
141
463
  Union[
142
- Callable[["ScenarioExecutor"], None],
143
- Callable[["ScenarioExecutor"], Awaitable[None]],
464
+ Callable[["ScenarioState"], None],
465
+ Callable[["ScenarioState"], Awaitable[None]],
144
466
  ]
145
467
  ] = None,
146
468
  ) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
@@ -151,9 +473,9 @@ class ScenarioExecutor:
151
473
  self._new_turn()
152
474
 
153
475
  if on_turn:
154
- await await_if_awaitable(on_turn(self))
476
+ await await_if_awaitable(on_turn(self._state))
155
477
 
156
- if self.current_turn >= (self.scenario.max_turns or 10):
478
+ if self._state.current_turn >= (self.config.max_turns or 10):
157
479
  return self._reached_max_turns()
158
480
 
159
481
  current_role = self._pending_roles_on_turn[0]
@@ -166,10 +488,14 @@ class ScenarioExecutor:
166
488
  return await self._call_agent(idx, role=current_role)
167
489
 
168
490
  def _next_agent_for_role(
169
- self, role: ScenarioAgentRole
170
- ) -> Tuple[int, Optional[ScenarioAgentAdapter]]:
171
- for idx, agent in enumerate(self._agents):
172
- if role in agent.roles and agent in self._pending_agents_on_turn:
491
+ self, role: AgentRole
492
+ ) -> Tuple[int, Optional[AgentAdapter]]:
493
+ for idx, agent in enumerate(self.agents):
494
+ if (
495
+ role == agent.role
496
+ and agent in self._pending_agents_on_turn
497
+ and agent.role in self._pending_roles_on_turn
498
+ ):
173
499
  return idx, agent
174
500
  return -1, None
175
501
 
@@ -177,8 +503,8 @@ class ScenarioExecutor:
177
503
  # If we reached max turns without conclusion, fail the test
178
504
  agent_roles_agents_idx = [
179
505
  idx
180
- for idx, agent in enumerate(self._agents)
181
- if ScenarioAgentRole.AGENT in agent.roles
506
+ for idx, agent in enumerate(self.agents)
507
+ if agent.role == AgentRole.AGENT
182
508
  ]
183
509
  agent_times = [
184
510
  self._agent_times[idx]
@@ -189,14 +515,14 @@ class ScenarioExecutor:
189
515
 
190
516
  return ScenarioResult(
191
517
  success=False,
192
- messages=self.messages,
518
+ messages=self._state.messages,
193
519
  reasoning=error_message
194
- or f"Reached maximum turns ({self.scenario.max_turns or 10}) without conclusion",
520
+ or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion",
195
521
  total_time=time.time() - self._total_start_time,
196
522
  agent_time=agent_time,
197
523
  )
198
524
 
199
- async def run(self) -> ScenarioResult:
525
+ async def _run(self) -> ScenarioResult:
200
526
  """
201
527
  Run a scenario against the agent under test.
202
528
 
@@ -206,37 +532,61 @@ class ScenarioExecutor:
206
532
  Returns:
207
533
  ScenarioResult containing the test outcome
208
534
  """
535
+ scenario_run_id = generate_scenario_run_id()
209
536
 
210
- if self.scenario.verbose:
211
- print("") # new line
537
+ try:
538
+ await self.event_bus.listen()
539
+ self._emit_run_started_event(scenario_run_id)
212
540
 
213
- self.reset()
541
+ if self.config.verbose:
542
+ print("") # new line
214
543
 
215
- for script_step in self._script:
216
- callable = script_step(self)
217
- if isinstance(callable, Awaitable):
218
- result = await callable
219
- else:
220
- result = callable
544
+ self.reset()
545
+
546
+ for script_step in self.script:
547
+ callable = script_step(self._state)
548
+ if isinstance(callable, Awaitable):
549
+ result = await callable
550
+ else:
551
+ result = callable
552
+ self._emit_message_snapshot_event(scenario_run_id)
221
553
 
222
- if isinstance(result, ScenarioResult):
223
- return result
554
+ if isinstance(result, ScenarioResult):
555
+ status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
556
+ self._emit_run_finished_event(scenario_run_id, result, status)
557
+ return result
224
558
 
225
- return self._reached_max_turns(
226
- """Reached end of script without conclusion, add one of the following to the end of the script:
559
+ result = self._reached_max_turns(
560
+ """Reached end of script without conclusion, add one of the following to the end of the script:
227
561
 
228
562
  - `scenario.proceed()` to let the simulation continue to play out
229
563
  - `scenario.judge()` to force criteria judgement
230
564
  - `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
231
- """
232
- )
565
+ """
566
+ )
567
+
568
+ status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
569
+ self._emit_run_finished_event(scenario_run_id, result, status)
570
+ return result
571
+
572
+ except Exception as e:
573
+ # Publish failure event before propagating the error
574
+ error_result = ScenarioResult(
575
+ success=False,
576
+ messages=self._state.messages,
577
+ reasoning=f"Scenario failed with error: {str(e)}",
578
+ total_time=time.time() - self._total_start_time,
579
+ agent_time=0,
580
+ )
581
+ self._emit_run_finished_event(scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR)
582
+ raise # Re-raise the exception after cleanup
233
583
 
234
584
  async def _call_agent(
235
- self, idx: int, role: ScenarioAgentRole
585
+ self, idx: int, role: AgentRole, request_judgment: bool = False
236
586
  ) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
237
- agent = self._agents[idx]
587
+ agent = self.agents[idx]
238
588
 
239
- if role == ScenarioAgentRole.USER and self.scenario.debug:
589
+ if role == AgentRole.USER and self.config.debug:
240
590
  print(
241
591
  f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
242
592
  )
@@ -258,28 +608,26 @@ class ScenarioExecutor:
258
608
  with show_spinner(
259
609
  text=(
260
610
  "Judging..."
261
- if role == ScenarioAgentRole.JUDGE
262
- else f"{role.value if isinstance(role, ScenarioAgentRole) else role}:"
611
+ if role == AgentRole.JUDGE
612
+ else f"{role.value if isinstance(role, AgentRole) else role}:"
263
613
  ),
264
614
  color=(
265
615
  "blue"
266
- if role == ScenarioAgentRole.AGENT
267
- else "green" if role == ScenarioAgentRole.USER else "yellow"
616
+ if role == AgentRole.AGENT
617
+ else "green" if role == AgentRole.USER else "yellow"
268
618
  ),
269
- enabled=self.scenario.verbose,
619
+ enabled=self.config.verbose,
270
620
  ):
271
621
  start_time = time.time()
272
622
 
273
623
  agent_response = agent.call(
274
624
  AgentInput(
275
625
  # TODO: test thread_id
276
- thread_id=self.thread_id,
277
- messages=self.messages,
626
+ thread_id=self._state.thread_id,
627
+ messages=self._state.messages,
278
628
  new_messages=self._pending_messages.get(idx, []),
279
- # TODO: test context
280
- context=self._context or {},
281
- requested_role=role,
282
- scenario_state=self,
629
+ judgment_request=request_judgment,
630
+ scenario_state=self._state,
283
631
  )
284
632
  )
285
633
  if not isinstance(agent_response, Awaitable):
@@ -303,12 +651,12 @@ class ScenarioExecutor:
303
651
  else:
304
652
  messages = convert_agent_return_types_to_openai_messages(
305
653
  agent_response,
306
- role="user" if role == ScenarioAgentRole.USER else "assistant",
654
+ role="user" if role == AgentRole.USER else "assistant",
307
655
  )
308
656
 
309
657
  self.add_messages(messages, from_agent_idx=idx)
310
658
 
311
- if messages and self.scenario.verbose:
659
+ if messages and self.config.verbose:
312
660
  print_openai_messages(
313
661
  self._scenario_name(),
314
662
  [m for m in messages if m["role"] != "system"],
@@ -317,75 +665,51 @@ class ScenarioExecutor:
317
665
  return messages
318
666
 
319
667
  def _scenario_name(self):
320
- if self.scenario.verbose == 2:
321
- return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
668
+ if self.config.verbose == 2:
669
+ return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")
322
670
  else:
323
671
  return ""
324
672
 
325
- # State access utils
326
-
327
- def last_message(self) -> ChatCompletionMessageParam:
328
- if len(self.messages) == 0:
329
- raise ValueError("No messages found")
330
- return self.messages[-1]
331
-
332
- def last_user_message(self) -> ChatCompletionUserMessageParam:
333
- user_messages = [m for m in self.messages if m["role"] == "user"]
334
- if not user_messages:
335
- raise ValueError("No user messages found")
336
- return user_messages[-1]
337
-
338
- def last_tool_call(
339
- self, tool_name: str
340
- ) -> Optional[ChatCompletionMessageToolCallParam]:
341
- for message in reversed(self.messages):
342
- if message["role"] == "assistant" and "tool_calls" in message:
343
- for tool_call in message["tool_calls"]:
344
- if tool_call["function"]["name"] == tool_name:
345
- return tool_call
346
- return None
347
-
348
- def has_tool_call(self, tool_name: str) -> bool:
349
- return self.last_tool_call(tool_name) is not None
350
-
351
673
  # Scripting utils
352
674
 
353
675
  async def message(self, message: ChatCompletionMessageParam) -> None:
354
676
  if message["role"] == "user":
355
- await self._script_call_agent(ScenarioAgentRole.USER, message)
677
+ await self._script_call_agent(AgentRole.USER, message)
356
678
  elif message["role"] == "assistant":
357
- await self._script_call_agent(ScenarioAgentRole.AGENT, message)
679
+ await self._script_call_agent(AgentRole.AGENT, message)
358
680
  else:
359
681
  self.add_message(message)
360
682
 
361
683
  async def user(
362
684
  self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
363
685
  ) -> None:
364
- await self._script_call_agent(ScenarioAgentRole.USER, content)
686
+ await self._script_call_agent(AgentRole.USER, content)
365
687
 
366
688
  async def agent(
367
689
  self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
368
690
  ) -> None:
369
- await self._script_call_agent(ScenarioAgentRole.AGENT, content)
691
+ await self._script_call_agent(AgentRole.AGENT, content)
370
692
 
371
693
  async def judge(
372
694
  self, content: Optional[Union[str, ChatCompletionMessageParam]] = None
373
695
  ) -> Optional[ScenarioResult]:
374
- return await self._script_call_agent(ScenarioAgentRole.JUDGE, content)
696
+ return await self._script_call_agent(
697
+ AgentRole.JUDGE, content, request_judgment=True
698
+ )
375
699
 
376
700
  async def proceed(
377
701
  self,
378
702
  turns: Optional[int] = None,
379
703
  on_turn: Optional[
380
704
  Union[
381
- Callable[["ScenarioExecutor"], None],
382
- Callable[["ScenarioExecutor"], Awaitable[None]],
705
+ Callable[["ScenarioState"], None],
706
+ Callable[["ScenarioState"], Awaitable[None]],
383
707
  ]
384
708
  ] = None,
385
709
  on_step: Optional[
386
710
  Union[
387
- Callable[["ScenarioExecutor"], None],
388
- Callable[["ScenarioExecutor"], Awaitable[None]],
711
+ Callable[["ScenarioState"], None],
712
+ Callable[["ScenarioState"], Awaitable[None]],
389
713
  ]
390
714
  ] = None,
391
715
  ) -> Optional[ScenarioResult]:
@@ -396,71 +720,223 @@ class ScenarioExecutor:
396
720
  go_to_next_turn=(
397
721
  turns is None
398
722
  or initial_turn is None
399
- or (self.current_turn + 1 < initial_turn + turns)
723
+ or (self._state.current_turn + 1 < initial_turn + turns)
400
724
  ),
401
725
  )
402
726
 
403
727
  if initial_turn is None:
404
- initial_turn = self.current_turn
728
+ initial_turn = self._state.current_turn
405
729
 
406
730
  if next_message is None:
407
731
  break
408
732
 
409
733
  if on_step:
410
- await await_if_awaitable(on_step(self))
734
+ await await_if_awaitable(on_step(self._state))
411
735
 
412
736
  if isinstance(next_message, ScenarioResult):
413
737
  return next_message
414
738
 
415
- async def succeed(self) -> ScenarioResult:
739
+ async def succeed(self, reasoning: Optional[str] = None) -> ScenarioResult:
416
740
  return ScenarioResult(
417
741
  success=True,
418
- messages=self.messages,
419
- reasoning="Scenario marked as successful with scenario.succeed()",
420
- passed_criteria=self.scenario.criteria,
742
+ messages=self._state.messages,
743
+ reasoning=reasoning
744
+ or "Scenario marked as successful with scenario.succeed()",
421
745
  )
422
746
 
423
- async def fail(self) -> ScenarioResult:
747
+ async def fail(self, reasoning: Optional[str] = None) -> ScenarioResult:
424
748
  return ScenarioResult(
425
749
  success=False,
426
- messages=self.messages,
427
- reasoning="Scenario marked as failed with scenario.fail()",
428
- passed_criteria=self.scenario.criteria,
750
+ messages=self._state.messages,
751
+ reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
429
752
  )
430
753
 
754
+ def _consume_until_role(self, role: AgentRole) -> None:
755
+ while len(self._pending_roles_on_turn) > 0:
756
+ next_role = self._pending_roles_on_turn[0]
757
+ if next_role == role:
758
+ break
759
+ self._pending_roles_on_turn.pop(0)
760
+
431
761
  async def _script_call_agent(
432
762
  self,
433
- role: ScenarioAgentRole,
763
+ role: AgentRole,
434
764
  content: Optional[Union[str, ChatCompletionMessageParam]] = None,
765
+ request_judgment: bool = False,
435
766
  ) -> Optional[ScenarioResult]:
767
+ self._consume_until_role(role)
436
768
  idx, next_agent = self._next_agent_for_role(role)
437
769
  if not next_agent:
438
770
  self._new_turn()
771
+ self._consume_until_role(role)
439
772
  idx, next_agent = self._next_agent_for_role(role)
440
773
 
441
774
  if not next_agent:
775
+ role_class = (
776
+ "a scenario.UserSimulatorAgent()"
777
+ if role == AgentRole.USER
778
+ else (
779
+ "a scenario.JudgeAgent()"
780
+ if role == AgentRole.JUDGE
781
+ else "your agent"
782
+ )
783
+ )
442
784
  if content:
443
785
  raise ValueError(
444
- f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found"
786
+ f"Cannot generate a message for role `{role.value}` with content `{content}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
445
787
  )
446
788
  raise ValueError(
447
- f"Cannot generate a message for role `{role.value}` because no agent with this role was found"
789
+ f"Cannot generate a message for role `{role.value}` because no agent with this role was found, please add {role_class} to the scenario `agents` list"
448
790
  )
449
791
 
450
792
  self._pending_agents_on_turn.remove(next_agent)
451
- self._pending_roles_on_turn.remove(role)
452
793
 
453
794
  if content:
454
795
  if isinstance(content, str):
455
- message = ChatCompletionUserMessageParam(role="user", content=content)
796
+ message = (
797
+ ChatCompletionUserMessageParam(role="user", content=content)
798
+ if role == AgentRole.USER
799
+ else ChatCompletionAssistantMessageParam(
800
+ role="assistant", content=content
801
+ )
802
+ )
456
803
  else:
457
804
  message = content
458
805
 
459
806
  self.add_message(message)
460
- if self.scenario.verbose:
807
+ if self.config.verbose:
461
808
  print_openai_messages(self._scenario_name(), [message])
462
809
  return
463
810
 
464
- result = await self._call_agent(idx, role=role)
811
+ result = await self._call_agent(
812
+ idx, role=role, request_judgment=request_judgment
813
+ )
465
814
  if isinstance(result, ScenarioResult):
466
815
  return result
816
+
817
+ # Event handling methods
818
+
819
+ class _CommonEventFields(TypedDict):
820
+ """
821
+ Common fields shared across all scenario events.
822
+
823
+ These fields provide consistent identification and timing information
824
+ for all events emitted during scenario execution.
825
+
826
+ Attributes:
827
+ batch_run_id: Unique identifier for the batch of scenario runs
828
+ scenario_run_id: Unique identifier for this specific scenario run
829
+ scenario_id: Human-readable name/identifier for the scenario
830
+ timestamp: Unix timestamp in milliseconds when the event occurred
831
+ """
832
+ batch_run_id: str
833
+ scenario_run_id: str
834
+ scenario_id: str
835
+ timestamp: int
836
+
837
+ def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
838
+ """
839
+ Create common fields used across all scenario events.
840
+
841
+ This method generates the standard fields that every scenario event
842
+ must include for proper identification and timing.
843
+
844
+ Args:
845
+ scenario_run_id: Unique identifier for the current scenario run
846
+
847
+ Returns:
848
+ Dictionary containing common event fields with current timestamp
849
+ """
850
+ return {
851
+ "batch_run_id": self.batch_run_id,
852
+ "scenario_run_id": scenario_run_id,
853
+ "scenario_id": self.name,
854
+ "timestamp": int(time.time() * 1000),
855
+ }
856
+
857
+ def _emit_run_started_event(self, scenario_run_id: str) -> None:
858
+ """
859
+ Emit a scenario run started event.
860
+
861
+ This event is published when a scenario begins execution. It includes
862
+ metadata about the scenario such as name and description, and is used
863
+ to track the start of scenario runs in monitoring systems.
864
+
865
+ Args:
866
+ scenario_run_id: Unique identifier for the current scenario run
867
+
868
+ Note:
869
+ This event is automatically published at the beginning of `_run()`
870
+ and signals the start of scenario execution to any event listeners.
871
+ """
872
+ common_fields = self._create_common_event_fields(scenario_run_id)
873
+ metadata = ScenarioRunStartedEventMetadata(
874
+ name=self.name,
875
+ description=self.description,
876
+ )
877
+
878
+ event = ScenarioRunStartedEvent(
879
+ **common_fields,
880
+ metadata=metadata,
881
+ )
882
+ self.event_bus.publish(event)
883
+
884
+ def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
885
+ """
886
+ Emit a message snapshot event.
887
+
888
+ This event captures the current state of the conversation during
889
+ scenario execution. It's published whenever messages are added to
890
+ the conversation, allowing real-time tracking of scenario progress.
891
+
892
+ Note:
893
+ This event is automatically published by `add_message()` and
894
+ `add_messages()` to provide continuous visibility into scenario
895
+ execution state.
896
+ """
897
+ common_fields = self._create_common_event_fields(scenario_run_id)
898
+
899
+ event = ScenarioMessageSnapshotEvent(
900
+ **common_fields,
901
+ messages=convert_messages_to_ag_ui_messages(self._state.messages),
902
+ )
903
+ self.event_bus.publish(event)
904
+
905
+ def _emit_run_finished_event(
906
+ self,
907
+ scenario_run_id: str,
908
+ result: ScenarioResult,
909
+ status: ScenarioRunFinishedEventStatus
910
+ ) -> None:
911
+ """
912
+ Emit a scenario run finished event.
913
+
914
+ This event is published when a scenario completes execution, whether
915
+ successfully or with an error. It includes the final results, verdict,
916
+ and reasoning for the scenario outcome.
917
+
918
+ Args:
919
+ scenario_run_id: Unique identifier for the current scenario run
920
+ result: The final scenario result containing success/failure status
921
+ status: The execution status (SUCCESS, FAILED, or ERROR)
922
+
923
+ Note:
924
+ This event is automatically published at the end of `_run()` and
925
+ signals the completion of scenario execution to any event listeners.
926
+ It includes detailed results for monitoring and analysis purposes.
927
+ """
928
+ common_fields = self._create_common_event_fields(scenario_run_id)
929
+
930
+ results = ScenarioRunFinishedEventResults(
931
+ verdict=ScenarioRunFinishedEventVerdict.SUCCESS if result.success else ScenarioRunFinishedEventVerdict.FAILURE,
932
+ reasoning=result.reasoning or "",
933
+ met_criteria=result.passed_criteria,
934
+ unmet_criteria=result.failed_criteria,
935
+ )
936
+
937
+ event = ScenarioRunFinishedEvent(
938
+ **common_fields,
939
+ status=status,
940
+ results=results,
941
+ )
942
+ self.event_bus.publish(event)