langwatch-scenario 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/METADATA +93 -71
- langwatch_scenario-0.6.0.dist-info/RECORD +27 -0
- scenario/__init__.py +11 -114
- scenario/_utils/__init__.py +32 -0
- scenario/_utils/ids.py +58 -0
- scenario/_utils/message_conversion.py +103 -0
- scenario/{utils.py → _utils/utils.py} +21 -110
- scenario/agent_adapter.py +8 -4
- scenario/cache.py +4 -3
- scenario/config.py +7 -5
- scenario/events/__init__.py +66 -0
- scenario/events/event_bus.py +175 -0
- scenario/events/event_reporter.py +83 -0
- scenario/events/events.py +169 -0
- scenario/events/messages.py +84 -0
- scenario/events/utils.py +86 -0
- scenario/judge_agent.py +7 -28
- scenario/pytest_plugin.py +2 -47
- scenario/scenario_executor.py +268 -84
- scenario/scenario_state.py +6 -6
- scenario/script.py +9 -9
- scenario/types.py +10 -6
- scenario/user_simulator_agent.py +4 -11
- langwatch_scenario-0.4.0.dist-info/RECORD +0 -18
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/top_level.txt +0 -0
- /scenario/{error_messages.py → _error_messages.py} +0 -0
scenario/pytest_plugin.py
CHANGED
@@ -8,7 +8,7 @@ pytest-based testing workflows.
 """
 
 import pytest
-from typing import TypedDict
+from typing import TypedDict
 import functools
 from termcolor import colored
 
@@ -16,7 +16,6 @@ from scenario.config import ScenarioConfig
 from scenario.types import ScenarioResult
 
 from .scenario_executor import ScenarioExecutor
-import scenario
 
 
 class ScenarioReporterResults(TypedDict):
@@ -46,23 +45,6 @@ class ScenarioReporter:
 
     Attributes:
         results: List of all scenario test results collected during the session
-
-    Example:
-        The reporter is used automatically, but you can access it in tests:
-
-        ```python
-        def test_my_scenarios(scenario_reporter):
-            # Run your scenarios
-            result1 = await scenario.run(...)
-            result2 = await scenario.run(...)
-
-            # Check collected results
-            assert len(scenario_reporter.results) == 2
-
-            # Get summary statistics
-            summary = scenario_reporter.get_summary()
-            print(f"Success rate: {summary['success_rate']}%")
-        ```
     """
 
     def __init__(self):
@@ -80,21 +62,6 @@ class ScenarioReporter:
         Args:
             scenario: The ScenarioExecutor instance that ran the test
             result: The ScenarioResult containing test outcome and details
-
-        Example:
-            ```python
-            # This happens automatically when you run scenarios
-            result = await scenario.run(
-                name="my test",
-                description="Test description",
-                agents=[
-                    my_agent,
-                    scenario.UserSimulatorAgent(),
-                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
-                ]
-            )
-            # Result is automatically added to the global reporter
-            ```
         """
         self.results.append({"scenario": scenario, "result": result})
 
@@ -111,18 +78,6 @@ class ScenarioReporter:
                 - passed: Number of scenarios that passed
                 - failed: Number of scenarios that failed
                 - success_rate: Percentage of scenarios that passed (0-100)
-
-        Example:
-            ```python
-            def test_summary_check(scenario_reporter):
-                # Run some scenarios...
-                await scenario.run(...)
-                await scenario.run(...)
-
-                summary = scenario_reporter.get_summary()
-                assert summary['total'] == 2
-                assert summary['success_rate'] >= 80 # Require 80% success rate
-            ```
         """
         total = len(self.results)
         passed = sum(1 for r in self.results if r["result"].success)
@@ -347,7 +302,7 @@ def scenario_reporter(request):
         ScenarioReporter: The global reporter instance collecting all scenario results
 
     Example:
-        ```
+        ```
        @pytest.mark.agent_test
        def test_with_custom_reporting(scenario_reporter):
            # Run your scenarios
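
Note: the hunks above remove the usage examples from the `ScenarioReporter` docstrings without replacing them elsewhere in this file. For orientation, a minimal sketch of how the `scenario_reporter` fixture is typically exercised, pieced together from the removed examples (the `my_agent` adapter is a placeholder, and the async test marker is an assumption about the surrounding pytest setup):

```python
import pytest
import scenario


@pytest.mark.agent_test
@pytest.mark.asyncio  # assumption: an async runner such as pytest-asyncio is configured
async def test_my_scenarios(scenario_reporter):
    # Run a scenario; the plugin appends the result to the shared reporter
    result = await scenario.run(
        name="my test",
        description="Test description",
        agents=[
            my_agent,  # placeholder: your AgentAdapter implementation
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent provides helpful response"]),
        ],
    )
    assert result.success

    # Inspect the collected results and summary statistics
    assert len(scenario_reporter.results) == 1
    summary = scenario_reporter.get_summary()
    print(f"Success rate: {summary['success_rate']}%")
```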
scenario/scenario_executor.py
CHANGED
@@ -12,11 +12,11 @@ from typing import (
     Callable,
     Dict,
     List,
-    Any,
     Optional,
     Set,
     Tuple,
     Union,
+    TypedDict,
 )
 import time
 import termcolor
@@ -24,25 +24,39 @@ import asyncio
 import concurrent.futures
 
 from scenario.config import ScenarioConfig
-from scenario.utils import (
-    await_if_awaitable,
+from scenario._utils import (
     check_valid_return_type,
     convert_agent_return_types_to_openai_messages,
     print_openai_messages,
     show_spinner,
+    await_if_awaitable,
+    get_or_create_batch_run_id,
+    generate_scenario_run_id,
 )
 from openai.types.chat import (
     ChatCompletionMessageParam,
     ChatCompletionUserMessageParam,
+    ChatCompletionAssistantMessageParam,
 )
 
 from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
-from .error_messages import agent_response_not_awaitable
+from ._error_messages import agent_response_not_awaitable
 from .cache import context_scenario
 from .agent_adapter import AgentAdapter
 from .script import proceed
 from pksuid import PKSUID
 from .scenario_state import ScenarioState
+from .events import (
+    ScenarioEventBus,
+    ScenarioRunStartedEvent,
+    ScenarioMessageSnapshotEvent,
+    ScenarioRunFinishedEvent,
+    ScenarioRunStartedEventMetadata,
+    ScenarioRunFinishedEventResults,
+    ScenarioRunFinishedEventVerdict,
+    ScenarioRunFinishedEventStatus,
+    convert_messages_to_ag_ui_messages,
+)
 
 
 class ScenarioExecutor:
@@ -68,30 +82,30 @@ class ScenarioExecutor:
         config: Configuration settings for execution behavior
 
     Example:
-        ```
+        ```
        # Direct instantiation (less common)
        executor = ScenarioExecutor(
-
-
-
-
-
-
-
-
-
+            name="weather query test",
+            description="User asks about weather, agent should provide helpful response",
+            agents=[
+                weather_agent,
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+            ],
+            max_turns=10,
+            verbose=True
        )
        result = await executor._run()
 
        # Preferred high-level API
        result = await scenario.run(
-
-
-
-
-
-
-
+            name="weather query test",
+            description="User asks about weather, agent should provide helpful response",
+            agents=[
+                weather_agent,
+                scenario.UserSimulatorAgent(),
+                scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
+            ]
        )
 
        # Results analysis
@@ -101,6 +115,7 @@ class ScenarioExecutor:
        - Debug mode allows step-by-step execution with user intervention
        - Results include detailed timing information and conversation history
     """
+
     name: str
     description: str
     agents: List[AgentAdapter]
@@ -116,6 +131,10 @@ class ScenarioExecutor:
     _pending_agents_on_turn: Set[AgentAdapter] = set()
     _agent_times: Dict[int, float] = {}
 
+    event_bus: ScenarioEventBus
+
+    batch_run_id: str
+
     def __init__(
         self,
         name: str,
@@ -127,6 +146,7 @@ class ScenarioExecutor:
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
         debug: Optional[bool] = None,
+        event_bus: Optional[ScenarioEventBus] = None,
     ):
         """
         Initialize a scenario executor.
@@ -147,6 +167,7 @@ class ScenarioExecutor:
                Overrides global configuration for this scenario.
            debug: Whether to enable debug mode with step-by-step execution.
                Overrides global configuration for this scenario.
+            event_reporter: Optional event reporter for the scenario
 
         Example:
             ```python
@@ -183,6 +204,10 @@ class ScenarioExecutor:
 
         self.reset()
 
+        self.event_bus = event_bus or ScenarioEventBus()
+
+        self.batch_run_id = get_or_create_batch_run_id()
+
     @classmethod
     async def run(
         cls,
@@ -217,35 +242,35 @@ class ScenarioExecutor:
            success/failure status, and detailed reasoning
 
         Example:
-            ```
+            ```
            import scenario
 
            # Simple scenario with automatic flow
            result = await scenario.run(
-
-
-
-
-
-
-
+                name="help request",
+                description="User asks for help with a technical problem",
+                agents=[
+                    my_agent,
+                    scenario.UserSimulatorAgent(),
+                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                ]
            )
 
            # Scripted scenario with custom evaluations
            result = await scenario.run(
-
-
-
-
-
-
-
-
-
-
-
-
-
+                name="custom interaction",
+                description="Test specific conversation flow",
+                agents=[
+                    my_agent,
+                    scenario.UserSimulatorAgent(),
+                    scenario.JudgeAgent(criteria=["Agent provides helpful response"])
+                ],
+                script=[
+                    scenario.user("Hello"),
+                    scenario.agent(),
+                    custom_eval,
+                    scenario.succeed()
+                ]
            )
 
            # Results analysis
@@ -284,6 +309,7 @@ class ScenarioExecutor:
             try:
                 return loop.run_until_complete(scenario._run())
             finally:
+                loop.run_until_complete(scenario.event_bus.drain())
                 loop.close()
 
         # Run the function in the thread pool and await its result
@@ -300,18 +326,6 @@ class ScenarioExecutor:
        This method reinitializes all internal state for a fresh scenario run,
        including conversation history, turn counters, and agent timing information.
        Called automatically during initialization and can be used to rerun scenarios.
-
-        Example:
-            ```python
-            executor = ScenarioExecutor(...)
-
-            # Run first test
-            result1 = await executor._run()
-
-            # Reset and run again
-            executor.reset()
-            result2 = await executor._run()
-            ```
        """
        self._state = ScenarioState(
            description=self.description,
@@ -349,24 +363,24 @@ class ScenarioExecutor:
            Used to avoid broadcasting the message back to its creator.
 
         Example:
-            ```
+            ```
            def inject_system_message(state: ScenarioState) -> None:
-                state.
+                state.add_message({
                    "role": "system",
                    "content": "The user is now in a hurry"
                })
 
            # Use in script
            result = await scenario.run(
-
-
-
-
-
-
-
-
-
+                name="system message test",
+                agents=[agent, user_sim, judge],
+                script=[
+                    scenario.user("Hello"),
+                    scenario.agent(),
+                    inject_system_message,
+                    scenario.user(), # Will see the system message
+                    scenario.succeed()
+                ]
            )
            ```
        """
@@ -380,6 +394,7 @@ class ScenarioExecutor:
            self._pending_messages[idx] = []
        self._pending_messages[idx].append(message)
 
+
    def add_messages(
        self,
        messages: List[ChatCompletionMessageParam],
@@ -396,7 +411,7 @@ class ScenarioExecutor:
            from_agent_idx: Index of the agent that generated these messages
 
        Example:
-            ```
+            ```
            # Agent returns multiple messages for a complex interaction
            messages = [
                {"role": "assistant", "content": "Let me search for that..."},
@@ -476,7 +491,11 @@ class ScenarioExecutor:
        self, role: AgentRole
    ) -> Tuple[int, Optional[AgentAdapter]]:
        for idx, agent in enumerate(self.agents):
-            if
+            if (
+                role == agent.role
+                and agent in self._pending_agents_on_turn
+                and agent.role in self._pending_roles_on_turn
+            ):
                return idx, agent
        return -1, None
 
@@ -513,30 +532,54 @@ class ScenarioExecutor:
        Returns:
            ScenarioResult containing the test outcome
        """
+        scenario_run_id = generate_scenario_run_id()
 
-
-
+        try:
+            await self.event_bus.listen()
+            self._emit_run_started_event(scenario_run_id)
 
-
+            if self.config.verbose:
+                print("") # new line
 
-
-            callable = script_step(self._state)
-            if isinstance(callable, Awaitable):
-                result = await callable
-            else:
-                result = callable
+            self.reset()
 
-
-
+            for script_step in self.script:
+                callable = script_step(self._state)
+                if isinstance(callable, Awaitable):
+                    result = await callable
+                else:
+                    result = callable
+                self._emit_message_snapshot_event(scenario_run_id)
 
-
-
+                if isinstance(result, ScenarioResult):
+                    status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
+                    self._emit_run_finished_event(scenario_run_id, result, status)
+                    return result
+
+            result = self._reached_max_turns(
+                """Reached end of script without conclusion, add one of the following to the end of the script:
 
 - `scenario.proceed()` to let the simulation continue to play out
 - `scenario.judge()` to force criteria judgement
 - `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
-
-
+"""
+            )
+
+            status = ScenarioRunFinishedEventStatus.SUCCESS if result.success else ScenarioRunFinishedEventStatus.FAILED
+            self._emit_run_finished_event(scenario_run_id, result, status)
+            return result
+
+        except Exception as e:
+            # Publish failure event before propagating the error
+            error_result = ScenarioResult(
+                success=False,
+                messages=self._state.messages,
+                reasoning=f"Scenario failed with error: {str(e)}",
+                total_time=time.time() - self._total_start_time,
+                agent_time=0,
+            )
+            self._emit_run_finished_event(scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR)
+            raise # Re-raise the exception after cleanup
 
    async def _call_agent(
        self, idx: int, role: AgentRole, request_judgment: bool = False
@@ -708,15 +751,24 @@ class ScenarioExecutor:
            reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
        )
 
+    def _consume_until_role(self, role: AgentRole) -> None:
+        while len(self._pending_roles_on_turn) > 0:
+            next_role = self._pending_roles_on_turn[0]
+            if next_role == role:
+                break
+            self._pending_roles_on_turn.pop(0)
+
    async def _script_call_agent(
        self,
        role: AgentRole,
        content: Optional[Union[str, ChatCompletionMessageParam]] = None,
        request_judgment: bool = False,
    ) -> Optional[ScenarioResult]:
+        self._consume_until_role(role)
        idx, next_agent = self._next_agent_for_role(role)
        if not next_agent:
            self._new_turn()
+            self._consume_until_role(role)
            idx, next_agent = self._next_agent_for_role(role)
 
        if not next_agent:
@@ -738,11 +790,16 @@ class ScenarioExecutor:
        )
 
        self._pending_agents_on_turn.remove(next_agent)
-        self._pending_roles_on_turn.remove(role)
 
        if content:
            if isinstance(content, str):
-                message =
+                message = (
+                    ChatCompletionUserMessageParam(role="user", content=content)
+                    if role == AgentRole.USER
+                    else ChatCompletionAssistantMessageParam(
+                        role="assistant", content=content
+                    )
+                )
            else:
                message = content
 
@@ -756,3 +813,130 @@
        )
        if isinstance(result, ScenarioResult):
            return result
+
+    # Event handling methods
+
+    class _CommonEventFields(TypedDict):
+        """
+        Common fields shared across all scenario events.
+
+        These fields provide consistent identification and timing information
+        for all events emitted during scenario execution.
+
+        Attributes:
+            batch_run_id: Unique identifier for the batch of scenario runs
+            scenario_run_id: Unique identifier for this specific scenario run
+            scenario_id: Human-readable name/identifier for the scenario
+            timestamp: Unix timestamp in milliseconds when the event occurred
+        """
+        batch_run_id: str
+        scenario_run_id: str
+        scenario_id: str
+        timestamp: int
+
+    def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
+        """
+        Create common fields used across all scenario events.
+
+        This method generates the standard fields that every scenario event
+        must include for proper identification and timing.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+
+        Returns:
+            Dictionary containing common event fields with current timestamp
+        """
+        return {
+            "batch_run_id": self.batch_run_id,
+            "scenario_run_id": scenario_run_id,
+            "scenario_id": self.name,
+            "timestamp": int(time.time() * 1000),
+        }
+
+    def _emit_run_started_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a scenario run started event.
+
+        This event is published when a scenario begins execution. It includes
+        metadata about the scenario such as name and description, and is used
+        to track the start of scenario runs in monitoring systems.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+
+        Note:
+            This event is automatically published at the beginning of `_run()`
+            and signals the start of scenario execution to any event listeners.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+        metadata = ScenarioRunStartedEventMetadata(
+            name=self.name,
+            description=self.description,
+        )
+
+        event = ScenarioRunStartedEvent(
+            **common_fields,
+            metadata=metadata,
+        )
+        self.event_bus.publish(event)
+
+    def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
+        """
+        Emit a message snapshot event.
+
+        This event captures the current state of the conversation during
+        scenario execution. It's published whenever messages are added to
+        the conversation, allowing real-time tracking of scenario progress.
+
+        Note:
+            This event is automatically published by `add_message()` and
+            `add_messages()` to provide continuous visibility into scenario
+            execution state.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+
+        event = ScenarioMessageSnapshotEvent(
+            **common_fields,
+            messages=convert_messages_to_ag_ui_messages(self._state.messages),
+        )
+        self.event_bus.publish(event)
+
+    def _emit_run_finished_event(
+        self,
+        scenario_run_id: str,
+        result: ScenarioResult,
+        status: ScenarioRunFinishedEventStatus
+    ) -> None:
+        """
+        Emit a scenario run finished event.
+
+        This event is published when a scenario completes execution, whether
+        successfully or with an error. It includes the final results, verdict,
+        and reasoning for the scenario outcome.
+
+        Args:
+            scenario_run_id: Unique identifier for the current scenario run
+            result: The final scenario result containing success/failure status
+            status: The execution status (SUCCESS, FAILED, or ERROR)
+
+        Note:
+            This event is automatically published at the end of `_run()` and
+            signals the completion of scenario execution to any event listeners.
+            It includes detailed results for monitoring and analysis purposes.
+        """
+        common_fields = self._create_common_event_fields(scenario_run_id)
+
+        results = ScenarioRunFinishedEventResults(
+            verdict=ScenarioRunFinishedEventVerdict.SUCCESS if result.success else ScenarioRunFinishedEventVerdict.FAILURE,
+            reasoning=result.reasoning or "",
+            met_criteria=result.passed_criteria,
+            unmet_criteria=result.failed_criteria,
+        )
+
+        event = ScenarioRunFinishedEvent(
+            **common_fields,
+            status=status,
+            results=results,
+        )
+        self.event_bus.publish(event)
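
Most of the new surface in this file is the event plumbing: each run now gets a `scenario_run_id` (plus a shared `batch_run_id`) and publishes run-started, message-snapshot, and run-finished events through a `ScenarioEventBus`, with `drain()` awaited before the event loop closes. A rough sketch of how a caller might hook into that bus, using only the names visible in this diff; subclassing `ScenarioEventBus` and overriding `publish()` is an assumption about its API, not something the diff guarantees:

```python
from scenario.events import ScenarioEventBus
from scenario.scenario_executor import ScenarioExecutor


class LoggingEventBus(ScenarioEventBus):
    # Assumption: publish() can be overridden and safely delegate back to the parent bus
    def publish(self, event) -> None:
        print(f"[scenario event] {type(event).__name__}")
        super().publish(event)


executor = ScenarioExecutor(
    name="weather query test",
    description="User asks about weather, agent should provide helpful response",
    agents=[],  # fill in your AgentAdapter, UserSimulatorAgent, and JudgeAgent
    event_bus=LoggingEventBus(),  # optional; defaults to ScenarioEventBus()
)
# await executor._run()  # emits run-started, per-step snapshots, then run-finished
```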
scenario/scenario_state.py
CHANGED
@@ -36,7 +36,7 @@ class ScenarioState(BaseModel):
        config: Configuration settings for this scenario execution
 
    Example:
-        ```
+        ```
        def check_agent_behavior(state: ScenarioState) -> None:
            # Check if the agent called a specific tool
            if state.has_tool_call("get_weather"):
@@ -87,7 +87,7 @@ class ScenarioState(BaseModel):
            message: OpenAI-compatible message to add to the conversation
 
        Example:
-            ```
+            ```
            def inject_system_message(state: ScenarioState) -> None:
                state.add_message({
                    "role": "system",
@@ -108,7 +108,7 @@ class ScenarioState(BaseModel):
            ValueError: If no messages exist in the conversation
 
        Example:
-            ```
+            ```
            def check_last_response(state: ScenarioState) -> None:
                last = state.last_message()
                if last["role"] == "assistant":
@@ -131,7 +131,7 @@ class ScenarioState(BaseModel):
            ValueError: If no user messages exist in the conversation
 
        Example:
-            ```
+            ```
            def analyze_user_intent(state: ScenarioState) -> None:
                user_msg = state.last_user_message()
                content = user_msg["content"]
@@ -162,7 +162,7 @@ class ScenarioState(BaseModel):
            The tool call object if found, None otherwise
 
        Example:
-            ```
+            ```
            def verify_weather_call(state: ScenarioState) -> None:
                weather_call = state.last_tool_call("get_current_weather")
                if weather_call:
@@ -192,7 +192,7 @@ class ScenarioState(BaseModel):
            True if the tool has been called, False otherwise
 
        Example:
-            ```
+            ```
            def ensure_tool_usage(state: ScenarioState) -> None:
                # Verify the agent used required tools
                assert state.has_tool_call("search_database")