langwatch-scenario 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/METADATA +93 -71
- langwatch_scenario-0.6.0.dist-info/RECORD +27 -0
- scenario/__init__.py +11 -114
- scenario/_utils/__init__.py +32 -0
- scenario/_utils/ids.py +58 -0
- scenario/_utils/message_conversion.py +103 -0
- scenario/{utils.py → _utils/utils.py} +21 -110
- scenario/agent_adapter.py +8 -4
- scenario/cache.py +4 -3
- scenario/config.py +7 -5
- scenario/events/__init__.py +66 -0
- scenario/events/event_bus.py +175 -0
- scenario/events/event_reporter.py +83 -0
- scenario/events/events.py +169 -0
- scenario/events/messages.py +84 -0
- scenario/events/utils.py +86 -0
- scenario/judge_agent.py +7 -28
- scenario/pytest_plugin.py +2 -47
- scenario/scenario_executor.py +268 -84
- scenario/scenario_state.py +6 -6
- scenario/script.py +9 -9
- scenario/types.py +10 -6
- scenario/user_simulator_agent.py +4 -11
- langwatch_scenario-0.4.0.dist-info/RECORD +0 -18
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/top_level.txt +0 -0
- /scenario/{error_messages.py → _error_messages.py} +0 -0
{langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.4.0
+Version: 0.6.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -26,6 +26,11 @@ Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
 Requires-Dist: rich<15.0.0,>=13.3.3
 Requires-Dist: pksuid>=1.1.2
+Requires-Dist: pdoc3>=0.11.6
+Requires-Dist: ag-ui-protocol>=0.1.0
+Requires-Dist: httpx>=0.27.0
+Requires-Dist: rx>=3.2.0
+Requires-Dist: respx>=0.22.0
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
@@ -44,65 +49,59 @@ Requires-Dist: function-schema; extra == "dev"
 
 # Scenario
 
-Scenario is an Agent Testing Framework
+Scenario is an Agent Testing Framework based on simulations, it can:
 
-
-
--
--
--
-- Works in combination with any testing and LLM evaluation frameworks, completely agnostic
-- Works with any LLM and Agent Framework, easy integration
+- Test real agent behavior by simulating users in different scenarios and edge cases
+- Evaluate and judge at any point of the conversation, powerful multi-turn control
+- Combine it with any LLM eval framework or custom evals, agnostic by design
+- Integrate your Agent by implementing just one `call()` method
+- Available in Python, TypeScript and Go
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
-###
+### In other languages
 
 - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
 - [Scenario Go](https://github.com/langwatch/scenario-go/)
 
 ## Example
 
+This is how a simple simulation with tool check looks like with Scenario:
+
 ```python
-
-
-
-# Integrate with your agent
-class WeatherAgent(scenario.AgentAdapter):
-    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
-        return weather_agent(input.messages)
+# Define any custom assertions
+def check_for_weather_tool_call(state: scenario.ScenarioState):
+    assert state.has_tool_call("get_current_weather")
 
-
-
-    assert state.has_tool_call("get_current_weather")
+result = await scenario.run(
+    name="checking the weather",
 
-    #
-
-
-
-
-        and is wondering what the weather will be like.
-    """,
-    agents=[
-        WeatherAgent(),
-        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
-    ],
-    script=[
-        scenario.user(),
-        scenario.agent(),
-        check_for_weather_tool_call, # check for tool call after the first agent response
-        scenario.succeed(),
-    ],
-)
+    # Define the prompt to guide the simulation
+    description="""
+        The user is planning a boat trip from Barcelona to Rome,
+        and is wondering what the weather will be like.
+    """,
 
-    #
-
+    # Define the agents that will play this simulation
+    agents=[
+        WeatherAgent(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4.1-mini"),
+    ],
+
+    # (Optional) Control the simulation
+    script=[
+        scenario.user(), # let the user simulator generate a user message
+        scenario.agent(), # agent responds
+        check_for_weather_tool_call, # check for tool call after the first agent response
+        scenario.succeed(), # simulation ends successfully
+    ],
+)
+
+assert result.success
 ```
 
 > [!NOTE]
->
-
-Check out more examples in the [examples folder](./examples/).
+> Check out full examples in the [examples folder](./examples/).
 
 ## Getting Started
 
@@ -193,17 +192,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py
 
 This is how it will look like:
 
-[](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)
 
 You can find the same code example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
 
-##
+## Simulation on Autopilot
 
-By providing a User Simulator Agent and a description of the Scenario, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.
+By providing a User Simulator Agent and a description of the Scenario without a script, the simulated user will automatically generate messages to the agent until the scenario is successful or the maximum number of turns is reached.
 
 You can then use a Judge Agent to evaluate the scenario in real-time given certain criteria, at every turn, the Judge Agent will decide if it should let the simulation proceed or end it with a verdict.
 
-
+For example, here is a scenario that tests a vibe coding assistant:
 
 ```python
 result = await scenario.run(
@@ -233,6 +232,8 @@ result = await scenario.run(
 
 Check out the fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
 
+You can also combine it with a partial script too! By for example controlling only the beginning of the conversation, and let the rest proceed on autopilot, see the next section.
+
 ## Full Control of the Conversation
 
 You can specify a script for guiding the scenario by passing a list of steps to the `script` field, those steps are simply arbitrary functions that take the current state of the scenario as an argument, so you can do things like:
@@ -250,35 +251,35 @@ Everything is possible, using the same simple structure:
 ```python
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def
-
-    name="
+async def test_early_assumption_bias():
+    result = await scenario.run(
+        name="early assumption bias",
         description="""
             The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
         """,
-
-
-
-
+        agents=[
+            Agent(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "user should get good recommendations on river crossing",
+                    "agent should NOT keep following up about ATM recommendation after user has corrected them that they are actually just hiking",
+                ],
+            ),
         ],
-        max_turns=
-
-
-
-        assert state.has_tool_call("web_search")
-
-    result = await scenario.script(
-        [
-            # Define existing history of messages
+        max_turns=10,
+        script=[
+            # Define hardcoded messages
+            scenario.agent("Hello, how can I help you today?"),
             scenario.user("how do I safely approach a bank?"),
 
-            # Or let it be
+            # Or let it be generated automatically
            scenario.agent(),
 
            # Add custom assertions, for example making sure a tool was called
            check_if_tool_was_called,
 
-            #
+            # Generate a user follow-up message
            scenario.user(),
 
            # Let the simulation proceed for 2 more turns, print at every turn
@@ -289,8 +290,8 @@ async def test_ai_assistant_agent():
 
             # Time to make a judgment call
             scenario.judge(),
-        ]
-    )
+        ],
+    )
 
     assert result.success
 ```
@@ -302,7 +303,7 @@ You can enable debug mode by setting the `debug` field to `True` in the `Scenari
 Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.
 
 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", debug=True)
 ```
 
 or
@@ -316,16 +317,17 @@ pytest -s tests/test_vegetarian_recipe_agent.py --debug
 Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:
 
 ```python
-
+scenario.configure(default_model="openai/gpt-4.1-mini", cache_key="42")
 ```
 
 To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.
 
-To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@
+To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application side with the `@scenario.cache` decorator:
 
 ```python
+# Inside your actual agent implementation
 class MyAgent:
-    @
+    @scenario.cache()
     def invoke(self, message, context):
         return client.chat.completions.create(
             # ...
@@ -358,6 +360,26 @@ async def test_user_is_very_hungry():
 
 Those two scenarios should now run in parallel.
 
+## Events System
+
+Scenario automatically publishes events during execution for monitoring and observability. You can enable event reporting by setting environment variables:
+
+```bash
+# Enable automatic event reporting
+export LANGWATCH_ENDPOINT="https://api.langwatch.ai"
+export LANGWATCH_API_KEY="your-api-key"
+```
+
+With these variables set, Scenario will automatically:
+
+- Publish events when scenarios start, finish, and when messages are added
+- Handle retries and error handling automatically
+- Process events asynchronously without blocking your tests
+
+The events include timing information, conversation history, and success/failure metrics for analysis.
+
+For advanced customization, see the event classes in the codebase for detailed documentation.
+
 ## License
 
 MIT License
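The "Events System" section added above only names the two environment variables. As a minimal, hypothetical sketch (only `LANGWATCH_ENDPOINT` and `LANGWATCH_API_KEY` come from the README text; the test-session wiring around them is illustrative, not part of the package), the same configuration can be applied from Python before any scenario runs:

```python
# Hypothetical sketch: set the environment variables documented in the
# "Events System" README section so Scenario's automatic event reporting
# is active for every scenario executed in this test process.
import os

os.environ.setdefault("LANGWATCH_ENDPOINT", "https://api.langwatch.ai")
os.environ.setdefault("LANGWATCH_API_KEY", "your-api-key")  # placeholder, use a real key
```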
langwatch_scenario-0.6.0.dist-info/RECORD
ADDED
@@ -0,0 +1,27 @@
+scenario/__init__.py,sha256=UJ5l-sG4TMG0wR8Ba-dxdDW36m3apTvawP-lNvk7Jm0,4293
+scenario/_error_messages.py,sha256=6lEx3jBGMbPx0kG0eX5zoZE-ENVM3O_ZkIbVMlnidYs,3892
+scenario/agent_adapter.py,sha256=PoY2KQqYuqzIIb3-nhIU-MPXwHJc1vmwdweMy7ut-hk,4255
+scenario/cache.py,sha256=J6s6Sia_Ce6TrnsInlhfxm6SF8tygo3sH-_cQCRX1WA,6213
+scenario/config.py,sha256=xhUuXH-sThwPTmJNSuajKxX-WC_tcFwJ1jZc119DswA,6093
+scenario/judge_agent.py,sha256=9CCO699qoWqXvWdQ73Yc3dqPOwaJdJ-zqxVaLaKi_cA,16161
+scenario/pytest_plugin.py,sha256=f2ETBpATz80k7K87M6046ZIFiQpHEvDN7dxakd3y2wk,11321
+scenario/scenario_executor.py,sha256=nkSIuIlwPHfr6pueSBbARrgiqPtW0SxajV3PFypAnJ4,34508
+scenario/scenario_state.py,sha256=dQDjazem-dn1c5mw6TwngEu6Tv_cHwEzemepsPBy2f0,7039
+scenario/script.py,sha256=A0N5pP0l4FFn1xdKc78U_wkwWhEWH3EFeU_LRDtNyEI,12241
+scenario/types.py,sha256=BhXcTEMGyGg_1QysN-GXVjm8DP2VH3UEzj_qvoglp2k,9466
+scenario/user_simulator_agent.py,sha256=fhwi8W44s343BGrjJXSJw960wcK7MgwTg-epxR1bqHo,9088
+scenario/_utils/__init__.py,sha256=wNX9hU8vzYlyLDwjkt7JUW3IPo2DhME6UIt_zvLM3B0,1000
+scenario/_utils/ids.py,sha256=K1iPuJgPh3gX9HCrDZGqK5lDgdwZXfOBF1YXVOWNHRg,1843
+scenario/_utils/message_conversion.py,sha256=AM9DLyWpy97CrAH8RmId9Mv2rmLquQhFoUpRyp-jVeY,3622
+scenario/_utils/utils.py,sha256=msQgUWaLh3U9jIIHmxkEbOaklga63AF0KJzsaKa_mZc,14008
+scenario/events/__init__.py,sha256=_autF1cMZYpNXE-kJNvvRb-H_hYqy4gOSSp2fT3Wi9k,1533
+scenario/events/event_bus.py,sha256=MThIMIaI2nj2CoegZazTNxeHbtl4_M7bW3vEAHz6R8g,7102
+scenario/events/event_reporter.py,sha256=cMh_5jA5hG3Q9IsoAgPJhxnIVs_M1Q0e2lgLTEK4oPc,3100
+scenario/events/events.py,sha256=jPXylwiADb0Bdk7u1YkAaU_jLebH7NW8J7SZI9JDTxw,6750
+scenario/events/messages.py,sha256=1QAkwDExdF6AHgXdEFhHwmCv3Mxu3j0AXIptMekc_bg,3299
+scenario/events/utils.py,sha256=yrTUTByeb0eAAQniQH7EyKs-usgGti8f17IemUyBZBw,3357
+langwatch_scenario-0.6.0.dist-info/METADATA,sha256=IvD9on4tP57ldmizFzfGQBtiCT6Z7yoz0trlCSPSW9M,14227
+langwatch_scenario-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+langwatch_scenario-0.6.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.6.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.6.0.dist-info/RECORD,,
scenario/__init__.py
CHANGED
@@ -7,15 +7,21 @@ happy paths and edge cases by simulating user interactions and evaluating agent
 against configurable success criteria.
 
 Key Features:
+
 - End-to-end conversation testing with specified scenarios
+
 - Flexible control from fully scripted to completely automated simulations
+
 - Multi-turn evaluation designed for complex conversational agents
+
 - Works with any testing framework (pytest, unittest, etc.)
+
 - Framework-agnostic integration with any LLM or agent architecture
+
 - Built-in caching for deterministic and faster test execution
 
 Basic Usage:
-
+
     import scenario
 
     # Configure global settings
@@ -42,10 +48,9 @@ Basic Usage:
     )
 
     assert result.success
-    ```
 
 Advanced Usage:
-
+
     # Script-controlled scenario with custom evaluations
     def check_tool_usage(state: scenario.ScenarioState) -> None:
         assert state.has_tool_call("get_customer_info")
@@ -66,10 +71,9 @@ Advanced Usage:
            scenario.succeed("All requirements met")
        ]
    )
-    ```
 
 Integration with Testing Frameworks:
-
+
    import pytest
 
    @pytest.mark.agent_test
@@ -85,7 +89,6 @@ Integration with Testing Frameworks:
        ]
    )
    assert result.success
-    ```
 
 For more examples and detailed documentation, visit: https://github.com/langwatch/scenario
 """
@@ -104,113 +107,15 @@ from .cache import scenario_cache
 from .script import message, user, agent, judge, proceed, succeed, fail
 
 # Import pytest plugin components
-from .pytest_plugin import pytest_configure, scenario_reporter
+# from .pytest_plugin import pytest_configure, scenario_reporter
 
 run = ScenarioExecutor.run
-"""
-High-level interface for running scenario tests.
-
-This is the main entry point for executing scenario-based agent tests. It creates
-and runs a complete scenario simulation including user interactions, agent responses,
-and success evaluation.
-
-Args:
-    name: Human-readable name for the scenario
-    description: Detailed description that guides the simulation behavior
-    agents: List of agent adapters (agent under test, user simulator, judge)
-    max_turns: Maximum conversation turns before timeout (default: 10)
-    verbose: Show detailed output during execution
-    cache_key: Cache key for deterministic behavior across runs
-    debug: Enable debug mode for step-by-step execution
-    script: Optional script steps to control scenario flow
-
-Returns:
-    ScenarioResult containing test outcome, conversation history, and detailed analysis
-
-Example:
-    ```python
-    result = await scenario.run(
-        name="help request",
-        description="User needs help with a technical problem",
-        agents=[
-            MyAgentAdapter(),
-            scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=["Provides helpful response"])
-        ]
-    )
-
-    print(f"Test {'PASSED' if result.success else 'FAILED'}")
-    print(f"Reasoning: {result.reasoning}")
-    ```
-"""
 
 configure = ScenarioConfig.configure
-"""
-Set global configuration settings for all scenario executions.
-
-This function allows you to configure default behavior that will be applied
-to all scenarios unless explicitly overridden in individual scenario runs.
-
-Args:
-    default_model: Default LLM model identifier for user simulator and judge agents
-    max_turns: Maximum number of conversation turns before timeout (default: 10)
-    verbose: Enable verbose output during scenario execution
-    cache_key: Cache key for deterministic scenario behavior across runs
-    debug: Enable debug mode for step-by-step execution with user intervention
-
-Example:
-    ```python
-    # Set up global defaults
-    scenario.configure(
-        default_model="openai/gpt-4.1-mini",
-        max_turns=15,
-        verbose=True,
-        cache_key="my-test-suite-v1"
-    )
-
-    # All subsequent scenarios will use these defaults
-    result = await scenario.run(...)
-    ```
-"""
 
 default_config = ScenarioConfig.default_config
-"""
-Access to the current global configuration settings.
-
-This provides read-only access to the default configuration that has been
-set via scenario.configure(). Useful for debugging or conditional logic
-based on current settings.
-
-Example:
-    ```python
-    if scenario.default_config and scenario.default_config.debug:
-        print("Debug mode is enabled")
-    ```
-"""
 
 cache = scenario_cache
-"""
-Decorator for caching function calls during scenario execution.
-
-This decorator enables deterministic testing by caching LLM calls and other
-non-deterministic operations based on scenario configuration and function arguments.
-Results are cached when a cache_key is configured, making tests repeatable and faster.
-
-Args:
-    ignore: List of argument names to exclude from cache key computation
-
-Example:
-    ```python
-    class MyAgent:
-        @scenario.cache(ignore=["self"])
-        def invoke(self, message: str) -> str:
-            # This LLM call will be cached when cache_key is set
-            return llm_client.complete(model="gpt-4", prompt=message)
-
-    # Enable caching for deterministic tests
-    scenario.configure(cache_key="test-suite-v1")
-    ```
-"""
 
 __all__ = [
     # Functions
@@ -218,7 +123,6 @@ __all__ = [
     "configure",
     "default_config",
     "cache",
-
     # Script
     "message",
     "proceed",
@@ -227,24 +131,17 @@ __all__ = [
     "judge",
     "agent",
     "user",
-
     # Types
     "ScenarioResult",
     "AgentInput",
     "AgentRole",
     "ScenarioConfig",
     "AgentReturnTypes",
-
     # Classes
     "ScenarioExecutor",
     "ScenarioState",
     "AgentAdapter",
     "UserSimulatorAgent",
     "JudgeAgent",
-
-    # Plugins
-    "pytest_configure",
-    "scenario_reporter",
-    "scenario_cache",
 ]
-__version__ = "0.1.0"
+__version__ = "0.1.0"
scenario/_utils/__init__.py
ADDED
@@ -0,0 +1,32 @@
+"""
+Utility functions for scenario execution and message handling.
+
+This module provides various utility functions used throughout the Scenario framework,
+including message formatting, validation, role reversal, and UI components like spinners
+for better user experience during scenario execution.
+"""
+
+from .message_conversion import convert_agent_return_types_to_openai_messages
+from .ids import get_or_create_batch_run_id, generate_scenario_run_id
+from .utils import (
+    SerializableAndPydanticEncoder,
+    SerializableWithStringFallback,
+    print_openai_messages,
+    show_spinner,
+    check_valid_return_type,
+    reverse_roles,
+    await_if_awaitable,
+)
+
+__all__ = [
+    "convert_agent_return_types_to_openai_messages",
+    "get_or_create_batch_run_id",
+    "generate_scenario_run_id",
+    "SerializableAndPydanticEncoder",
+    "SerializableWithStringFallback",
+    "print_openai_messages",
+    "show_spinner",
+    "check_valid_return_type",
+    "reverse_roles",
+    "await_if_awaitable",
+]
scenario/_utils/ids.py
ADDED
@@ -0,0 +1,58 @@
+"""
+ID generation and management utilities for scenario execution.
+
+This module provides functions for generating and managing unique identifiers
+used throughout the scenario execution pipeline, particularly for batch runs
+and scenario tracking.
+"""
+
+import os
+import uuid
+
+
+def get_or_create_batch_run_id() -> str:
+    """
+    Gets or creates a batch run ID for the current scenario execution.
+
+    The batch run ID is consistent across all scenarios in the same process
+    execution, allowing grouping of related scenario runs. This is useful
+    for tracking and reporting on batches of scenarios run together.
+
+    Returns:
+        str: A unique batch run ID that persists for the process lifetime
+
+    Example:
+        ```python
+        # All scenarios in same process will share this ID
+        batch_id = get_or_create_batch_run_id()
+        print(f"Running scenario in batch: {batch_id}")
+        ```
+    """
+
+    # Check if batch ID already exists in environment
+    if not os.environ.get("SCENARIO_BATCH_ID"):
+        # Generate new batch ID if not set
+        os.environ["SCENARIO_BATCH_ID"] = f"batch-run-{uuid.uuid4()}"
+
+    return os.environ["SCENARIO_BATCH_ID"]
+
+
+def generate_scenario_run_id() -> str:
+    """
+    Generates a unique scenario run ID for a single scenario execution.
+
+    Each scenario run gets a unique identifier that distinguishes it from
+    other runs, even within the same batch. This is used for tracking
+    individual scenario executions and correlating events.
+
+    Returns:
+        str: A unique scenario run ID
+
+    Example:
+        ```python
+        # Each scenario gets its own unique ID
+        scenario_id = generate_scenario_run_id()
+        print(f"Running scenario with ID: {scenario_id}")
+        ```
+    """
+    return f"scenario-run-{uuid.uuid4()}"