langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,205 @@
1
+ """
2
+ Scenario state management module.
3
+
4
+ This module provides the ScenarioState class which tracks the current state
5
+ of a scenario execution, including conversation history, turn tracking, and
6
+ utility methods for inspecting the conversation.
7
+ """
8
+
9
+ from typing import List, Dict, Any, Optional, TYPE_CHECKING
10
+ from openai.types.chat import (
11
+ ChatCompletionMessageParam,
12
+ ChatCompletionMessageToolCallParam,
13
+ ChatCompletionUserMessageParam,
14
+ )
15
+ from pydantic import BaseModel
16
+
17
+ from scenario.config import ScenarioConfig
18
+
19
+ if TYPE_CHECKING:
20
+ from .scenario_executor import ScenarioExecutor
21
+
22
+
23
class ScenarioState(BaseModel):
    """
    Represents the current state of a scenario execution.

    Provides access to the conversation history, turn information, and
    utility methods for inspecting messages and tool calls. Instances are
    passed to script step functions and are available through
    ``AgentInput.scenario_state``.

    Attributes:
        description: The scenario description that guides the simulation
        messages: Complete conversation history as OpenAI-compatible messages
        thread_id: Unique identifier for this conversation thread
        current_turn: Current turn number in the conversation
        config: Configuration settings for this scenario execution

    Example:
        ```
        def check_agent_behavior(state: ScenarioState) -> None:
            # Check if the agent called a specific tool
            if state.has_tool_call("get_weather"):
                print("Agent successfully called weather tool")

            # Get the last user message
            last_user = state.last_user_message()
            print(f"User said: {last_user['content']}")

            # Check conversation length
            if len(state.messages) > 10:
                print("Conversation is getting long")

        # Use in scenario script
        result = await scenario.run(
            name="tool usage test",
            description="Test that agent uses the correct tools",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent provides helpful response"])
            ],
            script=[
                scenario.user("What's the weather like?"),
                scenario.agent(),
                check_agent_behavior,  # Custom inspection function
                scenario.succeed()
            ]
        )
        ```
    """

    description: str
    messages: List[ChatCompletionMessageParam]
    thread_id: str
    current_turn: int
    config: ScenarioConfig

    # Pydantic private attribute: back-reference to the executor that owns
    # this state; used to delegate message handling.
    _executor: "ScenarioExecutor"

    def add_message(self, message: ChatCompletionMessageParam):
        """
        Add a message to the conversation history.

        Delegates to the scenario executor so that message broadcasting and
        state updates are handled consistently.

        Args:
            message: OpenAI-compatible message to add to the conversation

        Example:
            ```
            def inject_system_message(state: ScenarioState) -> None:
                state.add_message({
                    "role": "system",
                    "content": "The user is now in a hurry"
                })
            ```
        """
        self._executor.add_message(message)

    def last_message(self) -> ChatCompletionMessageParam:
        """
        Get the most recent message in the conversation.

        Returns:
            The last message in the conversation history

        Raises:
            ValueError: If no messages exist in the conversation

        Example:
            ```
            def check_last_response(state: ScenarioState) -> None:
                last = state.last_message()
                if last["role"] == "assistant":
                    content = last.get("content", "")
                    assert "helpful" in content.lower()
            ```
        """
        if len(self.messages) == 0:
            raise ValueError("No messages found")
        return self.messages[-1]

    def last_user_message(self) -> ChatCompletionUserMessageParam:
        """
        Get the most recent user message in the conversation.

        Returns:
            The last user message in the conversation history

        Raises:
            ValueError: If no user messages exist in the conversation

        Example:
            ```
            def analyze_user_intent(state: ScenarioState) -> None:
                user_msg = state.last_user_message()
                content = user_msg["content"]

                if isinstance(content, str):
                    if "urgent" in content.lower():
                        print("User expressed urgency")
            ```
        """
        user_messages = [m for m in self.messages if m["role"] == "user"]
        if not user_messages:
            raise ValueError("No user messages found")
        return user_messages[-1]

    def last_tool_call(
        self, tool_name: str
    ) -> Optional[ChatCompletionMessageToolCallParam]:
        """
        Find the most recent call to a specific tool in the conversation.

        Searches through the conversation history in reverse order to find
        the last time the specified tool was called by an assistant.

        Args:
            tool_name: Name of the tool to search for

        Returns:
            The tool call object if found, None otherwise

        Example:
            ```
            def verify_weather_call(state: ScenarioState) -> None:
                weather_call = state.last_tool_call("get_current_weather")
                if weather_call:
                    args = json.loads(weather_call["function"]["arguments"])
                    assert "location" in args
            ```
        """
        for message in reversed(self.messages):
            if message["role"] != "assistant":
                continue
            # Guard against "tool_calls" being present but None (some
            # serializations emit it explicitly as null); the previous
            # `"tool_calls" in message` check would then iterate over None
            # and raise TypeError.
            for tool_call in message.get("tool_calls") or []:
                if tool_call["function"]["name"] == tool_name:
                    return tool_call
        return None

    def has_tool_call(self, tool_name: str) -> bool:
        """
        Check if a specific tool has been called in the conversation.

        Convenience wrapper around :meth:`last_tool_call` that returns True
        if the specified tool has been called at any point in the
        conversation.

        Args:
            tool_name: Name of the tool to check for

        Returns:
            True if the tool has been called, False otherwise

        Example:
            ```
            def ensure_tool_usage(state: ScenarioState) -> None:
                # Verify the agent used required tools
                assert state.has_tool_call("search_database")

                # Check it didn't use forbidden tools
                assert not state.has_tool_call("delete_data")
            ```
        """
        return self.last_tool_call(tool_name) is not None
scenario/script.py ADDED
@@ -0,0 +1,361 @@
1
+ """
2
+ Scenario script DSL (Domain Specific Language) module.
3
+
4
+ This module provides a collection of functions that form a declarative language
5
+ for controlling scenario execution flow. These functions can be used to create
6
+ scripts that precisely control how conversations unfold, when evaluations occur,
7
+ and when scenarios should succeed or fail.
8
+ """
9
+
10
+ from typing import Awaitable, Callable, Optional, Union, TYPE_CHECKING
11
+
12
+ from .types import ScriptStep
13
+
14
+ from openai.types.chat import ChatCompletionMessageParam
15
+
16
+ if TYPE_CHECKING:
17
+ from scenario.scenario_state import ScenarioState
18
+
19
+
20
def message(message: ChatCompletionMessageParam) -> ScriptStep:
    """
    Add a specific message to the conversation.

    Injects any OpenAI-compatible message verbatim into the conversation at
    this point in the script — useful for simulating tool responses, system
    messages, or other exact conversational states.

    Args:
        message: OpenAI-compatible message to add to the conversation

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        result = await scenario.run(
            name="tool response test",
            description="Testing tool call responses",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent uses weather tool correctly"])
            ],
            script=[
                scenario.user("What's the weather?"),
                scenario.agent(),  # Agent calls weather tool
                scenario.message({
                    "role": "tool",
                    "tool_call_id": "call_123",
                    "content": json.dumps({"temperature": "75°F", "condition": "sunny"})
                }),
                scenario.agent(),  # Agent processes tool response
                scenario.succeed()
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        # Delegate to the executor so broadcasting/state updates are uniform.
        return state._executor.message(message)

    return step
59
+
60
+
61
def user(
    content: Optional[Union[str, ChatCompletionMessageParam]] = None,
) -> ScriptStep:
    """
    Generate or specify a user message in the conversation.

    With explicit content, that content becomes the user message. With no
    content, the user simulator agent generates an appropriate message from
    the scenario context.

    Args:
        content: Optional user message content. Can be a string or full message dict.
            If None, the user simulator will generate content automatically.

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        result = await scenario.run(
            name="user interaction test",
            description="Testing specific user inputs",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent responds helpfully to user"])
            ],
            script=[
                # Specific user message
                scenario.user("I need help with Python"),
                scenario.agent(),

                # Auto-generated user message based on scenario context
                scenario.user(),
                scenario.agent(),

                # Structured user message with multimodal content
                scenario.message({
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {"type": "image_url", "image_url": {"url": "data:image/..."}}
                    ]
                }),
                scenario.succeed()
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.user(content)

    return step
111
+
112
+
113
def agent(
    content: Optional[Union[str, ChatCompletionMessageParam]] = None,
) -> ScriptStep:
    """
    Generate or specify an agent response in the conversation.

    With explicit content, that content becomes the agent response. With no
    content, the agent under test is invoked to produce its response from the
    current conversation state.

    Args:
        content: Optional agent response content. Can be a string or full message dict.
            If None, the agent under test will generate content automatically.

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        result = await scenario.run(
            name="agent response test",
            description="Testing agent responses",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent provides appropriate responses"])
            ],
            script=[
                scenario.user("Hello"),

                # Let agent generate its own response
                scenario.agent(),

                # Or specify exact agent response for testing edge cases
                scenario.agent("I'm sorry, I'm currently unavailable"),
                scenario.user(),  # See how user simulator reacts

                # Structured agent response with tool calls
                scenario.message({
                    "role": "assistant",
                    "content": "Let me search for that information",
                    "tool_calls": [{"id": "call_123", "type": "function", ...}]
                }),
                scenario.succeed()
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.agent(content)

    return step
162
+
163
+
164
def judge(
    content: Optional[Union[str, ChatCompletionMessageParam]] = None,
) -> ScriptStep:
    """
    Invoke the judge agent to evaluate the current conversation state.

    Forces the judge to decide, based on its configured criteria, whether the
    scenario should continue or end with a success/failure verdict.

    Args:
        content: Optional message content for the judge. Usually None to let
            the judge evaluate based on its criteria.

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        result = await scenario.run(
            name="judge evaluation test",
            description="Testing judge at specific points",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent provides coding help effectively"])
            ],
            script=[
                scenario.user("Can you help me code?"),
                scenario.agent(),

                # Force judge evaluation after first exchange
                scenario.judge(),  # May continue or end scenario

                # If scenario continues...
                scenario.user(),
                scenario.agent(),
                scenario.judge(),  # Final evaluation
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.judge(content)

    return step
207
+
208
+
209
def proceed(
    turns: Optional[int] = None,
    on_turn: Optional[
        Union[
            Callable[["ScenarioState"], None],
            Callable[["ScenarioState"], Awaitable[None]],
        ]
    ] = None,
    on_step: Optional[
        Union[
            Callable[["ScenarioState"], None],
            Callable[["ScenarioState"], Awaitable[None]],
        ]
    ] = None,
) -> ScriptStep:
    """
    Let the scenario proceed automatically for a specified number of turns.

    Runs the normal interaction flow (user -> agent -> judge evaluation)
    without explicit scripting. Optional callbacks allow custom logic at the
    end of each turn or after each agent interaction.

    Args:
        turns: Number of turns to proceed automatically. If None, proceeds until
            the judge agent decides to end the scenario or max_turns is reached.
        on_turn: Optional callback function called at the end of each turn
        on_step: Optional callback function called after each agent interaction

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        def log_progress(state: ScenarioState) -> None:
            print(f"Turn {state.current_turn}: {len(state.messages)} messages")

        def check_tool_usage(state: ScenarioState) -> None:
            if state.has_tool_call("dangerous_action"):
                raise AssertionError("Agent used forbidden tool!")

        result = await scenario.run(
            name="automatic proceeding test",
            description="Let scenario run with monitoring",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent behaves safely and helpfully"])
            ],
            script=[
                scenario.user("Let's start"),
                scenario.agent(),

                # Let it proceed for 3 turns with monitoring
                scenario.proceed(
                    turns=3,
                    on_turn=log_progress,
                    on_step=check_tool_usage
                ),

                # Then do final evaluation
                scenario.judge()
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.proceed(turns, on_turn, on_step)

    return step
275
+
276
+
277
def succeed(reasoning: Optional[str] = None) -> ScriptStep:
    """
    Immediately end the scenario with a success result.

    Terminates scenario execution and marks it successful, skipping any
    remaining agent interactions or judge evaluations.

    Args:
        reasoning: Optional explanation for why the scenario succeeded

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        def custom_success_check(state: ScenarioState) -> None:
            last_msg = state.last_message()
            if "solution" in last_msg.get("content", "").lower():
                # Custom success condition met
                return scenario.succeed("Agent provided a solution")()

        result = await scenario.run(
            name="custom success test",
            description="Test custom success conditions",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent provides a solution"])
            ],
            script=[
                scenario.user("I need a solution"),
                scenario.agent(),
                custom_success_check,

                # Or explicit success
                scenario.succeed("Agent completed the task successfully")
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.succeed(reasoning)

    return step
318
+
319
+
320
def fail(reasoning: Optional[str] = None) -> ScriptStep:
    """
    Immediately end the scenario with a failure result.

    Terminates scenario execution and marks it failed, skipping any remaining
    agent interactions or judge evaluations.

    Args:
        reasoning: Optional explanation for why the scenario failed

    Returns:
        ScriptStep function that can be used in scenario scripts

    Example:
        ```
        def safety_check(state: ScenarioState) -> None:
            last_msg = state.last_message()
            content = last_msg.get("content", "")

            if "harmful" in content.lower():
                return scenario.fail("Agent produced harmful content")()

        result = await scenario.run(
            name="safety check test",
            description="Test safety boundaries",
            agents=[
                my_agent,
                scenario.UserSimulatorAgent(),
                scenario.JudgeAgent(criteria=["Agent maintains safety guidelines"])
            ],
            script=[
                scenario.user("Tell me something dangerous"),
                scenario.agent(),
                safety_check,

                # Or explicit failure
                scenario.fail("Agent failed to meet safety requirements")
            ]
        )
        ```
    """

    def step(state: "ScenarioState"):
        return state._executor.fail(reasoning)

    return step