langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scenario/types.py CHANGED
@@ -6,8 +6,6 @@ from typing import (
6
6
  Any,
7
7
  Awaitable,
8
8
  Callable,
9
- Coroutine,
10
- Dict,
11
9
  List,
12
10
  Optional,
13
11
  Union,
@@ -17,29 +15,85 @@ from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMess
17
15
 
18
16
  # Prevent circular imports + Pydantic breaking
19
17
  if TYPE_CHECKING:
20
- from scenario.scenario_executor import ScenarioExecutor
18
+ from scenario.scenario_executor import ScenarioState
21
19
 
22
- ScenarioExecutorType = ScenarioExecutor
20
+ ScenarioStateType = ScenarioState
23
21
  else:
24
- ScenarioExecutorType = Any
22
+ ScenarioStateType = Any
25
23
 
26
24
 
27
- class ScenarioAgentRole(Enum):
25
class AgentRole(Enum):
    """
    Role an agent takes on within a scenario run.

    Each agent taking part in a scenario is tagged with one of these roles so
    that it can be told apart from the other participants.

    Members:
        USER: The user simulator, which produces the user side of the conversation.
        AGENT: The agent under test, which answers the user's inputs.
        JUDGE: The judge, which evaluates the conversation and decides success or failure.

    The member values are the capitalized display names ("User", "Agent", "Judge").
    """

    USER = "User"
    AGENT = "Agent"
    JUDGE = "Judge"
31
41
 
32
42
 
33
43
  class AgentInput(BaseModel):
44
+ """
45
+ Input data structure passed to agent adapters during scenario execution.
46
+
47
+ This class encapsulates all the information an agent needs to generate its next response,
48
+ including conversation history, thread context, and scenario state. It provides convenient
49
+ methods to access the most recent user messages.
50
+
51
+ Attributes:
52
+ thread_id: Unique identifier for the conversation thread
53
+ messages: Complete conversation history as OpenAI-compatible messages
54
+ new_messages: Only the new messages since the agent's last call
55
+ judgment_request: Whether this call is requesting a judgment from a judge agent
56
+ scenario_state: Current state of the scenario execution
57
+
58
+ Example:
59
+ ```
60
+ class MyAgent(AgentAdapter):
61
+ async def call(self, input: AgentInput) -> str:
62
+ # Get the latest user message
63
+ user_msg = input.last_new_user_message_str()
64
+
65
+ # Process with your LLM/agent
66
+ response = await my_llm.complete(
67
+ messages=input.messages,
68
+ prompt=user_msg
69
+ )
70
+
71
+ return response
72
+ ```
73
+ """
34
74
  thread_id: str
35
75
  # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
36
76
  messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
37
77
  new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
38
- context: Dict[str, Any]
39
- requested_role: ScenarioAgentRole
40
- scenario_state: ScenarioExecutorType = Field(exclude=True)
78
+ judgment_request: bool = False
79
+ scenario_state: ScenarioStateType
41
80
 
42
81
  def last_new_user_message(self) -> ChatCompletionUserMessageParam:
82
+ """
83
+ Get the most recent user message from the new messages.
84
+
85
+ Returns:
86
+ The last user message in OpenAI message format
87
+
88
+ Raises:
89
+ ValueError: If no new user messages are found
90
+
91
+ Example:
92
+ ```
93
+ user_message = input.last_new_user_message()
94
+ content = user_message["content"]
95
+ ```
96
+ """
43
97
  user_messages = [m for m in self.new_messages if m["role"] == "user"]
44
98
  if not user_messages:
45
99
  raise ValueError(
@@ -48,6 +102,24 @@ class AgentInput(BaseModel):
48
102
  return user_messages[-1]
49
103
 
50
104
  def last_new_user_message_str(self) -> str:
105
+ """
106
+ Get the content of the most recent user message as a string.
107
+
108
+ This is a convenience method for getting simple text content from user messages.
109
+ For multimodal messages or complex content, use last_new_user_message() instead.
110
+
111
+ Returns:
112
+ The text content of the last user message
113
+
114
+ Raises:
115
+ ValueError: If no new user messages found or if the message content is not a string
116
+
117
+ Example:
118
+ ```
119
+ user_text = input.last_new_user_message_str()
120
+ response = f"You said: {user_text}"
121
+ ```
122
+ """
51
123
  content = self.last_new_user_message()["content"]
52
124
  if type(content) != str:
53
125
  raise ValueError(
@@ -58,14 +130,41 @@ class AgentInput(BaseModel):
58
130
 
59
131
  class ScenarioResult(BaseModel):
60
132
  """
61
- Represents the results of a scenario test run.
133
+ Represents the final result of a scenario test execution.
134
+
135
+ This class contains all the information about how a scenario performed,
136
+ including whether it succeeded, the conversation that took place, and
137
+ detailed reasoning about which criteria were met or failed.
62
138
 
63
139
  Attributes:
64
- success: Whether the scenario passed
65
- conversation: The conversation history
66
- reasoning: Reasoning for the result
67
- passed_criteria: List of criteria that were met
68
- failed_criteria: List of criteria that were not met
140
+ success: Whether the scenario passed all criteria and completed successfully
141
+ messages: Complete conversation history that occurred during the scenario
142
+ reasoning: Detailed explanation of why the scenario succeeded or failed
143
+ passed_criteria: List of success criteria that were satisfied
144
+ failed_criteria: List of success criteria that were not satisfied
145
+ total_time: Total execution time in seconds (if measured)
146
+ agent_time: Time spent in agent calls in seconds (if measured)
147
+
148
+ Example:
149
+ ```
150
+ result = await scenario.run(
151
+ name="weather query",
152
+ description="User asks about weather",
153
+ agents=[
154
+ weather_agent,
155
+ scenario.UserSimulatorAgent(),
156
+ scenario.JudgeAgent(criteria=["Agent provides helpful weather information"])
157
+ ]
158
+ )
159
+
160
+ print(f"Test {'PASSED' if result.success else 'FAILED'}")
161
+ print(f"Reasoning: {result.reasoning}")
162
+
163
+ if not result.success:
164
+ print("Failed criteria:")
165
+ for criteria in result.failed_criteria:
166
+ print(f" - {criteria}")
167
+ ```
69
168
  """
70
169
 
71
170
  success: bool
@@ -77,7 +176,12 @@ class ScenarioResult(BaseModel):
77
176
  agent_time: Optional[float] = None
78
177
 
79
178
  def __repr__(self) -> str:
80
- """Provide a concise representation for debugging."""
179
+ """
180
+ Provide a concise representation for debugging and logging.
181
+
182
+ Returns:
183
+ A string representation showing success status and reasoning
184
+ """
81
185
  status = "PASSED" if self.success else "FAILED"
82
186
  return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
83
187
 
@@ -85,12 +189,85 @@ class ScenarioResult(BaseModel):
85
189
AgentReturnTypes = Union[
    str,
    ChatCompletionMessageParam,
    List[ChatCompletionMessageParam],
    ScenarioResult,
]
"""
Every type an agent adapter's ``call`` method is allowed to return.

- str: Plain text response

- ChatCompletionMessageParam: A single OpenAI-compatible message

- List[ChatCompletionMessageParam]: Several OpenAI-compatible messages (for multi-step responses)

- ScenarioResult: A final test result (typically returned by judge agents to end the scenario)

Example:
    ```
    class MyAgent(AgentAdapter):
        async def call(self, input: AgentInput) -> AgentReturnTypes:
            # Any ONE of the following is a valid return value.

            # A simple string
            return "Hello, how can I help you?"

            # A structured message
            return {"role": "assistant", "content": "Hello!"}

            # Multiple messages for complex interactions
            return [
                {"role": "assistant", "content": "Let me search for that..."},
                {"role": "assistant", "content": "Here's what I found: ..."}
            ]
    ```
"""
88
222
 
89
223
  # TODO: remove the optional ScenarioResult return type from here, use events instead
90
224
  ScriptStep = Union[
91
- Callable[["ScenarioExecutor"], None],
92
- Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
225
+ Callable[["ScenarioState"], None],
226
+ Callable[["ScenarioState"], Optional[ScenarioResult]],
93
227
  # Async as well
94
- Callable[["ScenarioExecutor"], Awaitable[None]],
95
- Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
228
+ Callable[["ScenarioState"], Awaitable[None]],
229
+ Callable[["ScenarioState"], Awaitable[Optional[ScenarioResult]]],
96
230
  ]
231
+ """
232
+ Union type for script step functions used in scenario scripts.
233
+
234
+ Script steps are functions that can be called during scenario execution to control
235
+ the flow, add custom assertions, or perform evaluations. They receive the current
236
+ scenario state and can optionally return a result to end the scenario.
237
+
238
+ The functions can be either synchronous or asynchronous.
239
+
240
+ Example:
241
+ ```
242
+ def check_tool_call(state: ScenarioState) -> None:
243
+ assert state.has_tool_call("get_weather")
244
+
245
+ async def custom_evaluation(state: ScenarioState) -> Optional[ScenarioResult]:
246
+ eval_result = await some_external_evaluator(state.messages)
247
+ if not eval_result.passed:
248
+ return ScenarioResult(
249
+ success=False,
250
+ messages=state.messages,
251
+ reasoning="Custom evaluation failed"
252
+ )
253
+ return None # Continue scenario
254
+
255
+ # Use in script
256
+ result = await scenario.run(
257
+ name="test",
258
+ description="Test scenario",
259
+ agents=[
260
+ MyAgent(),
261
+ scenario.UserSimulatorAgent(),
262
+ scenario.JudgeAgent(criteria=["Agent provides helpful response"])
263
+ ],
264
+ script=[
265
+ scenario.user("What's the weather?"),
266
+ scenario.agent(),
267
+ check_tool_call,
268
+ custom_evaluation,
269
+ scenario.succeed()
270
+ ]
271
+ )
272
+ ```
273
+ """
@@ -0,0 +1,242 @@
1
+ """
2
+ User simulator agent module for generating realistic user interactions.
3
+
4
+ This module provides the UserSimulatorAgent class, which simulates human user
5
+ behavior in conversations with agents under test. The simulator generates
6
+ contextually appropriate user messages based on the scenario description and
7
+ conversation history.
8
+ """
9
+
10
+ import logging
11
+ from typing import Optional, cast
12
+
13
+ from litellm import Choices, completion
14
+ from litellm.files.main import ModelResponse
15
+
16
+ from scenario.cache import scenario_cache
17
+ from scenario.agent_adapter import AgentAdapter
18
+ from scenario._utils.utils import reverse_roles
19
+ from scenario.config import ModelConfig, ScenarioConfig
20
+
21
+ from ._error_messages import agent_not_configured_error_message
22
+ from .types import AgentInput, AgentReturnTypes, AgentRole
23
+
24
+
25
+ logger = logging.getLogger("scenario")
26
+
27
+
28
class UserSimulatorAgent(AgentAdapter):
    """
    Agent that simulates realistic user behavior in scenario conversations.

    The simulator asks an LLM to write the next user message, given the
    scenario description and the conversation so far, so that the agent under
    test can be exercised without a human in the loop.

    Attributes:
        role: Always AgentRole.USER for user simulator agents
        model: LLM model identifier used to generate user messages
        api_key: Optional API key for the model provider
        temperature: Sampling temperature for message generation
        max_tokens: Maximum tokens to generate in user messages
        system_prompt: Custom system prompt that replaces the built-in user
            simulation prompt when set

    Example:
        ```
        import scenario

        # Basic user simulator with default behavior
        user_sim = scenario.UserSimulatorAgent(
            model="openai/gpt-4.1-mini"
        )

        # Customized user simulator
        custom_user_sim = scenario.UserSimulatorAgent(
            model="openai/gpt-4.1-mini",
            temperature=0.3,
            system_prompt="You are a technical user who asks detailed questions"
        )

        # Use in scenario
        result = await scenario.run(
            name="user interaction test",
            description="User seeks help with Python programming",
            agents=[
                my_programming_agent,
                user_sim,
                scenario.JudgeAgent(criteria=["Provides helpful code examples"])
            ]
        )
        ```

    Note:
        - The built-in prompt asks for short, casual, human-like messages
          (lowercase, brief, etc.)
        - The built-in prompt embeds the scenario description to keep the
          simulator on topic
        - The simulator only produces user inputs; it does not act as an assistant
    """

    role = AgentRole.USER

    model: str
    api_key: Optional[str]
    temperature: float
    max_tokens: Optional[int]
    system_prompt: Optional[str]

    def __init__(
        self,
        *,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        system_prompt: Optional[str] = None,
    ):
        """
        Initialize a user simulator agent.

        Explicit arguments take precedence; anything left unset falls back to
        ``ScenarioConfig.default_config.default_model`` when one is configured.

        Args:
            model: LLM model identifier (e.g., "openai/gpt-4.1-mini").
                If not provided, uses the default model from global configuration.
            api_key: API key for the model provider. If not provided, uses the
                key from the global ModelConfig (if any).
            temperature: Sampling temperature for message generation (0.0-1.0).
                Lower values make responses more deterministic.
            max_tokens: Maximum number of tokens to generate in user messages.
            system_prompt: Custom system prompt to override default user simulation
                behavior. Use this to create specialized user personas or behaviors.

        Raises:
            Exception: If no model is configured either in parameters or global config

        Example:
            ```
            # Basic user simulator
            user_sim = UserSimulatorAgent(model="openai/gpt-4.1-mini")

            # User simulator with custom persona
            expert_user = UserSimulatorAgent(
                model="openai/gpt-4.1-mini",
                temperature=0.2,
                system_prompt='''
                    You are an expert software developer testing an AI coding assistant.
                    Ask challenging, technical questions and be demanding about code quality.
                '''
            )
            ```
        """
        # Start from the explicit arguments; a global ModelConfig, if present,
        # fills in whatever was left unset further down.
        self.api_key = api_key
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.system_prompt = system_prompt

        if model:
            self.model = model

        default_config = ScenarioConfig.default_config
        default_model = (
            default_config.default_model if default_config is not None else None
        )

        if isinstance(default_model, str):
            self.model = model or default_model
        elif isinstance(default_model, ModelConfig):
            self.model = model or default_model.model
            self.api_key = api_key or default_model.api_key
            # NOTE(review): `or` treats a falsy argument as "not provided", so
            # an explicit temperature=0.0 (also the default) is replaced by the
            # ModelConfig temperature whenever a ModelConfig is configured.
            self.temperature = temperature or default_model.temperature
            self.max_tokens = max_tokens or default_model.max_tokens

        # `model` is only assigned above when an explicit or global model exists.
        if not hasattr(self, "model"):
            raise Exception(agent_not_configured_error_message("UserSimulatorAgent"))

    @scenario_cache()
    async def call(
        self,
        input: AgentInput,
    ) -> AgentReturnTypes:
        """
        Generate the next user message in the conversation.

        Builds a prompt from the system prompt (custom or built-in), a canned
        assistant greeting and the conversation history, swaps the roles, and
        asks the configured model for a completion.

        Args:
            input: AgentInput containing conversation history and scenario state

        Returns:
            AgentReturnTypes: A ``{"role": "user", "content": ...}`` message
            holding the generated text

        Raises:
            Exception: If the model response has no choices or its first choice
                has no content

        Note:
            - Uses role reversal internally to work around LLM biases toward
              assistant roles
            - Results are cached when cache_key is configured for deterministic testing
        """
        scenario_state = input.scenario_state

        # The built-in prompt is only rendered when no custom prompt was given.
        system_prompt = (
            self.system_prompt
            or f"""
<role>
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
</role>

<goal>
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
</goal>

<scenario>
{scenario_state.description}
</scenario>

<rules>
- DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
</rules>
"""
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "assistant", "content": "Hello, how can I help you today?"},
            *input.messages,
        ]

        # User to assistant role reversal
        # LLMs are biased to always be the assistant, not the user, so we need
        # this reversal; otherwise models like GPT 4.5 get super confused, and
        # Claude 3.7 even starts throwing exceptions.
        messages = reverse_roles(messages)

        # NOTE(review): completion() is called without `await`, so this
        # coroutine does not yield to the event loop while the request runs.
        response = cast(
            ModelResponse,
            completion(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                tools=[],
            ),
        )

        # A missing, None or empty `choices` all count as a malformed response.
        choices = getattr(response, "choices", None)
        if not choices:
            raise Exception(f"Unexpected response format from LLM: {response!r}")

        message_content = cast(Choices, choices[0]).message.content
        if message_content is None:
            raise Exception(f"No response from LLM: {response!r}")

        return {"role": "user", "content": message_content}