langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only and reflects the changes between those versions.
@@ -0,0 +1,414 @@
+"""
+Judge agent module for evaluating scenario conversations.
+
+This module provides the JudgeAgent class, which evaluates ongoing conversations
+between users and agents to determine if success criteria are met. The judge
+makes real-time decisions about whether scenarios should continue or end with
+success/failure verdicts.
+"""
+
+import json
+import logging
+import re
+from typing import List, Optional, cast
+
+from litellm import Choices, completion
+from litellm.files.main import ModelResponse
+
+from scenario.cache import scenario_cache
+from scenario.agent_adapter import AgentAdapter
+from scenario.config import ModelConfig, ScenarioConfig
+
+from ._error_messages import agent_not_configured_error_message
+from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
+
+
+logger = logging.getLogger("scenario")
+
+
+class JudgeAgent(AgentAdapter):
+    """
+    Agent that evaluates conversations against success criteria.
+
+    The JudgeAgent watches conversations in real-time and makes decisions about
+    whether the agent under test is meeting the specified criteria. It can either
+    allow the conversation to continue or end it with a success/failure verdict.
+
+    The judge uses function calling to make structured decisions and provides
+    detailed reasoning for its verdicts. It evaluates each criterion independently
+    and provides comprehensive feedback about what worked and what didn't.
+
+    Attributes:
+        role: Always AgentRole.JUDGE for judge agents
+        model: LLM model identifier to use for evaluation
+        api_key: Optional API key for the model provider
+        temperature: Sampling temperature for evaluation consistency
+        max_tokens: Maximum tokens for judge reasoning
+        criteria: List of success criteria to evaluate against
+        system_prompt: Custom system prompt to override default judge behavior
+
+    Example:
+        ```
+        import scenario
+
+        # Basic judge agent with criteria
+        judge = scenario.JudgeAgent(
+            criteria=[
+                "Agent provides helpful responses",
+                "Agent asks relevant follow-up questions",
+                "Agent does not provide harmful information"
+            ]
+        )
+
+        # Customized judge with specific model and behavior
+        strict_judge = scenario.JudgeAgent(
+            model="openai/gpt-4.1-mini",
+            criteria=[
+                "Code examples are syntactically correct",
+                "Explanations are technically accurate",
+                "Security best practices are mentioned"
+            ],
+            temperature=0.0,  # More deterministic evaluation
+            system_prompt="You are a strict technical reviewer evaluating code quality."
+        )
+
+        # Use in scenario
+        result = await scenario.run(
+            name="coding assistant test",
+            description="User asks for help with Python functions",
+            agents=[
+                coding_agent,
+                scenario.UserSimulatorAgent(),
+                judge
+            ]
+        )
+
+        print(f"Passed criteria: {result.passed_criteria}")
+        print(f"Failed criteria: {result.failed_criteria}")
+        ```
+
+    Note:
+        - Judge agents evaluate conversations continuously, not just at the end
+        - They can end scenarios early if clear success/failure conditions are met
+        - Provide detailed reasoning for their decisions
+        - Support both positive criteria (things that should happen) and negative criteria (things that shouldn't)
+    """
+    role = AgentRole.JUDGE
+
+    model: str
+    api_key: Optional[str]
+    temperature: float
+    max_tokens: Optional[int]
+    criteria: List[str]
+    system_prompt: Optional[str]
+
+    def __init__(
+        self,
+        *,
+        criteria: Optional[List[str]] = None,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        """
+        Initialize a judge agent with evaluation criteria.
+
+        Args:
+            criteria: List of success criteria to evaluate the conversation against.
+                Can include both positive requirements ("Agent provides helpful responses")
+                and negative constraints ("Agent should not provide personal information").
+            model: LLM model identifier (e.g., "openai/gpt-4.1-mini").
+                If not provided, uses the default model from global configuration.
+            api_key: API key for the model provider. If not provided,
+                uses the key from global configuration or environment.
+            temperature: Sampling temperature for evaluation (0.0-1.0).
+                Lower values (0.0-0.2) recommended for consistent evaluation.
+            max_tokens: Maximum number of tokens for judge reasoning and explanations.
+            system_prompt: Custom system prompt to override default judge behavior.
+                Use this to create specialized evaluation perspectives.
+
+        Raises:
+            Exception: If no model is configured either in parameters or global config
+
+        Example:
+            ```
+            # Customer service judge
+            cs_judge = JudgeAgent(
+                criteria=[
+                    "Agent replies with the refund policy",
+                    "Agent offers next steps for the customer",
+                ],
+                temperature=0.1
+            )
+
+            # Technical accuracy judge
+            tech_judge = JudgeAgent(
+                criteria=[
+                    "Agent adds a code review pointing out the code compilation errors",
+                    "Agent adds a code review about the missing security headers"
+                ],
+                system_prompt="You are a senior software engineer reviewing code for production use."
+            )
+            ```
+        """
+        # Override the default system prompt for the judge agent
+        self.criteria = criteria or []
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.system_prompt = system_prompt
+
+        if model:
+            self.model = model
+
+        if ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, str
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model
+        elif ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, ModelConfig
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model.model
+            self.api_key = (
+                api_key or ScenarioConfig.default_config.default_model.api_key
+            )
+            self.temperature = (
+                temperature or ScenarioConfig.default_config.default_model.temperature
+            )
+            self.max_tokens = (
+                max_tokens or ScenarioConfig.default_config.default_model.max_tokens
+            )
+
+        if not hasattr(self, "model"):
+            raise Exception(agent_not_configured_error_message("TestingAgent"))
+
+    @scenario_cache()
+    async def call(
+        self,
+        input: AgentInput,
+    ) -> AgentReturnTypes:
+        """
+        Evaluate the current conversation state against the configured criteria.
+
+        This method analyzes the conversation history and determines whether the
+        scenario should continue or end with a verdict. It uses function calling
+        to make structured decisions and provides detailed reasoning.
+
+        Args:
+            input: AgentInput containing conversation history and scenario context
+
+        Returns:
+            AgentReturnTypes: Either an empty list (continue scenario) or a
+                ScenarioResult (end scenario with verdict)
+
+        Raises:
+            Exception: If the judge cannot make a valid decision or if there's an
+                error in the evaluation process
+
+        Note:
+            - Returns empty list [] to continue the scenario
+            - Returns ScenarioResult to end with success/failure
+            - Provides detailed reasoning for all decisions
+            - Evaluates each criterion independently
+            - Can end scenarios early if clear violation or success is detected
+        """
+
+        scenario = input.scenario_state
+
+        messages = [
+            {
+                "role": "system",
+                "content": self.system_prompt
+                or f"""
+<role>
+You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
+</role>
+
+<goal>
+Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
+If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
+</goal>
+
+<scenario>
+{scenario.description}
+</scenario>
+
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(self.criteria)])}
+</criteria>
+
+<rules>
+- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
+- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgment if necessary
+</rules>
+""",
+            },
+            *input.messages,
+        ]
+
+        is_last_message = (
+            input.scenario_state.current_turn == input.scenario_state.config.max_turns
+        )
+
+        if is_last_message:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": """
+System:
+
+<finish_test>
+This is the last message, conversation has reached the maximum number of turns, give your final verdict,
+if you don't have enough information to make a verdict, say inconclusive with max turns reached.
+</finish_test>
+""",
+                }
+            )
+
+        # Define the tools
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in self.criteria
+        ]
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "continue_test",
+                    "description": "Continue the test with the next step",
+                    "strict": True,
+                    "parameters": {
+                        "type": "object",
+                        "properties": {},
+                        "required": [],
+                        "additionalProperties": False,
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "finish_test",
+                    "description": "Complete the test with a final verdict",
+                    "strict": True,
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "criteria": {
+                                "type": "object",
+                                "properties": {
+                                    criteria_names[idx]: {
+                                        "enum": [True, False, "inconclusive"],
+                                        "description": criterion,
+                                    }
+                                    for idx, criterion in enumerate(self.criteria)
+                                },
+                                "required": criteria_names,
+                                "additionalProperties": False,
+                                "description": "Strict verdict for each criterion",
+                            },
+                            "reasoning": {
+                                "type": "string",
+                                "description": "Explanation of what the final verdict should be",
+                            },
+                            "verdict": {
+                                "type": "string",
+                                "enum": ["success", "failure", "inconclusive"],
+                                "description": "The final verdict of the test",
+                            },
+                        },
+                        "required": ["criteria", "reasoning", "verdict"],
+                        "additionalProperties": False,
+                    },
+                },
+            },
+        ]
+
+        enforce_judgment = input.judgment_request
+        has_criteria = len(self.criteria) > 0
+
+        if enforce_judgment and not has_criteria:
+            return ScenarioResult(
+                success=False,
+                messages=[],
+                reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+            )
+
+        response = cast(
+            ModelResponse,
+            completion(
+                model=self.model,
+                messages=messages,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                tools=tools,
+                tool_choice=(
+                    {"type": "function", "function": {"name": "finish_test"}}
+                    if (is_last_message or enforce_judgment) and has_criteria
+                    else "required"
+                ),
+            ),
+        )
+
+        # Extract the content from the response
+        if hasattr(response, "choices") and len(response.choices) > 0:
+            message = cast(Choices, response.choices[0]).message
+
+            # Check if the LLM chose to use the tool
+            if message.tool_calls:
+                tool_call = message.tool_calls[0]
+                if tool_call.function.name == "continue_test":
+                    return []
+
+                if tool_call.function.name == "finish_test":
+                    # Parse the tool call arguments
+                    try:
+                        args = json.loads(tool_call.function.arguments)
+                        verdict = args.get("verdict", "inconclusive")
+                        reasoning = args.get("reasoning", "No reasoning provided")
+                        criteria = args.get("criteria", {})
+
+                        passed_criteria = [
+                            self.criteria[idx]
+                            for idx, criterion in enumerate(criteria.values())
+                            if criterion == True
+                        ]
+                        failed_criteria = [
+                            self.criteria[idx]
+                            for idx, criterion in enumerate(criteria.values())
+                            if criterion == False
+                        ]
+
+                        # Return the appropriate ScenarioResult based on the verdict
+                        return ScenarioResult(
+                            success=verdict == "success" and len(failed_criteria) == 0,
+                            messages=messages,
+                            reasoning=reasoning,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
+                        )
+                    except json.JSONDecodeError:
+                        raise Exception(
+                            f"Failed to parse tool call arguments from judge agent: {tool_call.function.arguments}"
+                        )
+
+                else:
+                    raise Exception(
+                        f"Invalid tool call from judge agent: {tool_call.function.name}"
+                    )
+
+            else:
+                raise Exception(
+                    f"Invalid response from judge agent, tool calls not found: {message.__repr__()}"
+                )
+
+        else:
+            raise Exception(
+                f"Unexpected response format from LLM: {response.__repr__()}"
+            )
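The new JudgeAgent exposes its criteria to the LLM as a strict `finish_test` tool schema keyed by sanitized criterion names. The following standalone sketch (illustrative only, not part of the package) reproduces the naming step from the `call` method above, so the schema keys are easier to predict:

```python
# Sketch of JudgeAgent's criterion-name sanitization (mirrors the re.sub logic above).
import re

criteria = [
    "Agent provides helpful responses",
    "Agent does not provide harmful information",
]

criteria_names = [
    re.sub(
        r"[^a-zA-Z0-9]",
        "_",
        criterion.replace(" ", "_").replace("'", "").lower(),
    )[:70]  # truncated to 70 characters, as in the module above
    for criterion in criteria
]

print(criteria_names)
# ['agent_provides_helpful_responses', 'agent_does_not_provide_harmful_information']
```

Each sanitized name becomes a required property of the `finish_test` tool's `criteria` object, with allowed values `True`, `False`, or `"inconclusive"`.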
scenario/pytest_plugin.py CHANGED
@@ -1,5 +1,10 @@
 """
 Pytest plugin for Scenario testing library.
+
+This module provides pytest integration for the Scenario framework, including
+automatic test reporting, debug mode support, and collection of scenario
+results across test runs. It enables seamless integration with existing
+pytest-based testing workflows.
 """
 
 import pytest
@@ -7,27 +12,73 @@ from typing import TypedDict
 import functools
 from termcolor import colored
 
+from scenario.config import ScenarioConfig
 from scenario.types import ScenarioResult
 
-from .scenario import Scenario
+from .scenario_executor import ScenarioExecutor
 
 
 class ScenarioReporterResults(TypedDict):
-    scenario: Scenario
+    """
+    Type definition for scenario test results stored by the reporter.
+
+    Attributes:
+        scenario: The ScenarioExecutor instance that ran the test
+        result: The ScenarioResult containing test outcome and details
+    """
+
+    scenario: ScenarioExecutor
     result: ScenarioResult
 
 
 # ScenarioReporter class definition moved outside the fixture for global use
 class ScenarioReporter:
+    """
+    Collects and reports on scenario test results across a pytest session.
+
+    This class automatically collects results from all scenario tests run during
+    a pytest session and provides comprehensive reporting including success rates,
+    timing information, and detailed failure analysis.
+
+    The reporter is automatically instantiated by the pytest plugin and collects
+    results from all scenario.run() calls without requiring explicit user setup.
+
+    Attributes:
+        results: List of all scenario test results collected during the session
+    """
+
     def __init__(self):
+        """Initialize an empty scenario reporter."""
         self.results: list[ScenarioReporterResults] = []
 
-    def add_result(self, scenario, result):
-        """Add a test result to the reporter."""
+    def add_result(self, scenario: ScenarioExecutor, result: ScenarioResult):
+        """
+        Add a test result to the reporter.
+
+        This method is called automatically by the pytest plugin whenever
+        a scenario.run() call completes. It stores both the scenario
+        configuration and the test result for later reporting.
+
+        Args:
+            scenario: The ScenarioExecutor instance that ran the test
+            result: The ScenarioResult containing test outcome and details
+        """
         self.results.append({"scenario": scenario, "result": result})
 
     def get_summary(self):
-        """Get a summary of all test results."""
+        """
+        Get a summary of all test results.
+
+        Calculates aggregate statistics across all scenario tests that
+        have been run during the current pytest session.
+
+        Returns:
+            Dictionary containing summary statistics:
+            - total: Total number of scenarios run
+            - passed: Number of scenarios that passed
+            - failed: Number of scenarios that failed
+            - success_rate: Percentage of scenarios that passed (0-100)
+        """
         total = len(self.results)
         passed = sum(1 for r in self.results if r["result"].success)
         failed = total - passed
@@ -40,7 +91,36 @@ class ScenarioReporter:
         }
 
     def print_report(self):
-        """Print a detailed report of all test results."""
+        """
+        Print a detailed report of all test results.
+
+        Outputs a comprehensive report to the console showing:
+        - Overall summary statistics
+        - Individual scenario results with success/failure status
+        - Detailed reasoning for each scenario outcome
+        - Timing information when available
+        - Criteria pass/fail breakdown for judge-evaluated scenarios
+
+        The report is automatically printed at the end of pytest sessions,
+        but can also be called manually for intermediate reporting.
+
+        Example output:
+        ```
+        === Scenario Test Report ===
+        Total Scenarios: 5
+        Passed: 4
+        Failed: 1
+        Success Rate: 80%
+
+        1. weather query test - PASSED in 2.34s (agent: 1.12s)
+           Reasoning: Agent successfully provided weather information
+           Passed Criteria: 2/2
+
+        2. complex math problem - FAILED in 5.67s (agent: 3.45s)
+           Reasoning: Agent provided incorrect calculation
+           Failed Criteria: 1
+        ```
+        """
         if not self.results:
             return  # Skip report if no results
 
@@ -94,7 +174,9 @@ class ScenarioReporter:
 
             if hasattr(result, "passed_criteria") and result.passed_criteria:
                 criteria_count = len(result.passed_criteria)
-                total_criteria = len(scenario.criteria)
+                total_criteria = len(result.passed_criteria) + len(
+                    result.failed_criteria
+                )
                 criteria_color = (
                     "green" if criteria_count == total_criteria else "yellow"
                 )
@@ -115,12 +197,40 @@ class ScenarioReporter:
 
 
 # Store the original run method
-original_run = Scenario.run
+original_run = ScenarioExecutor._run
 
 
 @pytest.hookimpl(trylast=True)
 def pytest_configure(config):
-    """Register the agent_test marker and set up automatic reporting."""
+    """
+    Configure pytest integration for Scenario testing.
+
+    This hook is called when pytest starts and sets up:
+    - Registration of the @pytest.mark.agent_test marker
+    - Debug mode configuration from command line arguments
+    - Global scenario reporter for collecting results
+    - Automatic result collection from all scenario.run() calls
+
+    Args:
+        config: pytest configuration object
+
+    Note:
+        This function runs automatically when pytest loads the plugin.
+        Users don't need to call it directly.
+
+    Debug Mode:
+        When --debug is passed to pytest, enables step-by-step scenario
+        execution with user intervention capabilities.
+
+    Example:
+        ```bash
+        # Enable debug mode for all scenarios
+        pytest tests/ --debug -s
+
+        # Run normally
+        pytest tests/
+        ```
+    """
     # Register the marker
     config.addinivalue_line(
         "markers", "agent_test: mark test as an agent scenario test"
@@ -128,7 +238,7 @@ def pytest_configure(config):
 
     if config.getoption("--debug"):
         print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
-        Scenario.configure(verbose=True, debug=True)
+        ScenarioConfig.configure(verbose=True, debug=True)
 
     # Create a global reporter instance
     config._scenario_reporter = ScenarioReporter()
@@ -149,27 +259,80 @@ def pytest_configure(config):
         return result
 
     # Apply the patch
-    Scenario.run = auto_reporting_run
+    ScenarioExecutor._run = auto_reporting_run
 
 
 @pytest.hookimpl(trylast=True)
 def pytest_unconfigure(config):
-    """Clean up and print final report when pytest exits."""
+    """
+    Clean up pytest integration when pytest exits.
+
+    This hook is called when pytest is shutting down and:
+    - Prints the final scenario test report
+    - Restores the original ScenarioExecutor._run method
+    - Cleans up any remaining resources
+
+    Args:
+        config: pytest configuration object
+
+    Note:
+        This function runs automatically when pytest exits.
+        Users don't need to call it directly.
+    """
     # Print the final report
     if hasattr(config, "_scenario_reporter"):
         config._scenario_reporter.print_report()
 
     # Restore the original method
-    Scenario.run = original_run
+    ScenarioExecutor._run = original_run
 
 
 @pytest.fixture
 def scenario_reporter(request):
     """
-    A pytest fixture for accessing the global scenario reporter.
+    Pytest fixture for accessing the global scenario reporter.
 
     This fixture provides access to the same reporter that's used for automatic
     reporting, allowing tests to explicitly interact with the reporter if needed.
+
+    Args:
+        request: pytest request object containing test context
+
+    Yields:
+        ScenarioReporter: The global reporter instance collecting all scenario results
+
+    Example:
+        ```
+        @pytest.mark.agent_test
+        def test_with_custom_reporting(scenario_reporter):
+            # Run your scenarios
+            result1 = await scenario.run(
+                name="test 1",
+                description="First test",
+                agents=[agent, user_sim, judge]
+            )
+
+            result2 = await scenario.run(
+                name="test 2",
+                description="Second test",
+                agents=[agent, user_sim, judge]
+            )
+
+            # Access collected results
+            assert len(scenario_reporter.results) == 2
+
+            # Check success rate
+            summary = scenario_reporter.get_summary()
+            assert summary['success_rate'] >= 90
+
+            # Print intermediate report
+            if summary['failed'] > 0:
+                scenario_reporter.print_report()
+        ```
+
+    Note:
+        The reporter automatically collects results from all scenario.run() calls,
+        so you don't need to manually add results unless you're doing custom reporting.
     """
     # Get the global reporter from pytest config
     reporter = request.config._scenario_reporter
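The auto-reporting hook the plugin installs is only partially visible in the hunks above: the body of `auto_reporting_run` falls outside the diff context, and only its `return result` and the patch assignment appear. A rough conftest-style sketch of the pattern follows; the wrapper internals and the absolute import path are assumptions based on the hunks, not the package's exact code:

```python
# Sketch of the auto-reporting monkeypatch pattern; illustrative only.
import functools

import pytest

from scenario.scenario_executor import ScenarioExecutor  # assumed absolute form of the diff's relative import

# Store the original run method, as the plugin does at module level
original_run = ScenarioExecutor._run


@pytest.hookimpl(trylast=True)
def pytest_configure(config):
    config._scenario_reporter = []  # stand-in for the plugin's ScenarioReporter

    @functools.wraps(original_run)
    async def auto_reporting_run(self, *args, **kwargs):
        # Assumes ScenarioExecutor._run is a coroutine, as the awaited
        # scenario.run(...) examples above suggest.
        result = await original_run(self, *args, **kwargs)
        config._scenario_reporter.append({"scenario": self, "result": result})
        return result

    # Apply the patch for the duration of the session
    ScenarioExecutor._run = auto_reporting_run


@pytest.hookimpl(trylast=True)
def pytest_unconfigure(config):
    # Restore the original method when pytest exits
    ScenarioExecutor._run = original_run
```

Storing `original_run` up front and restoring it in `pytest_unconfigure` keeps the monkeypatch scoped to the pytest session, which matches what the hunks above show for `ScenarioExecutor._run`.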