langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
+ """
+ Judge agent module for evaluating scenario conversations.
+
+ This module provides the JudgeAgent class, which evaluates ongoing conversations
+ between users and agents to determine if success criteria are met. The judge
+ makes real-time decisions about whether scenarios should continue or end with
+ success/failure verdicts.
+ """
+
+ import json
+ import logging
+ import re
+ from typing import List, Optional, cast
+
+ from litellm import Choices, completion
+ from litellm.files.main import ModelResponse
+
+ from scenario.cache import scenario_cache
+ from scenario.agent_adapter import AgentAdapter
+ from scenario.config import ModelConfig, ScenarioConfig
+
+ from .error_messages import agent_not_configured_error_message
+ from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
+
+
+ logger = logging.getLogger("scenario")
+
+
+ class JudgeAgent(AgentAdapter):
+     """
+     Agent that evaluates conversations against success criteria.
+
+     The JudgeAgent watches conversations in real-time and makes decisions about
+     whether the agent under test is meeting the specified criteria. It can either
+     allow the conversation to continue or end it with a success/failure verdict.
+
+     The judge uses function calling to make structured decisions and provides
+     detailed reasoning for its verdicts. It evaluates each criterion independently
+     and provides comprehensive feedback about what worked and what didn't.
+
+     Attributes:
+         role: Always AgentRole.JUDGE for judge agents
+         model: LLM model identifier to use for evaluation
+         api_key: Optional API key for the model provider
+         temperature: Sampling temperature for evaluation consistency
+         max_tokens: Maximum tokens for judge reasoning
+         criteria: List of success criteria to evaluate against
+         system_prompt: Custom system prompt to override default judge behavior
+
+     Example:
+         ```python
+         import scenario
+
+         # Basic judge agent with criteria
+         judge = scenario.JudgeAgent(
+             criteria=[
+                 "Agent provides helpful responses",
+                 "Agent asks relevant follow-up questions",
+                 "Agent does not provide harmful information"
+             ]
+         )
+
+         # Customized judge with specific model and behavior
+         strict_judge = scenario.JudgeAgent(
+             model="openai/gpt-4.1-mini",
+             criteria=[
+                 "Code examples are syntactically correct",
+                 "Explanations are technically accurate",
+                 "Security best practices are mentioned"
+             ],
+             temperature=0.0,  # More deterministic evaluation
+             system_prompt="You are a strict technical reviewer evaluating code quality."
+         )
+
+         # Use in scenario
+         result = await scenario.run(
+             name="coding assistant test",
+             description="User asks for help with Python functions",
+             agents=[
+                 coding_agent,
+                 scenario.UserSimulatorAgent(),
+                 judge
+             ]
+         )
+
+         print(f"Passed criteria: {result.passed_criteria}")
+         print(f"Failed criteria: {result.failed_criteria}")
+         ```
+
+     Note:
+         - Judge agents evaluate conversations continuously, not just at the end
+         - They can end scenarios early if clear success/failure conditions are met
+         - They provide detailed reasoning for their decisions
+         - They support both positive criteria (things that should happen) and negative criteria (things that shouldn't)
+     """
+     role = AgentRole.JUDGE
+
+     model: str
+     api_key: Optional[str]
+     temperature: float
+     max_tokens: Optional[int]
+     criteria: List[str]
+     system_prompt: Optional[str]
+
+     def __init__(
+         self,
+         *,
+         criteria: Optional[List[str]] = None,
+         model: Optional[str] = None,
+         api_key: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: Optional[int] = None,
+         system_prompt: Optional[str] = None,
+     ):
+         """
+         Initialize a judge agent with evaluation criteria.
+
+         Args:
+             criteria: List of success criteria to evaluate the conversation against.
+                 Can include both positive requirements ("Agent provides helpful responses")
+                 and negative constraints ("Agent should not provide personal information").
+             model: LLM model identifier (e.g., "openai/gpt-4.1-mini").
+                 If not provided, uses the default model from global configuration.
+             api_key: API key for the model provider. If not provided,
+                 uses the key from global configuration or environment.
+             temperature: Sampling temperature for evaluation (0.0-1.0).
+                 Lower values (0.0-0.2) recommended for consistent evaluation.
+             max_tokens: Maximum number of tokens for judge reasoning and explanations.
+             system_prompt: Custom system prompt to override default judge behavior.
+                 Use this to create specialized evaluation perspectives.
+
+         Raises:
+             Exception: If no model is configured either in parameters or global config
+
+         Example:
+             ```python
+             # Customer service judge
+             cs_judge = JudgeAgent(
+                 criteria=[
+                     "Agent is polite and professional",
+                     "Agent addresses the customer's specific concern",
+                     "Agent offers appropriate solutions or next steps",
+                     "Agent does not make promises the company cannot keep"
+                 ],
+                 temperature=0.1
+             )
+
+             # Technical accuracy judge
+             tech_judge = JudgeAgent(
+                 criteria=[
+                     "Code examples compile without errors",
+                     "Security vulnerabilities are not introduced",
+                     "Best practices are recommended"
+                 ],
+                 system_prompt="You are a senior software engineer reviewing code for production use."
+             )
+             ```
+         """
+         # Store the explicitly provided configuration
+         self.criteria = criteria or []
+         self.api_key = api_key
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+         self.system_prompt = system_prompt
+
+         if model:
+             self.model = model
+
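+         # Fall back to the global ScenarioConfig defaults (a plain model name string
+         # or a full ModelConfig) for anything not passed to the constructor.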
+         if ScenarioConfig.default_config is not None and isinstance(
+             ScenarioConfig.default_config.default_model, str
+         ):
+             self.model = model or ScenarioConfig.default_config.default_model
+         elif ScenarioConfig.default_config is not None and isinstance(
+             ScenarioConfig.default_config.default_model, ModelConfig
+         ):
+             self.model = model or ScenarioConfig.default_config.default_model.model
+             self.api_key = (
+                 api_key or ScenarioConfig.default_config.default_model.api_key
+             )
+             self.temperature = (
+                 temperature or ScenarioConfig.default_config.default_model.temperature
+             )
+             self.max_tokens = (
+                 max_tokens or ScenarioConfig.default_config.default_model.max_tokens
+             )
+
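+         # If neither the constructor nor the global config provided a model, fail fast.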
+         if not hasattr(self, "model"):
+             raise Exception(agent_not_configured_error_message("TestingAgent"))
+
+     @scenario_cache()
+     async def call(
+         self,
+         input: AgentInput,
+     ) -> AgentReturnTypes:
+         """
+         Evaluate the current conversation state against the configured criteria.
+
+         This method analyzes the conversation history and determines whether the
+         scenario should continue or end with a verdict. It uses function calling
+         to make structured decisions and provides detailed reasoning.
+
+         Args:
+             input: AgentInput containing conversation history and scenario context
+
+         Returns:
+             AgentReturnTypes: Either an empty list (continue scenario) or a
+             ScenarioResult (end scenario with verdict)
+
+         Raises:
+             Exception: If the judge cannot make a valid decision or if there's an
+                 error in the evaluation process
+
+         Example:
+             The judge evaluates conversations like this:
+
+             ```
+             Conversation so far:
+             User: "I need help with authentication"
+             Agent: "I can help! What authentication method are you using?"
+             User: "JWT tokens"
+             Agent: "Here's how to implement JWT securely: [detailed code example]"
+
+             Judge evaluation:
+             - ✓ Agent provides helpful responses
+             - ✓ Agent asks relevant follow-up questions
+             - ✓ Security best practices are mentioned
+
+             Decision: CONTINUE (all criteria being met so far)
+             ```
+
+         Note:
+             - Returns empty list [] to continue the scenario
+             - Returns ScenarioResult to end with success/failure
+             - Provides detailed reasoning for all decisions
+             - Evaluates each criterion independently
+             - Can end scenarios early if clear violation or success is detected
+         """
+
+         scenario = input.scenario_state
+
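+         # Compose the judge's instructions: the custom system prompt if given, otherwise
+         # a default prompt built from the scenario description and criteria, followed by
+         # the conversation so far.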
+         messages = [
+             {
+                 "role": "system",
+                 "content": self.system_prompt
+                 or f"""
+ <role>
+ You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
+ </role>
+
+ <goal>
+ Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
+ If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
+ </goal>
+
+ <scenario>
+ {scenario.description}
+ </scenario>
+
+ <criteria>
+ {"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(self.criteria)])}
+ </criteria>
+
+ <rules>
+ - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
+ - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgment if necessary
+ </rules>
+ """,
+             },
+             *input.messages,
+         ]
+
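+         # Once the conversation has reached the configured turn limit, the judge must
+         # deliver a verdict rather than let the scenario continue.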
+         is_last_message = (
+             input.scenario_state.current_turn == input.scenario_state.config.max_turns
+         )
+
+         if is_last_message:
+             messages.append(
+                 {
+                     "role": "user",
+                     "content": """
+ System:
+
+ <finish_test>
+ This is the last message, conversation has reached the maximum number of turns, give your final verdict,
+ if you don't have enough information to make a verdict, say inconclusive with max turns reached.
+ </finish_test>
+ """,
+                 }
+             )
+
+         # Define the tools
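+         # Each criterion becomes a schema-safe property name for the finish_test schema,
+         # e.g. "Agent provides helpful responses" -> "agent_provides_helpful_responses".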
+         criteria_names = [
+             re.sub(
+                 r"[^a-zA-Z0-9]",
+                 "_",
+                 criterion.replace(" ", "_").replace("'", "").lower(),
+             )[:70]
+             for criterion in self.criteria
+         ]
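+         # Two tools are exposed: continue_test takes no arguments, while finish_test is
+         # expected to return arguments shaped roughly like:
+         #   {"criteria": {"agent_provides_helpful_responses": true, ...},
+         #    "reasoning": "...", "verdict": "success"}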
+         tools = [
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "continue_test",
+                     "description": "Continue the test with the next step",
+                     "strict": True,
+                     "parameters": {
+                         "type": "object",
+                         "properties": {},
+                         "required": [],
+                         "additionalProperties": False,
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "finish_test",
+                     "description": "Complete the test with a final verdict",
+                     "strict": True,
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "criteria": {
+                                 "type": "object",
+                                 "properties": {
+                                     criteria_names[idx]: {
+                                         "enum": [True, False, "inconclusive"],
+                                         "description": criterion,
+                                     }
+                                     for idx, criterion in enumerate(self.criteria)
+                                 },
+                                 "required": criteria_names,
+                                 "additionalProperties": False,
+                                 "description": "Strict verdict for each criterion",
+                             },
+                             "reasoning": {
+                                 "type": "string",
+                                 "description": "Explanation of what the final verdict should be",
+                             },
+                             "verdict": {
+                                 "type": "string",
+                                 "enum": ["success", "failure", "inconclusive"],
+                                 "description": "The final verdict of the test",
+                             },
+                         },
+                         "required": ["criteria", "reasoning", "verdict"],
+                         "additionalProperties": False,
+                     },
+                 },
+             },
+         ]
+
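+         # An explicit judgment request with no criteria configured cannot be evaluated,
+         # so fail the scenario immediately.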
+         enforce_judgment = input.judgment_request
+         has_criteria = len(self.criteria) > 0
+
+         if enforce_judgment and not has_criteria:
+             return ScenarioResult(
+                 success=False,
+                 messages=[],
+                 reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+             )
+
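+         # Force a finish_test call when a verdict is needed (final turn or an explicit
+         # judgment request, with criteria to judge against); otherwise the model must
+         # still call one of the two tools.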
+         response = cast(
+             ModelResponse,
+             completion(
+                 model=self.model,
+                 messages=messages,
+                 temperature=self.temperature,
+                 max_tokens=self.max_tokens,
+                 tools=tools,
+                 tool_choice=(
+                     {"type": "function", "function": {"name": "finish_test"}}
+                     if (is_last_message or enforce_judgment) and has_criteria
+                     else "required"
+                 ),
+             ),
+         )
+
+         # Extract the content from the response
+         if hasattr(response, "choices") and len(response.choices) > 0:
+             message = cast(Choices, response.choices[0]).message
+
+             # Check if the LLM chose to use the tool
+             if message.tool_calls:
+                 tool_call = message.tool_calls[0]
+                 if tool_call.function.name == "continue_test":
+                     return []
+
+                 if tool_call.function.name == "finish_test":
+                     # Parse the tool call arguments
+                     try:
+                         args = json.loads(tool_call.function.arguments)
+                         verdict = args.get("verdict", "inconclusive")
+                         reasoning = args.get("reasoning", "No reasoning provided")
+                         criteria = args.get("criteria", {})
+
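+                         # Map each per-criterion verdict back to the original criterion
+                         # text; "inconclusive" counts as neither passed nor failed.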
+                         passed_criteria = [
+                             self.criteria[idx]
+                             for idx, criterion in enumerate(criteria.values())
+                             if criterion == True
+                         ]
+                         failed_criteria = [
+                             self.criteria[idx]
+                             for idx, criterion in enumerate(criteria.values())
+                             if criterion == False
+                         ]
+
+                         # Return the appropriate ScenarioResult based on the verdict
+                         return ScenarioResult(
+                             success=verdict == "success" and len(failed_criteria) == 0,
+                             messages=messages,
+                             reasoning=reasoning,
+                             passed_criteria=passed_criteria,
+                             failed_criteria=failed_criteria,
+                         )
+                     except json.JSONDecodeError:
+                         raise Exception(
+                             f"Failed to parse tool call arguments from judge agent: {tool_call.function.arguments}"
+                         )
+
+                 else:
+                     raise Exception(
+                         f"Invalid tool call from judge agent: {tool_call.function.name}"
+                     )
+
+             else:
+                 raise Exception(
+                     f"Invalid response from judge agent, tool calls not found: {message.__repr__()}"
+                 )
+
+         else:
+             raise Exception(
+                 f"Unexpected response format from LLM: {response.__repr__()}"
+             )