evaldeck 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ """Base grader classes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from abc import ABC, abstractmethod
7
+ from typing import TYPE_CHECKING
8
+
9
+ from evaldeck.results import GradeResult, GradeStatus
10
+
11
+ if TYPE_CHECKING:
12
+ from evaldeck.test_case import EvalCase
13
+ from evaldeck.trace import Trace
14
+
15
+
16
class BaseGrader(ABC):
    """Abstract parent for every grader implementation.

    A grader inspects an execution trace against a test case's expected
    behavior and produces a grade result. Both synchronous and asynchronous
    evaluation are supported.

    Async behavior:
    - The default grade_async() delegates the sync grade() to a thread pool
    - Subclasses doing real async I/O (e.g., LLMGrader) override grade_async()
    - When using Evaluator.evaluate_async(), all graders run concurrently

    Creating a custom async grader::

        class MyAPIGrader(BaseGrader):
            name = "my_api"

            def grade(self, trace, test_case):
                # Sync fallback (blocking)
                return requests.post(...).json()

            async def grade_async(self, trace, test_case):
                # Async implementation (non-blocking)
                async with httpx.AsyncClient() as client:
                    response = await client.post(...)
                return GradeResult.from_api(response.json())
    """

    # Identifier reported in results; each subclass overrides this.
    name: str = "base"

    @abstractmethod
    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Evaluate the trace and return a grade result.

        Args:
            trace: The execution trace to evaluate.
            test_case: The test case with expected behavior.

        Returns:
            GradeResult indicating pass/fail and details.
        """

    async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Asynchronous counterpart of :meth:`grade`.

        The default implementation offloads the synchronous grade() to a
        worker thread so the event loop is never blocked. Override this
        method for true async behavior (e.g., async API calls).

        Args:
            trace: The execution trace to evaluate.
            test_case: The test case with expected behavior.

        Returns:
            GradeResult indicating pass/fail and details.
        """
        return await asyncio.to_thread(self.grade, trace, test_case)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
75
+
76
+
77
class CompositeGrader(BaseGrader):
    """A grader that combines multiple graders.

    By default, all graders must pass for the composite to pass; set
    ``require_all=False`` to pass when any single grader passes.
    """

    name = "composite"

    def __init__(
        self,
        graders: list[BaseGrader],
        require_all: bool = True,
    ) -> None:
        """Initialize composite grader.

        Args:
            graders: List of graders to run.
            require_all: If True, all must pass. If False, any can pass.
        """
        self.graders = graders
        self.require_all = require_all

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Run all graders sequentially and combine results.

        Consistency fix: a grader that raises is recorded as an error
        result (exactly as grade_async() already does) instead of letting
        the exception abort the remaining graders.
        """
        results: list[GradeResult] = []
        for grader in self.graders:
            try:
                results.append(grader.grade(trace, test_case))
            except Exception as e:
                results.append(
                    GradeResult.error_result(grader.name, f"Grader error: {e}")
                )

        return self._combine_results(results)

    async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Run all graders concurrently and combine results."""
        tasks = [grader.grade_async(trace, test_case) for grader in self.graders]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Map any raised exception back to its grader as an error result so
        # one failure cannot mask the other graders' outcomes.
        grade_results: list[GradeResult] = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                grade_results.append(
                    GradeResult.error_result(self.graders[i].name, f"Grader error: {result}")
                )
            else:
                grade_results.append(result)

        return self._combine_results(grade_results)

    def _combine_results(self, results: list[GradeResult]) -> GradeResult:
        """Combine multiple grader results into one aggregate result."""
        passed_count = sum(1 for r in results if r.passed)
        total = len(results)

        if self.require_all:
            # All must pass. NOTE(review): an empty grader list counts as a
            # pass here (0 == 0) — confirm that is intended.
            all_passed = passed_count == total
            status = GradeStatus.PASS if all_passed else GradeStatus.FAIL
            message = f"{passed_count}/{total} graders passed"
        else:
            # Any can pass
            any_passed = passed_count > 0
            status = GradeStatus.PASS if any_passed else GradeStatus.FAIL
            message = f"{passed_count}/{total} graders passed (require any)"

        return GradeResult(
            grader_name=self.name,
            status=status,
            message=message,
            details={"results": [r.model_dump() for r in results]},
        )
@@ -0,0 +1,484 @@
1
+ """Code-based graders for deterministic evaluation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import importlib
7
+ import re
8
+ from collections.abc import Callable
9
+ from typing import TYPE_CHECKING
10
+
11
+ from evaldeck.graders.base import BaseGrader
12
+ from evaldeck.results import GradeResult
13
+
14
+ if TYPE_CHECKING:
15
+ from evaldeck.test_case import EvalCase
16
+ from evaldeck.trace import Trace
17
+
18
+
19
class ContainsGrader(BaseGrader):
    """Check if output contains expected values."""

    name = "contains"

    def __init__(
        self,
        values: list[str] | None = None,
        field: str = "output",
        case_sensitive: bool = False,
    ) -> None:
        """Initialize contains grader.

        Args:
            values: Strings that must be present. If None, uses test_case.expected.
            field: Trace field to check ("output" or "reasoning").
            case_sensitive: Whether to do case-sensitive matching.
        """
        self.values = values
        self.field = field
        self.case_sensitive = case_sensitive

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check if all values are present in the selected trace field."""
        # Fall back to the test case's expectations when no explicit values.
        values = self.values
        if values is None:
            values = test_case.expected.output_contains or []

        if not values:
            return GradeResult.passed_result(self.name, "No values to check")

        # Bug fix: honor self.field — previously this always read
        # trace.output even when field="reasoning" was requested.
        raw = getattr(trace, self.field, None)
        content = raw or ""
        if not self.case_sensitive:
            content = content.lower()

        # Collect every expected value that is absent.
        missing = []
        for value in values:
            check_value = value if self.case_sensitive else value.lower()
            if check_value not in content:
                missing.append(value)

        if missing:
            return GradeResult.failed_result(
                self.name,
                f"Missing values in output: {missing}",
                expected=values,
                actual=raw,
            )

        return GradeResult.passed_result(
            self.name,
            f"All {len(values)} values found in output",
        )
75
+
76
+
77
class NotContainsGrader(BaseGrader):
    """Check that output does NOT contain certain values."""

    name = "not_contains"

    def __init__(
        self,
        values: list[str] | None = None,
        field: str = "output",
        case_sensitive: bool = False,
    ) -> None:
        """Initialize not-contains grader.

        Args:
            values: Strings that must be absent. If None, uses
                test_case.expected.output_not_contains.
            field: Trace field to check ("output" or "reasoning").
            case_sensitive: Whether to do case-sensitive matching.
        """
        self.values = values
        self.field = field
        self.case_sensitive = case_sensitive

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check that no forbidden values are present."""
        values = self.values
        if values is None:
            values = test_case.expected.output_not_contains or []

        if not values:
            return GradeResult.passed_result(self.name, "No values to check")

        # Bug fix: honor self.field — previously this always read
        # trace.output even when field="reasoning" was requested.
        raw = getattr(trace, self.field, None)
        content = raw or ""
        if not self.case_sensitive:
            content = content.lower()

        # Collect every forbidden value that is present.
        found = []
        for value in values:
            check_value = value if self.case_sensitive else value.lower()
            if check_value in content:
                found.append(value)

        if found:
            return GradeResult.failed_result(
                self.name,
                f"Forbidden values found in output: {found}",
                expected=f"None of: {values}",
                actual=raw,
            )

        return GradeResult.passed_result(self.name, "No forbidden values found")
120
+
121
+
122
class EqualsGrader(BaseGrader):
    """Check if output exactly equals expected value."""

    name = "equals"

    def __init__(
        self,
        expected: str | None = None,
        field: str = "output",
        normalize_whitespace: bool = True,
    ) -> None:
        """Initialize equals grader.

        Args:
            expected: Exact expected value. If None, uses
                test_case.expected.output_equals.
            field: Trace field to compare ("output" or "reasoning").
            normalize_whitespace: Collapse runs of whitespace before comparing.
        """
        self.expected = expected
        self.field = field
        self.normalize_whitespace = normalize_whitespace

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check exact equality."""
        # Bug fix: explicit None check so an empty-string expectation
        # ("output must be empty") is not silently treated as unset.
        expected = self.expected
        if expected is None:
            expected = test_case.expected.output_equals
        if expected is None:
            return GradeResult.passed_result(self.name, "No expected value to check")

        # Bug fix: honor self.field instead of always reading trace.output.
        actual = getattr(trace, self.field, None) or ""

        if self.normalize_whitespace:
            expected = " ".join(expected.split())
            actual = " ".join(actual.split())

        if actual == expected:
            return GradeResult.passed_result(self.name, "Output matches expected")

        return GradeResult.failed_result(
            self.name,
            "Output does not match expected",
            expected=expected,
            actual=actual,
        )
158
+
159
+
160
class RegexGrader(BaseGrader):
    """Check if output matches a regex pattern."""

    name = "regex"

    def __init__(
        self,
        pattern: str | None = None,
        field: str = "output",
        flags: int = 0,
    ) -> None:
        """Initialize regex grader.

        Args:
            pattern: Regular expression to search for. If None, uses
                test_case.expected.output_matches.
            field: Trace field to check ("output" or "reasoning").
            flags: ``re`` module flags (e.g. ``re.IGNORECASE``).
        """
        self.pattern = pattern
        self.field = field
        self.flags = flags

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check regex match."""
        # Bug fix: explicit None check so an empty pattern (a valid regex
        # that matches anything) is not silently treated as unset.
        pattern = self.pattern
        if pattern is None:
            pattern = test_case.expected.output_matches
        if pattern is None:
            return GradeResult.passed_result(self.name, "No pattern to check")

        # Bug fix: honor self.field instead of always reading trace.output.
        content = getattr(trace, self.field, None) or ""

        try:
            if re.search(pattern, content, self.flags):
                return GradeResult.passed_result(
                    self.name,
                    f"Output matches pattern: {pattern}",
                )
            return GradeResult.failed_result(
                self.name,
                f"Output does not match pattern: {pattern}",
                expected=pattern,
                actual=content,
            )
        except re.error as e:
            # Surface an invalid pattern as an error result, not a crash.
            return GradeResult.error_result(self.name, f"Invalid regex: {e}")
197
+
198
+
199
class ToolCalledGrader(BaseGrader):
    """Check that required tools were called."""

    name = "tool_called"

    def __init__(self, required: list[str] | None = None) -> None:
        """Initialize tool called grader.

        Args:
            required: List of tool names that must be called.
                If None, uses test_case.expected.tools_called.
        """
        self.required = required

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check that all required tools were called."""
        needed = self.required
        if needed is None:
            needed = test_case.expected.tools_called or []

        if not needed:
            return GradeResult.passed_result(self.name, "No required tools to check")

        invoked = set(trace.tools_called)
        missing = set(needed) - invoked

        if not missing:
            return GradeResult.passed_result(
                self.name,
                f"All {len(needed)} required tools were called",
            )

        return GradeResult.failed_result(
            self.name,
            f"Required tools not called: {sorted(missing)}",
            expected=sorted(needed),
            actual=sorted(invoked),
        )
238
+
239
+
240
class ToolNotCalledGrader(BaseGrader):
    """Check that certain tools were NOT called."""

    name = "tool_not_called"

    def __init__(self, forbidden: list[str] | None = None) -> None:
        """Initialize with a list of forbidden tool names.

        Args:
            forbidden: Tools that must not be called. If None, uses
                test_case.expected.tools_not_called.
        """
        self.forbidden = forbidden

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check that forbidden tools were not called."""
        banned = self.forbidden
        if banned is None:
            banned = test_case.expected.tools_not_called or []

        if not banned:
            return GradeResult.passed_result(self.name, "No forbidden tools to check")

        invoked = set(trace.tools_called)
        hits = invoked & set(banned)

        if hits:
            return GradeResult.failed_result(
                self.name,
                f"Forbidden tools were called: {sorted(hits)}",
                expected=f"None of: {sorted(banned)}",
                actual=sorted(invoked),
            )

        return GradeResult.passed_result(self.name, "No forbidden tools were called")
270
+
271
+
272
class ToolOrderGrader(BaseGrader):
    """Check that tools were called in the correct order."""

    name = "tool_order"

    def __init__(self, expected_order: list[str] | None = None) -> None:
        """Initialize with the expected calling order.

        Args:
            expected_order: Ordered tool names. If None, uses
                test_case.expected.tool_call_order.
        """
        self.expected_order = expected_order

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check tool call ordering."""
        wanted = self.expected_order
        if wanted is None:
            wanted = test_case.expected.tool_call_order or []

        if not wanted:
            return GradeResult.passed_result(self.name, "No expected order to check")

        called = trace.tools_called

        # Subsequence test via a shared iterator: each expected tool must
        # appear somewhere after the previously matched one.
        stream = iter(called)
        if all(tool in stream for tool in wanted):
            return GradeResult.passed_result(
                self.name,
                "Tools called in correct order",
            )

        return GradeResult.failed_result(
            self.name,
            "Tools not called in expected order",
            expected=wanted,
            actual=called,
        )
309
+
310
+
311
class MaxStepsGrader(BaseGrader):
    """Check that agent completed within maximum steps."""

    name = "max_steps"

    def __init__(self, max_steps: int | None = None) -> None:
        """Initialize with an optional step budget.

        Args:
            max_steps: Maximum allowed steps. If None, uses
                test_case.expected.max_steps.
        """
        self.max_steps = max_steps

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check step count."""
        limit = self.max_steps
        if limit is None:
            limit = test_case.expected.max_steps

        if limit is None:
            return GradeResult.passed_result(self.name, "No max steps defined")

        taken = trace.step_count

        if taken > limit:
            return GradeResult.failed_result(
                self.name,
                f"Too many steps: {taken} > {limit}",
                expected=limit,
                actual=taken,
            )

        return GradeResult.passed_result(
            self.name,
            f"Completed in {taken} steps (max: {limit})",
        )
342
+
343
+
344
class TaskCompletedGrader(BaseGrader):
    """Check if the agent completed the task (based on trace status)."""

    name = "task_completed"

    def __init__(self, require_success: bool = True) -> None:
        """Initialize completion grader.

        Args:
            require_success: Default expectation when the test case does
                not specify task_completed.
        """
        self.require_success = require_success

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Check task completion status."""
        # Imported here to mirror the module's lazy-import pattern.
        from evaldeck.trace import TraceStatus

        # The test case's expectation wins; fall back to the constructor default.
        expected = test_case.expected.task_completed
        if expected is None:
            expected = self.require_success

        # "Completed" means the trace finished successfully AND produced output.
        completed = trace.status == TraceStatus.SUCCESS and bool(trace.output)

        if expected:
            if completed:
                return GradeResult.passed_result(self.name, "Task completed successfully")
            return GradeResult.failed_result(
                self.name,
                f"Task not completed. Status: {trace.status}, Output: {bool(trace.output)}",
                expected="completed",
                actual=f"status={trace.status}",
            )

        if completed:
            return GradeResult.failed_result(
                self.name,
                "Task completed but was expected to fail",
                expected="not completed",
                actual="completed",
            )

        return GradeResult.passed_result(
            self.name,
            "Task correctly did not complete (as expected)",
        )
387
+
388
+
389
class CustomGrader(BaseGrader):
    """Run a custom grading function.

    Supports both synchronous and asynchronous custom functions. When using
    evaluate_async(), async functions are awaited directly while sync functions
    run in a thread pool to avoid blocking the event loop.

    Example with sync function::

        def my_grader(trace, test_case):
            if "error" in trace.output:
                return GradeResult.failed_result("custom", "Found error")
            return GradeResult.passed_result("custom", "No errors")

        grader = CustomGrader(func=my_grader)

    Example with async function::

        async def my_async_grader(trace, test_case):
            # Can make async API calls here
            result = await external_validation_api(trace.output)
            if result.valid:
                return GradeResult.passed_result("custom", "Valid")
            return GradeResult.failed_result("custom", "Invalid")

        grader = CustomGrader(func=my_async_grader)
    """

    name = "custom"

    def __init__(
        self,
        func: Callable[[Trace, EvalCase], GradeResult] | None = None,
        module: str | None = None,
        function: str | None = None,
    ) -> None:
        """Initialize custom grader.

        Args:
            func: Custom grading function. Can be sync or async.
                Signature: (trace, test_case) -> GradeResult
            module: Module path to import function from (alternative to func).
            function: Function name to import from module.

        Provide either `func` directly, or `module` and `function` to import.
        """
        self.func = func
        self.module_name = module
        self.function_name = function
        # Lazily imported function cache (module+function mode only).
        self._loaded_func: Callable | None = None

    def _get_func(self) -> Callable[[Trace, EvalCase], GradeResult]:
        """Return the grading function, importing and caching it if needed.

        Raises:
            ValueError: If neither func nor module+function was provided.
        """
        if self.func is not None:
            return self.func

        if self._loaded_func is not None:
            return self._loaded_func

        if self.module_name and self.function_name:
            module = importlib.import_module(self.module_name)
            self._loaded_func = getattr(module, self.function_name)
            return self._loaded_func

        raise ValueError("CustomGrader requires either func or module+function")

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Run the custom grading function (sync).

        Note: If your custom function is async, use grade_async() instead,
        which will properly await the function.
        """
        try:
            func = self._get_func()
            # Bug fix: calling an async function synchronously would return
            # an un-awaited coroutine object instead of a GradeResult.
            # Report a clear error result directing callers to grade_async().
            if asyncio.iscoroutinefunction(func):
                return GradeResult.error_result(
                    self.name,
                    "Custom grader error: async function requires grade_async()",
                )
            return func(trace, test_case)
        except Exception as e:
            return GradeResult.error_result(self.name, f"Custom grader error: {e}")

    async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Run the custom grading function (async).

        Automatically detects if the custom function is async or sync:
        - Async functions are awaited directly
        - Sync functions run in a thread pool to avoid blocking the event loop

        This allows custom graders to make async API calls (e.g., external
        validation services) without blocking other concurrent evaluations.
        """
        try:
            func = self._get_func()
            if asyncio.iscoroutinefunction(func):
                return await func(trace, test_case)
            else:
                return await asyncio.to_thread(func, trace, test_case)
        except Exception as e:
            return GradeResult.error_result(self.name, f"Custom grader error: {e}")