evaldeck-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evaldeck/evaluator.py ADDED
@@ -0,0 +1,566 @@
+ """Main evaluation engine."""
+
+ from __future__ import annotations
+
+ import asyncio
+ from collections.abc import Awaitable, Callable
+ from contextlib import asynccontextmanager
+ from datetime import datetime
+ from typing import TYPE_CHECKING, Any
+
+ from evaldeck.graders import (
+     BaseGrader,
+     ContainsGrader,
+     LLMGrader,
+     MaxStepsGrader,
+     TaskCompletedGrader,
+     ToolCalledGrader,
+     ToolNotCalledGrader,
+     ToolOrderGrader,
+ )
+ from evaldeck.graders.code import NotContainsGrader
+ from evaldeck.metrics import (
+     BaseMetric,
+     DurationMetric,
+     StepCountMetric,
+     TokenUsageMetric,
+     ToolCallCountMetric,
+ )
+ from evaldeck.results import (
+     EvaluationResult,
+     GradeResult,
+     GradeStatus,
+     RunResult,
+     SuiteResult,
+ )
+
+ if TYPE_CHECKING:
+     from evaldeck.config import EvaldeckConfig
+     from evaldeck.test_case import EvalCase, EvalSuite
+     from evaldeck.trace import Trace
+
+
+ class Evaluator:
+     """Main evaluation engine.
+
+     Evaluates agent traces against test cases using graders and metrics.
+
+     Choosing sync vs async methods:
+
+     Use **evaluate()** (sync) when:
+     - Running a single quick evaluation with code-based graders
+     - Your graders are all CPU-bound (ContainsGrader, RegexGrader, etc.)
+     - You're in a sync context without an event loop
+
+     Use **evaluate_async()** when:
+     - Using LLMGrader or other I/O-bound graders
+     - Running multiple graders that make API calls
+     - You want concurrent grader execution for better throughput
+     - Your custom graders/metrics make async API calls
+
+     Use **evaluate_suite_async()** when:
+     - Running multiple test cases (concurrent execution)
+     - Your agent function is async
+     - You want to control concurrency with max_concurrent
+
+     Performance comparison::
+
+         # Sync: graders run sequentially
+         # 3 LLMGraders × 2 seconds each = ~6 seconds total
+         result = evaluator.evaluate(trace, test_case)
+
+         # Async: graders run concurrently
+         # 3 LLMGraders × 2 seconds each = ~2 seconds total
+         result = await evaluator.evaluate_async(trace, test_case)
+     """
+
+     def __init__(
+         self,
+         graders: list[BaseGrader] | None = None,
+         metrics: list[BaseMetric] | None = None,
+         config: EvaldeckConfig | None = None,
+     ) -> None:
+         """Initialize the evaluator.
+
+         Args:
+             graders: List of graders to use. If None, uses defaults based on test case.
+             metrics: List of metrics to calculate. If None, uses defaults.
+             config: Evaldeck configuration.
+         """
+         self.graders = graders
+         self.metrics = metrics or self._default_metrics()
+         self.config = config
+
+     def _default_metrics(self) -> list[BaseMetric]:
+         """Get default metrics."""
+         return [
+             StepCountMetric(),
+             TokenUsageMetric(),
+             ToolCallCountMetric(),
+             DurationMetric(),
+         ]
+
+     def _build_graders(self, test_case: EvalCase) -> list[BaseGrader]:
+         """Build graders from test case expected behavior."""
+         graders: list[BaseGrader] = []
+         expected = test_case.expected
+
+         # Add graders based on expected behavior
+         if expected.output_contains:
+             graders.append(ContainsGrader())
+
+         if expected.output_not_contains:
+             graders.append(NotContainsGrader())
+
+         if expected.tools_called:
+             graders.append(ToolCalledGrader())
+
+         if expected.tools_not_called:
+             graders.append(ToolNotCalledGrader())
+
+         if expected.tool_call_order:
+             graders.append(ToolOrderGrader())
+
+         if expected.max_steps is not None:
+             graders.append(MaxStepsGrader())
+
+         if expected.task_completed is not None:
+             graders.append(TaskCompletedGrader())
+
+         # Add graders from test case config
+         for grader_config in test_case.graders:
+             grader = self._create_grader_from_config(grader_config)
+             if grader:
+                 graders.append(grader)
+
+         # If no graders, add basic task completion check
+         if not graders:
+             graders.append(TaskCompletedGrader())
+
+         return graders
+
+     def _create_grader_from_config(self, config: Any) -> BaseGrader | None:
+         """Create a grader from configuration."""
+         from evaldeck.test_case import GraderConfig
+
+         if isinstance(config, GraderConfig):
+             grader_type = config.type.lower()
+
+             if grader_type == "llm":
+                 return LLMGrader(
+                     prompt=config.prompt,
+                     model=config.model or "gpt-4o-mini",
+                     threshold=config.threshold,
+                 )
+             elif grader_type == "contains":
+                 return ContainsGrader(**config.params)
+             elif grader_type == "tool_called":
+                 return ToolCalledGrader(**config.params)
+             # Add more grader types as needed
+
+         return None
+
+     def evaluate(
+         self,
+         trace: Trace,
+         test_case: EvalCase,
+     ) -> EvaluationResult:
+         """Evaluate a single trace against a test case (sync).
+
+         Runs graders and metrics sequentially. Best for:
+         - Code-based graders (ContainsGrader, RegexGrader, etc.)
+         - Quick evaluations without I/O-bound operations
+         - Contexts without an async event loop
+
+         For I/O-bound graders (LLMGrader) or concurrent execution,
+         use evaluate_async() instead.
+
+         Args:
+             trace: The execution trace to evaluate.
+             test_case: The test case defining expected behavior.
+
+         Returns:
+             EvaluationResult with grades and metrics.
+         """
+         started_at = datetime.now()
+
+         # Build graders
+         graders = self.graders if self.graders else self._build_graders(test_case)
+
+         # Create result
+         result = EvaluationResult(
+             test_case_name=test_case.name,
+             status=GradeStatus.PASS,  # Start optimistic
+             started_at=started_at,
+             trace_id=trace.id,
+         )
+
+         # Run graders sequentially
+         for grader in graders:
+             try:
+                 grade = grader.grade(trace, test_case)
+                 result.add_grade(grade)
+             except Exception as e:
+                 result.add_grade(GradeResult.error_result(grader.name, f"Grader error: {e}"))
+
+         # Calculate metrics
+         for metric in self.metrics:
+             try:
+                 metric_result = metric.calculate(trace, test_case)
+                 result.add_metric(metric_result)
+             except Exception:
+                 pass  # Metrics are optional, don't fail on error
+
+         # Finalize
+         result.completed_at = datetime.now()
+         result.duration_ms = (result.completed_at - started_at).total_seconds() * 1000
+
+         return result
+
+     async def evaluate_async(
+         self,
+         trace: Trace,
+         test_case: EvalCase,
+     ) -> EvaluationResult:
+         """Evaluate a single trace against a test case (async).
+
+         Runs graders and metrics concurrently using asyncio.gather().
+         Recommended for:
+         - LLMGrader (makes async API calls to OpenAI/Anthropic)
+         - Custom async graders that call external services
+         - Custom async metrics that fetch benchmark data
+         - Any scenario with multiple I/O-bound operations
+
+         Performance benefit: With 3 LLMGraders each taking 2 seconds,
+         sync evaluate() takes ~6 seconds while evaluate_async() takes ~2 seconds.
+
+         Code-based graders (ContainsGrader, etc.) automatically run in a
+         thread pool via asyncio.to_thread() to avoid blocking the event loop.
+
+         Args:
+             trace: The execution trace to evaluate.
+             test_case: The test case defining expected behavior.
+
+         Returns:
+             EvaluationResult with grades and metrics.
+         """
+         started_at = datetime.now()
+
+         # Build graders
+         graders = self.graders if self.graders else self._build_graders(test_case)
+
+         # Create result
+         result = EvaluationResult(
+             test_case_name=test_case.name,
+             status=GradeStatus.PASS,  # Start optimistic
+             started_at=started_at,
+             trace_id=trace.id,
+         )
+
+         # Run graders concurrently
+         async def run_grader(grader):
+             try:
+                 return await grader.grade_async(trace, test_case)
+             except Exception as e:
+                 return GradeResult.error_result(grader.name, f"Grader error: {e}")
+
+         grade_results = await asyncio.gather(*[run_grader(g) for g in graders])
+
+         for grade in grade_results:
+             result.add_grade(grade)
+
+         # Calculate metrics concurrently (supports async custom metrics)
+         async def run_metric(metric):
+             try:
+                 return await metric.calculate_async(trace, test_case)
+             except Exception:
+                 return None  # Metrics are optional, don't fail on error
+
+         metric_results = await asyncio.gather(*[run_metric(m) for m in self.metrics])
+
+         for metric_result in metric_results:
+             if metric_result is not None:
+                 result.add_metric(metric_result)
+
+         # Finalize
+         result.completed_at = datetime.now()
+         result.duration_ms = (result.completed_at - started_at).total_seconds() * 1000
+
+         return result
+
+     def evaluate_suite(
+         self,
+         suite: EvalSuite,
+         agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
+         on_result: Callable[[EvaluationResult], None] | None = None,
+         max_concurrent: int = 0,
+     ) -> SuiteResult:
+         """Evaluate all test cases in a suite (sync wrapper).
+
+         Args:
+             suite: The test suite to evaluate.
+             agent_func: Function that takes input string and returns a Trace.
+                 Can be sync or async.
+             on_result: Optional callback called after each test case.
+             max_concurrent: Maximum concurrent tests. 0 = unlimited.
+
+         Returns:
+             SuiteResult with all evaluation results.
+         """
+         return asyncio.run(self.evaluate_suite_async(suite, agent_func, on_result, max_concurrent))
+
+     async def evaluate_suite_async(
+         self,
+         suite: EvalSuite,
+         agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
+         on_result: Callable[[EvaluationResult], None] | None = None,
+         max_concurrent: int = 0,
+     ) -> SuiteResult:
+         """Evaluate all test cases in a suite concurrently.
+
+         Args:
+             suite: The test suite to evaluate.
+             agent_func: Function that takes input string and returns a Trace.
+                 Can be sync or async.
+             on_result: Optional callback called after each test case.
+             max_concurrent: Maximum concurrent tests. 0 = unlimited.
+
+         Returns:
+             SuiteResult with all evaluation results.
+         """
+         suite_result = SuiteResult(
+             suite_name=suite.name,
+             started_at=datetime.now(),
+         )
+
+         # Detect if agent is async
+         is_async = asyncio.iscoroutinefunction(agent_func)
+
+         # Create semaphore if limiting concurrency
+         semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
+
+         @asynccontextmanager
+         async def maybe_semaphore():
+             """Context manager that optionally acquires semaphore."""
+             if semaphore:
+                 async with semaphore:
+                     yield
+             else:
+                 yield
+
+         async def run_test(index: int, test_case: EvalCase) -> tuple[int, EvaluationResult]:
+             """Run a single test case."""
+             async with maybe_semaphore():
+                 result = await self._evaluate_single_async(test_case, agent_func, is_async)
+                 if on_result:
+                     on_result(result)
+                 return index, result
+
+         # Run all tests concurrently
+         tasks = [run_test(i, tc) for i, tc in enumerate(suite.test_cases)]
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         # Add results in original order
+         results_by_index: dict[int, EvaluationResult] = {}
+         for item in results:
+             if isinstance(item, Exception):
+                 # This shouldn't happen since _evaluate_single_async catches exceptions
+                 continue
+             index, result = item
+             results_by_index[index] = result
+
+         for i in range(len(suite.test_cases)):
+             if i in results_by_index:
+                 suite_result.add_result(results_by_index[i])
+             else:
+                 # Handle case where gather returned an exception
+                 suite_result.add_result(
+                     EvaluationResult(
+                         test_case_name=suite.test_cases[i].name,
+                         status=GradeStatus.ERROR,
+                         error="Test execution failed unexpectedly",
+                     )
+                 )
+
+         suite_result.completed_at = datetime.now()
+         return suite_result
+
+     async def _evaluate_single_async(
+         self,
+         test_case: EvalCase,
+         agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
+         is_async: bool,
+     ) -> EvaluationResult:
+         """Evaluate a single test case asynchronously.
+
+         Args:
+             test_case: The test case to evaluate.
+             agent_func: Function to run the agent.
+             is_async: Whether agent_func is async.
+
+         Returns:
+             EvaluationResult for this test case.
+         """
+         try:
+             if is_async:
+                 trace = await agent_func(test_case.input)  # type: ignore
+             else:
+                 # Run sync function in thread pool to not block event loop
+                 trace = await asyncio.to_thread(agent_func, test_case.input)  # type: ignore
+
+             # Use async evaluate to run graders concurrently
+             return await self.evaluate_async(trace, test_case)
+
+         except Exception as e:
+             return EvaluationResult(
+                 test_case_name=test_case.name,
+                 status=GradeStatus.ERROR,
+                 error=str(e),
+             )
+
+
+ class EvaluationRunner:
+     """High-level runner for executing evaluations."""
+
+     def __init__(self, config: EvaldeckConfig | None = None) -> None:
+         """Initialize the runner.
+
+         Args:
+             config: Evaldeck configuration. If None, loads from file.
+         """
+         if config is None:
+             from evaldeck.config import EvaldeckConfig
+
+             config = EvaldeckConfig.load()
+         self.config = config
+         self.evaluator = Evaluator(config=config)
+
+     def run(
+         self,
+         suites: list[EvalSuite] | None = None,
+         agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]] | None = None,
+         tags: list[str] | None = None,
+         on_result: Callable[[EvaluationResult], None] | None = None,
+         max_concurrent: int | None = None,
+     ) -> RunResult:
+         """Run evaluation on multiple suites (sync wrapper).
+
+         Args:
+             suites: Test suites to run. If None, discovers from config.
+             agent_func: Function to run agent. If None, loads from config.
+                 Can be sync or async.
+             tags: Filter test cases by tags.
+             on_result: Callback for each result.
+             max_concurrent: Max concurrent tests per suite. None = use config.
+
+         Returns:
+             RunResult with all suite results.
+         """
+         return asyncio.run(self.run_async(suites, agent_func, tags, on_result, max_concurrent))
+
+     async def run_async(
+         self,
+         suites: list[EvalSuite] | None = None,
+         agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]] | None = None,
+         tags: list[str] | None = None,
+         on_result: Callable[[EvaluationResult], None] | None = None,
+         max_concurrent: int | None = None,
+     ) -> RunResult:
+         """Run evaluation on multiple suites asynchronously.
+
+         Args:
+             suites: Test suites to run. If None, discovers from config.
+             agent_func: Function to run agent. If None, loads from config.
+                 Can be sync or async.
+             tags: Filter test cases by tags.
+             on_result: Callback for each result.
+             max_concurrent: Max concurrent tests per suite. None = use config.
+
+         Returns:
+             RunResult with all suite results.
+         """
+         # Load suites if not provided
+         if suites is None:
+             suites = self._discover_suites()
+
+         # Load agent function if not provided
+         if agent_func is None:
+             agent_func = self._load_agent_func()
+
+         # Filter by tags if specified
+         if tags:
+             suites = [s.filter_by_tags(tags) for s in suites]
+
+         # Determine worker count
+         effective_max_concurrent = (
+             max_concurrent if max_concurrent is not None else self.config.execution.workers
+         )
+
+         # Run evaluation
+         run_result = RunResult(
+             started_at=datetime.now(),
+             config=self.config.model_dump(),
+         )
+
+         for suite in suites:
+             if not suite.test_cases:
+                 continue
+
+             suite_result = await self.evaluator.evaluate_suite_async(
+                 suite=suite,
+                 agent_func=agent_func,
+                 on_result=on_result,
+                 max_concurrent=effective_max_concurrent,
+             )
+             run_result.add_suite(suite_result)
+
+         run_result.completed_at = datetime.now()
+         return run_result
+
+     def _discover_suites(self) -> list[EvalSuite]:
+         """Discover test suites from configuration."""
+         from pathlib import Path
+
+         from evaldeck.test_case import EvalSuite
+
+         suites = []
+
+         # Use configured suites
+         if self.config.suites:
+             for suite_config in self.config.suites:
+                 path = Path(suite_config.path)
+                 if path.is_dir():
+                     suite = EvalSuite.from_directory(path, name=suite_config.name)
+                     suites.append(suite)
+
+         # Or discover from test_dir
+         else:
+             test_dir = Path(self.config.test_dir)
+             if test_dir.is_dir():
+                 # Check for subdirectories (each is a suite)
+                 subdirs = [d for d in test_dir.iterdir() if d.is_dir()]
+                 if subdirs:
+                     for subdir in subdirs:
+                         suite = EvalSuite.from_directory(subdir)
+                         suites.append(suite)
+                 else:
+                     # Single suite from test_dir
+                     suite = EvalSuite.from_directory(test_dir, name="default")
+                     suites.append(suite)
+
+         return suites
+
+     def _load_agent_func(self) -> Callable[[str], Trace]:
+         """Load agent function from configuration."""
+         import importlib
+
+         agent_config = self.config.agent
+
+         if not agent_config.module or not agent_config.function:
+             raise ValueError(
+                 "Agent module and function must be specified in config or provided directly"
+             )
+
+         module = importlib.import_module(agent_config.module)
+         func = getattr(module, agent_config.function)
+         return func
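
The Evaluator docstring above contrasts the sequential evaluate() path with the concurrent evaluate_async() path. Below is a minimal usage sketch of that choice; it is illustrative only and not part of the packaged module. Trace and EvalCase construction is not visible in this diff, so both are taken as parameters, and LLMGrader is assumed to provide defaults for any constructor arguments not passed here.

# Illustrative sketch -- not part of the evaldeck 0.1.0 wheel contents above.
import asyncio

from evaldeck.evaluator import Evaluator
from evaldeck.graders import ContainsGrader, LLMGrader


def grade_sync(trace, test_case):
    # Code-based graders are CPU-bound, so the sequential sync path is enough.
    evaluator = Evaluator(graders=[ContainsGrader()])
    return evaluator.evaluate(trace, test_case)


async def grade_async(trace, test_case):
    # LLMGrader makes API calls; evaluate_async() runs all graders concurrently
    # via asyncio.gather, so three 2-second graders finish in roughly 2 s, not 6 s.
    evaluator = Evaluator(
        graders=[
            ContainsGrader(),
            LLMGrader(prompt="Did the agent answer the user's question?", model="gpt-4o-mini"),
        ]
    )
    return await evaluator.evaluate_async(trace, test_case)

From synchronous code, asyncio.run(grade_async(trace, test_case)) drives the async path, mirroring how evaluate_suite() wraps evaluate_suite_async() internally.
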
@@ -0,0 +1,36 @@
+ """Graders for evaluating agent traces."""
+
+ from evaldeck.graders.base import BaseGrader, CompositeGrader
+ from evaldeck.graders.code import (
+     ContainsGrader,
+     CustomGrader,
+     EqualsGrader,
+     MaxStepsGrader,
+     NotContainsGrader,
+     RegexGrader,
+     TaskCompletedGrader,
+     ToolCalledGrader,
+     ToolNotCalledGrader,
+     ToolOrderGrader,
+ )
+ from evaldeck.graders.llm import LLMGrader, LLMRubricGrader
+
+ __all__ = [
+     # Base
+     "BaseGrader",
+     "CompositeGrader",
+     # Code-based
+     "ContainsGrader",
+     "NotContainsGrader",
+     "EqualsGrader",
+     "RegexGrader",
+     "ToolCalledGrader",
+     "ToolNotCalledGrader",
+     "ToolOrderGrader",
+     "MaxStepsGrader",
+     "TaskCompletedGrader",
+     "CustomGrader",
+     # Model-based
+     "LLMGrader",
+     "LLMRubricGrader",
+ ]
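
Suite-level runs go through evaluate_suite_async(), which calls the agent itself, fans test cases out concurrently, and optionally throttles them with a semaphore. The sketch below shows one way that might be wired up; my_agent and the tests/evals directory are placeholder assumptions, and how a Trace is built from an agent run is defined in modules not included in this diff.

# Illustrative sketch -- not part of the package contents above.
# my_agent is a placeholder: it must accept the test-case input string and
# return an evaldeck Trace, which is what evaluate_suite_async() expects.
import asyncio
from pathlib import Path

from evaldeck.evaluator import Evaluator
from evaldeck.test_case import EvalSuite


async def run_smoke_suite(my_agent):
    # EvalSuite.from_directory() is the same loader EvaluationRunner._discover_suites() uses.
    suite = EvalSuite.from_directory(Path("tests/evals"), name="smoke")

    def on_result(result):
        # Called after each test case completes.
        print(result.test_case_name, result.status)

    evaluator = Evaluator()
    suite_result = await evaluator.evaluate_suite_async(
        suite=suite,
        agent_func=my_agent,  # sync or async; sync callables run via asyncio.to_thread
        on_result=on_result,
        max_concurrent=4,  # 0 means unlimited concurrency
    )
    print(suite_result.suite_name, suite_result.completed_at)

EvaluationRunner.run() wraps this same flow, discovering suites and the agent entry point from configuration instead of taking them as arguments.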