evaldeck-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaldeck/__init__.py +88 -0
- evaldeck/cli.py +324 -0
- evaldeck/config.py +223 -0
- evaldeck/evaluator.py +566 -0
- evaldeck/graders/__init__.py +36 -0
- evaldeck/graders/base.py +146 -0
- evaldeck/graders/code.py +484 -0
- evaldeck/graders/llm.py +344 -0
- evaldeck/integrations/__init__.py +29 -0
- evaldeck/integrations/opentelemetry.py +416 -0
- evaldeck/metrics/__init__.py +25 -0
- evaldeck/metrics/base.py +62 -0
- evaldeck/metrics/builtin.py +195 -0
- evaldeck/results.py +211 -0
- evaldeck/test_case.py +162 -0
- evaldeck/trace.py +215 -0
- evaldeck-0.1.0.dist-info/METADATA +363 -0
- evaldeck-0.1.0.dist-info/RECORD +21 -0
- evaldeck-0.1.0.dist-info/WHEEL +4 -0
- evaldeck-0.1.0.dist-info/entry_points.txt +2 -0
- evaldeck-0.1.0.dist-info/licenses/LICENSE +190 -0
evaldeck/evaluator.py
ADDED
@@ -0,0 +1,566 @@

"""Main evaluation engine."""

from __future__ import annotations

import asyncio
from collections.abc import Awaitable, Callable
from contextlib import asynccontextmanager
from datetime import datetime
from typing import TYPE_CHECKING, Any

from evaldeck.graders import (
    BaseGrader,
    ContainsGrader,
    LLMGrader,
    MaxStepsGrader,
    TaskCompletedGrader,
    ToolCalledGrader,
    ToolNotCalledGrader,
    ToolOrderGrader,
)
from evaldeck.graders.code import NotContainsGrader
from evaldeck.metrics import (
    BaseMetric,
    DurationMetric,
    StepCountMetric,
    TokenUsageMetric,
    ToolCallCountMetric,
)
from evaldeck.results import (
    EvaluationResult,
    GradeResult,
    GradeStatus,
    RunResult,
    SuiteResult,
)

if TYPE_CHECKING:
    from evaldeck.config import EvaldeckConfig
    from evaldeck.test_case import EvalCase, EvalSuite
    from evaldeck.trace import Trace


class Evaluator:
    """Main evaluation engine.

    Evaluates agent traces against test cases using graders and metrics.

    Choosing sync vs async methods:

    Use **evaluate()** (sync) when:
    - Running a single quick evaluation with code-based graders
    - Your graders are all CPU-bound (ContainsGrader, RegexGrader, etc.)
    - You're in a sync context without an event loop

    Use **evaluate_async()** when:
    - Using LLMGrader or other I/O-bound graders
    - Running multiple graders that make API calls
    - You want concurrent grader execution for better throughput
    - Your custom graders/metrics make async API calls

    Use **evaluate_suite_async()** when:
    - Running multiple test cases (concurrent execution)
    - Your agent function is async
    - You want to control concurrency with max_concurrent

    Performance comparison::

        # Sync: graders run sequentially
        # 3 LLMGraders × 2 seconds each = ~6 seconds total
        result = evaluator.evaluate(trace, test_case)

        # Async: graders run concurrently
        # 3 LLMGraders × 2 seconds each = ~2 seconds total
        result = await evaluator.evaluate_async(trace, test_case)
    """

    def __init__(
        self,
        graders: list[BaseGrader] | None = None,
        metrics: list[BaseMetric] | None = None,
        config: EvaldeckConfig | None = None,
    ) -> None:
        """Initialize the evaluator.

        Args:
            graders: List of graders to use. If None, uses defaults based on test case.
            metrics: List of metrics to calculate. If None, uses defaults.
            config: Evaldeck configuration.
        """
        self.graders = graders
        self.metrics = metrics or self._default_metrics()
        self.config = config

    def _default_metrics(self) -> list[BaseMetric]:
        """Get default metrics."""
        return [
            StepCountMetric(),
            TokenUsageMetric(),
            ToolCallCountMetric(),
            DurationMetric(),
        ]

    def _build_graders(self, test_case: EvalCase) -> list[BaseGrader]:
        """Build graders from test case expected behavior."""
        graders: list[BaseGrader] = []
        expected = test_case.expected

        # Add graders based on expected behavior
        if expected.output_contains:
            graders.append(ContainsGrader())

        if expected.output_not_contains:
            graders.append(NotContainsGrader())

        if expected.tools_called:
            graders.append(ToolCalledGrader())

        if expected.tools_not_called:
            graders.append(ToolNotCalledGrader())

        if expected.tool_call_order:
            graders.append(ToolOrderGrader())

        if expected.max_steps is not None:
            graders.append(MaxStepsGrader())

        if expected.task_completed is not None:
            graders.append(TaskCompletedGrader())

        # Add graders from test case config
        for grader_config in test_case.graders:
            grader = self._create_grader_from_config(grader_config)
            if grader:
                graders.append(grader)

        # If no graders, add basic task completion check
        if not graders:
            graders.append(TaskCompletedGrader())

        return graders

    def _create_grader_from_config(self, config: Any) -> BaseGrader | None:
        """Create a grader from configuration."""
        from evaldeck.test_case import GraderConfig

        if isinstance(config, GraderConfig):
            grader_type = config.type.lower()

            if grader_type == "llm":
                return LLMGrader(
                    prompt=config.prompt,
                    model=config.model or "gpt-4o-mini",
                    threshold=config.threshold,
                )
            elif grader_type == "contains":
                return ContainsGrader(**config.params)
            elif grader_type == "tool_called":
                return ToolCalledGrader(**config.params)
            # Add more grader types as needed

        return None

    def evaluate(
        self,
        trace: Trace,
        test_case: EvalCase,
    ) -> EvaluationResult:
        """Evaluate a single trace against a test case (sync).

        Runs graders and metrics sequentially. Best for:
        - Code-based graders (ContainsGrader, RegexGrader, etc.)
        - Quick evaluations without I/O-bound operations
        - Contexts without an async event loop

        For I/O-bound graders (LLMGrader) or concurrent execution,
        use evaluate_async() instead.

        Args:
            trace: The execution trace to evaluate.
            test_case: The test case defining expected behavior.

        Returns:
            EvaluationResult with grades and metrics.
        """
        started_at = datetime.now()

        # Build graders
        graders = self.graders if self.graders else self._build_graders(test_case)

        # Create result
        result = EvaluationResult(
            test_case_name=test_case.name,
            status=GradeStatus.PASS,  # Start optimistic
            started_at=started_at,
            trace_id=trace.id,
        )

        # Run graders sequentially
        for grader in graders:
            try:
                grade = grader.grade(trace, test_case)
                result.add_grade(grade)
            except Exception as e:
                result.add_grade(GradeResult.error_result(grader.name, f"Grader error: {e}"))

        # Calculate metrics
        for metric in self.metrics:
            try:
                metric_result = metric.calculate(trace, test_case)
                result.add_metric(metric_result)
            except Exception:
                pass  # Metrics are optional, don't fail on error

        # Finalize
        result.completed_at = datetime.now()
        result.duration_ms = (result.completed_at - started_at).total_seconds() * 1000

        return result

    async def evaluate_async(
        self,
        trace: Trace,
        test_case: EvalCase,
    ) -> EvaluationResult:
        """Evaluate a single trace against a test case (async).

        Runs graders and metrics concurrently using asyncio.gather().
        Recommended for:
        - LLMGrader (makes async API calls to OpenAI/Anthropic)
        - Custom async graders that call external services
        - Custom async metrics that fetch benchmark data
        - Any scenario with multiple I/O-bound operations

        Performance benefit: With 3 LLMGraders each taking 2 seconds,
        sync evaluate() takes ~6 seconds while evaluate_async() takes ~2 seconds.

        Code-based graders (ContainsGrader, etc.) automatically run in a
        thread pool via asyncio.to_thread() to avoid blocking the event loop.

        Args:
            trace: The execution trace to evaluate.
            test_case: The test case defining expected behavior.

        Returns:
            EvaluationResult with grades and metrics.
        """
        started_at = datetime.now()

        # Build graders
        graders = self.graders if self.graders else self._build_graders(test_case)

        # Create result
        result = EvaluationResult(
            test_case_name=test_case.name,
            status=GradeStatus.PASS,  # Start optimistic
            started_at=started_at,
            trace_id=trace.id,
        )

        # Run graders concurrently
        async def run_grader(grader):
            try:
                return await grader.grade_async(trace, test_case)
            except Exception as e:
                return GradeResult.error_result(grader.name, f"Grader error: {e}")

        grade_results = await asyncio.gather(*[run_grader(g) for g in graders])

        for grade in grade_results:
            result.add_grade(grade)

        # Calculate metrics concurrently (supports async custom metrics)
        async def run_metric(metric):
            try:
                return await metric.calculate_async(trace, test_case)
            except Exception:
                return None  # Metrics are optional, don't fail on error

        metric_results = await asyncio.gather(*[run_metric(m) for m in self.metrics])

        for metric_result in metric_results:
            if metric_result is not None:
                result.add_metric(metric_result)

        # Finalize
        result.completed_at = datetime.now()
        result.duration_ms = (result.completed_at - started_at).total_seconds() * 1000

        return result

    def evaluate_suite(
        self,
        suite: EvalSuite,
        agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
        on_result: Callable[[EvaluationResult], None] | None = None,
        max_concurrent: int = 0,
    ) -> SuiteResult:
        """Evaluate all test cases in a suite (sync wrapper).

        Args:
            suite: The test suite to evaluate.
            agent_func: Function that takes input string and returns a Trace.
                Can be sync or async.
            on_result: Optional callback called after each test case.
            max_concurrent: Maximum concurrent tests. 0 = unlimited.

        Returns:
            SuiteResult with all evaluation results.
        """
        return asyncio.run(self.evaluate_suite_async(suite, agent_func, on_result, max_concurrent))

    async def evaluate_suite_async(
        self,
        suite: EvalSuite,
        agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
        on_result: Callable[[EvaluationResult], None] | None = None,
        max_concurrent: int = 0,
    ) -> SuiteResult:
        """Evaluate all test cases in a suite concurrently.

        Args:
            suite: The test suite to evaluate.
            agent_func: Function that takes input string and returns a Trace.
                Can be sync or async.
            on_result: Optional callback called after each test case.
            max_concurrent: Maximum concurrent tests. 0 = unlimited.

        Returns:
            SuiteResult with all evaluation results.
        """
        suite_result = SuiteResult(
            suite_name=suite.name,
            started_at=datetime.now(),
        )

        # Detect if agent is async
        is_async = asyncio.iscoroutinefunction(agent_func)

        # Create semaphore if limiting concurrency
        semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None

        @asynccontextmanager
        async def maybe_semaphore():
            """Context manager that optionally acquires semaphore."""
            if semaphore:
                async with semaphore:
                    yield
            else:
                yield

        async def run_test(index: int, test_case: EvalCase) -> tuple[int, EvaluationResult]:
            """Run a single test case."""
            async with maybe_semaphore():
                result = await self._evaluate_single_async(test_case, agent_func, is_async)
                if on_result:
                    on_result(result)
                return index, result

        # Run all tests concurrently
        tasks = [run_test(i, tc) for i, tc in enumerate(suite.test_cases)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Add results in original order
        results_by_index: dict[int, EvaluationResult] = {}
        for item in results:
            if isinstance(item, Exception):
                # This shouldn't happen since _evaluate_single_async catches exceptions
                continue
            index, result = item
            results_by_index[index] = result

        for i in range(len(suite.test_cases)):
            if i in results_by_index:
                suite_result.add_result(results_by_index[i])
            else:
                # Handle case where gather returned an exception
                suite_result.add_result(
                    EvaluationResult(
                        test_case_name=suite.test_cases[i].name,
                        status=GradeStatus.ERROR,
                        error="Test execution failed unexpectedly",
                    )
                )

        suite_result.completed_at = datetime.now()
        return suite_result

    async def _evaluate_single_async(
        self,
        test_case: EvalCase,
        agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]],
        is_async: bool,
    ) -> EvaluationResult:
        """Evaluate a single test case asynchronously.

        Args:
            test_case: The test case to evaluate.
            agent_func: Function to run the agent.
            is_async: Whether agent_func is async.

        Returns:
            EvaluationResult for this test case.
        """
        try:
            if is_async:
                trace = await agent_func(test_case.input)  # type: ignore
            else:
                # Run sync function in thread pool to not block event loop
                trace = await asyncio.to_thread(agent_func, test_case.input)  # type: ignore

            # Use async evaluate to run graders concurrently
            return await self.evaluate_async(trace, test_case)

        except Exception as e:
            return EvaluationResult(
                test_case_name=test_case.name,
                status=GradeStatus.ERROR,
                error=str(e),
            )


class EvaluationRunner:
    """High-level runner for executing evaluations."""

    def __init__(self, config: EvaldeckConfig | None = None) -> None:
        """Initialize the runner.

        Args:
            config: Evaldeck configuration. If None, loads from file.
        """
        if config is None:
            from evaldeck.config import EvaldeckConfig

            config = EvaldeckConfig.load()
        self.config = config
        self.evaluator = Evaluator(config=config)

    def run(
        self,
        suites: list[EvalSuite] | None = None,
        agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]] | None = None,
        tags: list[str] | None = None,
        on_result: Callable[[EvaluationResult], None] | None = None,
        max_concurrent: int | None = None,
    ) -> RunResult:
        """Run evaluation on multiple suites (sync wrapper).

        Args:
            suites: Test suites to run. If None, discovers from config.
            agent_func: Function to run agent. If None, loads from config.
                Can be sync or async.
            tags: Filter test cases by tags.
            on_result: Callback for each result.
            max_concurrent: Max concurrent tests per suite. None = use config.

        Returns:
            RunResult with all suite results.
        """
        return asyncio.run(self.run_async(suites, agent_func, tags, on_result, max_concurrent))

    async def run_async(
        self,
        suites: list[EvalSuite] | None = None,
        agent_func: Callable[[str], Trace] | Callable[[str], Awaitable[Trace]] | None = None,
        tags: list[str] | None = None,
        on_result: Callable[[EvaluationResult], None] | None = None,
        max_concurrent: int | None = None,
    ) -> RunResult:
        """Run evaluation on multiple suites asynchronously.

        Args:
            suites: Test suites to run. If None, discovers from config.
            agent_func: Function to run agent. If None, loads from config.
                Can be sync or async.
            tags: Filter test cases by tags.
            on_result: Callback for each result.
            max_concurrent: Max concurrent tests per suite. None = use config.

        Returns:
            RunResult with all suite results.
        """
        # Load suites if not provided
        if suites is None:
            suites = self._discover_suites()

        # Load agent function if not provided
        if agent_func is None:
            agent_func = self._load_agent_func()

        # Filter by tags if specified
        if tags:
            suites = [s.filter_by_tags(tags) for s in suites]

        # Determine worker count
        effective_max_concurrent = (
            max_concurrent if max_concurrent is not None else self.config.execution.workers
        )

        # Run evaluation
        run_result = RunResult(
            started_at=datetime.now(),
            config=self.config.model_dump(),
        )

        for suite in suites:
            if not suite.test_cases:
                continue

            suite_result = await self.evaluator.evaluate_suite_async(
                suite=suite,
                agent_func=agent_func,
                on_result=on_result,
                max_concurrent=effective_max_concurrent,
            )
            run_result.add_suite(suite_result)

        run_result.completed_at = datetime.now()
        return run_result

    def _discover_suites(self) -> list[EvalSuite]:
        """Discover test suites from configuration."""
        from pathlib import Path

        from evaldeck.test_case import EvalSuite

        suites = []

        # Use configured suites
        if self.config.suites:
            for suite_config in self.config.suites:
                path = Path(suite_config.path)
                if path.is_dir():
                    suite = EvalSuite.from_directory(path, name=suite_config.name)
                    suites.append(suite)

        # Or discover from test_dir
        else:
            test_dir = Path(self.config.test_dir)
            if test_dir.is_dir():
                # Check for subdirectories (each is a suite)
                subdirs = [d for d in test_dir.iterdir() if d.is_dir()]
                if subdirs:
                    for subdir in subdirs:
                        suite = EvalSuite.from_directory(subdir)
                        suites.append(suite)
                else:
                    # Single suite from test_dir
                    suite = EvalSuite.from_directory(test_dir, name="default")
                    suites.append(suite)

        return suites

    def _load_agent_func(self) -> Callable[[str], Trace]:
        """Load agent function from configuration."""
        import importlib

        agent_config = self.config.agent

        if not agent_config.module or not agent_config.function:
            raise ValueError(
                "Agent module and function must be specified in config or provided directly"
            )

        module = importlib.import_module(agent_config.module)
        func = getattr(module, agent_config.function)
        return func
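Usage sketch (illustration only, not code shipped in the wheel): the snippet below wires the API shown in this file together end to end. It relies only on signatures visible above; the tests/evals path is a placeholder, and my_agent is a stub because constructing a Trace is defined in evaldeck/trace.py, which this hunk does not show.

    from pathlib import Path

    from evaldeck.evaluator import Evaluator
    from evaldeck.test_case import EvalSuite

    def my_agent(user_input: str):
        # Stub: run your agent here and wrap its execution in an evaldeck Trace
        # (see evaldeck/trace.py for the Trace model).
        raise NotImplementedError

    evaluator = Evaluator()  # graders are derived from each test case's expectations
    suite = EvalSuite.from_directory(Path("tests/evals"))  # placeholder path
    suite_result = evaluator.evaluate_suite(suite, my_agent, max_concurrent=4)

Note that evaluate_suite() wraps evaluate_suite_async() in asyncio.run(), so it should only be called from code that is not already running inside an event loop; from async code, await evaluate_suite_async() directly.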
evaldeck/graders/__init__.py
ADDED

@@ -0,0 +1,36 @@

"""Graders for evaluating agent traces."""

from evaldeck.graders.base import BaseGrader, CompositeGrader
from evaldeck.graders.code import (
    ContainsGrader,
    CustomGrader,
    EqualsGrader,
    MaxStepsGrader,
    NotContainsGrader,
    RegexGrader,
    TaskCompletedGrader,
    ToolCalledGrader,
    ToolNotCalledGrader,
    ToolOrderGrader,
)
from evaldeck.graders.llm import LLMGrader, LLMRubricGrader

__all__ = [
    # Base
    "BaseGrader",
    "CompositeGrader",
    # Code-based
    "ContainsGrader",
    "NotContainsGrader",
    "EqualsGrader",
    "RegexGrader",
    "ToolCalledGrader",
    "ToolNotCalledGrader",
    "ToolOrderGrader",
    "MaxStepsGrader",
    "TaskCompletedGrader",
    "CustomGrader",
    # Model-based
    "LLMGrader",
    "LLMRubricGrader",
]
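These exports line up with the graders parameter of Evaluator in evaluator.py. A hedged sketch of wiring explicit graders follows; the prompt string is purely illustrative, and the commented call assumes a trace and test_case obtained elsewhere, since their construction lives in other modules of the package.

    from evaldeck.evaluator import Evaluator
    from evaldeck.graders import ContainsGrader, LLMGrader, ToolCalledGrader

    evaluator = Evaluator(
        graders=[
            ContainsGrader(),
            ToolCalledGrader(),
            LLMGrader(prompt="Did the agent resolve the user's request?", model="gpt-4o-mini"),
        ]
    )

    # With an I/O-bound grader such as LLMGrader in the list, prefer the async path,
    # which fans the graders out concurrently via asyncio.gather():
    #     result = await evaluator.evaluate_async(trace, test_case)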