evaldeck-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- evaldeck/__init__.py +88 -0
- evaldeck/cli.py +324 -0
- evaldeck/config.py +223 -0
- evaldeck/evaluator.py +566 -0
- evaldeck/graders/__init__.py +36 -0
- evaldeck/graders/base.py +146 -0
- evaldeck/graders/code.py +484 -0
- evaldeck/graders/llm.py +344 -0
- evaldeck/integrations/__init__.py +29 -0
- evaldeck/integrations/opentelemetry.py +416 -0
- evaldeck/metrics/__init__.py +25 -0
- evaldeck/metrics/base.py +62 -0
- evaldeck/metrics/builtin.py +195 -0
- evaldeck/results.py +211 -0
- evaldeck/test_case.py +162 -0
- evaldeck/trace.py +215 -0
- evaldeck-0.1.0.dist-info/METADATA +363 -0
- evaldeck-0.1.0.dist-info/RECORD +21 -0
- evaldeck-0.1.0.dist-info/WHEEL +4 -0
- evaldeck-0.1.0.dist-info/entry_points.txt +2 -0
- evaldeck-0.1.0.dist-info/licenses/LICENSE +190 -0
evaldeck/__init__.py
ADDED
@@ -0,0 +1,88 @@
+"""Evaldeck - The evaluation framework for AI agents.
+
+Evaldeck helps you answer one question: "Is my agent actually working?"
+
+Basic usage:
+    from evaldeck import Trace, Step, Evaluator, EvalCase, ExpectedBehavior
+
+    # Create a trace (or capture with LangChain adapter)
+    trace = Trace(
+        input="Book a flight to NYC",
+        steps=[
+            Step.tool_call("search_flights", {"to": "NYC"}),
+            Step.tool_call("book_flight", {"flight_id": "123"}),
+        ],
+        output="Booked flight 123 to NYC",
+    )
+
+    # Define test case
+    test_case = EvalCase(
+        name="book_flight",
+        input="Book a flight to NYC",
+        expected=ExpectedBehavior(
+            tools_called=["search_flights", "book_flight"],
+            output_contains=["booked"],
+        ),
+    )
+
+    # Evaluate
+    evaluator = Evaluator()
+    result = evaluator.evaluate(trace, test_case)
+    print(f"Passed: {result.passed}")
+"""
+
+from evaldeck.config import EvaldeckConfig
+from evaldeck.evaluator import EvaluationRunner, Evaluator
+from evaldeck.results import (
+    EvaluationResult,
+    GradeResult,
+    GradeStatus,
+    MetricResult,
+    RunResult,
+    SuiteResult,
+)
+from evaldeck.test_case import (
+    EvalCase,
+    EvalSuite,
+    ExpectedBehavior,
+    GraderConfig,
+)
+from evaldeck.trace import (
+    Step,
+    StepStatus,
+    StepType,
+    TokenUsage,
+    Trace,
+    TraceStatus,
+)
+
+__version__ = "0.1.0"
+
+__all__ = [
+    # Version
+    "__version__",
+    # Trace
+    "Trace",
+    "Step",
+    "StepType",
+    "StepStatus",
+    "TraceStatus",
+    "TokenUsage",
+    # Test Case
+    "EvalCase",
+    "EvalSuite",
+    "ExpectedBehavior",
+    "GraderConfig",
+    # Results
+    "GradeResult",
+    "GradeStatus",
+    "MetricResult",
+    "EvaluationResult",
+    "SuiteResult",
+    "RunResult",
+    # Evaluator
+    "Evaluator",
+    "EvaluationRunner",
+    # Config
+    "EvaldeckConfig",
+]
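For orientation, the sketch below (not part of the package) shows how the exported API can be used to inspect a failing evaluation programmatically. It relies only on the attributes that cli.py uses further down (passed, failed_grades, grader_name, message), so treat the exact result shape as an assumption.

# Illustrative sketch, not shipped with the package: evaluate a trace and list failed grades.
# Assumes the EvaluationResult/GradeResult attributes that evaldeck/cli.py relies on.
from evaldeck import EvalCase, Evaluator, ExpectedBehavior, Step, Trace

trace = Trace(
    input="Book a flight to NYC",
    steps=[Step.tool_call("search_flights", {"to": "NYC"})],
    output="No flights found",
)
case = EvalCase(
    name="book_flight",
    input="Book a flight to NYC",
    expected=ExpectedBehavior(tools_called=["search_flights", "book_flight"]),
)

result = Evaluator().evaluate(trace, case)
if not result.passed:
    for grade in result.failed_grades:
        print(f"{grade.grader_name}: {grade.message}")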
evaldeck/cli.py
ADDED
@@ -0,0 +1,324 @@
+"""Command-line interface for evaldeck."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import click
+from rich import box
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+from evaldeck.config import EvaldeckConfig, generate_default_config, generate_example_test
+from evaldeck.results import EvaluationResult, GradeStatus, RunResult
+
+console = Console()
+
+
+@click.group()
+@click.version_option()
+def main() -> None:
+    """Evaldeck - The evaluation framework for AI agents."""
+    pass
+
+
+@main.command()
+@click.option("--force", "-f", is_flag=True, help="Overwrite existing files")
+def init(force: bool) -> None:
+    """Initialize a new evaldeck project."""
+    config_path = Path("evaldeck.yaml")
+    test_dir = Path("tests/evals")
+    example_test = test_dir / "example.yaml"
+
+    # Check for existing files
+    if config_path.exists() and not force:
+        console.print(f"[yellow]Config file already exists: {config_path}[/yellow]")
+        console.print("Use --force to overwrite")
+        return
+
+    # Create config
+    config_path.write_text(generate_default_config())
+    console.print(f"[green]Created:[/green] {config_path}")
+
+    # Create test directory
+    test_dir.mkdir(parents=True, exist_ok=True)
+    console.print(f"[green]Created:[/green] {test_dir}/")
+
+    # Create example test
+    if not example_test.exists() or force:
+        example_test.write_text(generate_example_test())
+        console.print(f"[green]Created:[/green] {example_test}")
+
+    # Create output directory
+    output_dir = Path(".evaldeck")
+    output_dir.mkdir(exist_ok=True)
+
+    console.print()
+    console.print(
+        Panel(
+            "[bold]Project initialized![/bold]\n\n"
+            "Next steps:\n"
+            "1. Edit [cyan]evaldeck.yaml[/cyan] to configure your agent\n"
+            "2. Add test cases to [cyan]tests/evals/[/cyan]\n"
+            "3. Run [cyan]evaldeck run[/cyan] to evaluate",
+            title="Evaldeck",
+            border_style="green",
+        )
+    )
+
+
+@main.command()
+@click.option("--config", "-c", type=click.Path(exists=True), help="Config file path")
+@click.option("--suite", "-s", multiple=True, help="Run specific suite(s)")
+@click.option("--tag", "-t", multiple=True, help="Filter by tag(s)")
+@click.option("--output", "-o", type=click.Choice(["text", "json", "junit"]), default="text")
+@click.option("--output-file", type=click.Path(), help="Output file path")
+@click.option("--verbose", "-v", is_flag=True, help="Verbose output")
+@click.option(
+    "--workers",
+    "-w",
+    type=int,
+    default=None,
+    help="Max concurrent tests (0=unlimited, 1=sequential). Default: from config or 0",
+)
+def run(
+    config: str | None,
+    suite: tuple[str, ...],
+    tag: tuple[str, ...],
+    output: str,
+    output_file: str | None,
+    verbose: bool,
+    workers: int | None,
+) -> None:
+    """Run evaluations."""
+    try:
+        # Load config
+        cfg = EvaldeckConfig.load(config)
+    except FileNotFoundError:
+        console.print("[red]No evaldeck.yaml found. Run 'evaldeck init' first.[/red]")
+        sys.exit(1)
+
+    console.print("[bold]Evaldeck[/bold] - Running evaluations...\n")
+
+    # Discover test suites
+    from evaldeck.evaluator import EvaluationRunner
+
+    runner = EvaluationRunner(config=cfg)
+
+    try:
+        suites = runner._discover_suites()
+    except Exception as e:
+        console.print(f"[red]Error discovering test suites: {e}[/red]")
+        sys.exit(1)
+
+    if not suites:
+        console.print("[yellow]No test suites found.[/yellow]")
+        console.print(f"Add test cases to: {cfg.test_dir}/")
+        sys.exit(0)
+
+    # Filter suites if specified
+    if suite:
+        suites = [s for s in suites if s.name in suite]
+
+    # Count total tests
+    total_tests = sum(len(s.test_cases) for s in suites)
+    console.print(
+        f"Found [cyan]{total_tests}[/cyan] test(s) in [cyan]{len(suites)}[/cyan] suite(s)\n"
+    )
+
+    # Check if agent is configured
+    if not cfg.agent.module or not cfg.agent.function:
+        console.print("[yellow]No agent configured in evaldeck.yaml[/yellow]")
+        console.print("Running in dry-run mode (no agent execution)\n")
+
+        # Show what would be run
+        for s in suites:
+            console.print(f"Suite: [bold]{s.name}[/bold]")
+            for tc in s.test_cases:
+                console.print(f"  - {tc.name}")
+        sys.exit(0)
+
+    # Run evaluations
+    def on_result(result: EvaluationResult) -> None:
+        """Print result as it completes."""
+        if result.passed:
+            icon = "[green]✓[/green]"
+        elif result.status == GradeStatus.ERROR:
+            icon = "[red]![/red]"
+        else:
+            icon = "[red]✗[/red]"
+
+        duration = f"({result.duration_ms:.1f}ms)" if result.duration_ms else ""
+        console.print(f"  {icon} {result.test_case_name} {duration}")
+
+        if verbose and not result.passed:
+            for grade in result.failed_grades:
+                console.print(f"    [dim]└─ {grade.grader_name}: {grade.message}[/dim]")
+
+    # Show concurrency info
+    effective_workers = workers if workers is not None else cfg.execution.workers
+    if effective_workers == 0:
+        console.print("[dim]Running with unlimited concurrency[/dim]\n")
+    elif effective_workers == 1:
+        console.print("[dim]Running sequentially[/dim]\n")
+    else:
+        console.print(f"[dim]Running with max {effective_workers} concurrent tests[/dim]\n")
+
+    try:
+        run_result = runner.run(
+            suites=suites,
+            tags=list(tag) if tag else None,
+            on_result=on_result,
+            max_concurrent=workers,
+        )
+    except ValueError as e:
+        console.print(f"[red]Error: {e}[/red]")
+        sys.exit(1)
+    except Exception as e:
+        console.print(f"[red]Evaluation error: {e}[/red]")
+        if verbose:
+            import traceback
+
+            console.print(traceback.format_exc())
+        sys.exit(1)
+
+    # Print summary
+    console.print()
+    _print_summary(run_result)
+
+    # Output to file if requested
+    if output_file:
+        _write_output(run_result, output, output_file)
+
+    # Exit with appropriate code
+    if cfg.thresholds.min_pass_rate > 0:
+        if run_result.pass_rate < cfg.thresholds.min_pass_rate:
+            console.print(
+                f"\n[red]Pass rate {run_result.pass_rate:.1%} < "
+                f"threshold {cfg.thresholds.min_pass_rate:.1%}[/red]"
+            )
+            sys.exit(1)
+
+    sys.exit(0 if run_result.all_passed else 1)
+
+
+def _print_summary(result: RunResult) -> None:
+    """Print evaluation summary."""
+    table = Table(box=box.SIMPLE)
+    table.add_column("Metric", style="bold")
+    table.add_column("Value", justify="right")
+
+    table.add_row("Total", str(result.total))
+    table.add_row("Passed", f"[green]{result.passed}[/green]")
+    table.add_row("Failed", f"[red]{result.failed}[/red]" if result.failed else "0")
+    table.add_row("Pass Rate", f"{result.pass_rate:.1%}")
+
+    console.print(Panel(table, title="Results", border_style="blue"))
+
+
+def _write_output(result: RunResult, format: str, path: str) -> None:
+    """Write results to file."""
+    if format == "json":
+        import json
+
+        with open(path, "w") as f:
+            json.dump(result.model_dump(mode="json"), f, indent=2, default=str)
+    elif format == "junit":
+        _write_junit(result, path)
+
+    console.print(f"[dim]Output written to: {path}[/dim]")
+
+
+def _write_junit(result: RunResult, path: str) -> None:
+    """Write results in JUnit XML format."""
+    import xml.etree.ElementTree as ET
+
+    testsuites = ET.Element("testsuites")
+    testsuites.set("tests", str(result.total))
+    testsuites.set("failures", str(result.failed))
+
+    for suite_result in result.suites:
+        testsuite = ET.SubElement(testsuites, "testsuite")
+        testsuite.set("name", suite_result.suite_name)
+        testsuite.set("tests", str(suite_result.total))
+        testsuite.set("failures", str(suite_result.failed))
+        testsuite.set("errors", str(suite_result.errors))
+
+        for eval_result in suite_result.results:
+            testcase = ET.SubElement(testsuite, "testcase")
+            testcase.set("name", eval_result.test_case_name)
+            testcase.set("time", str((eval_result.duration_ms or 0) / 1000))
+
+            if eval_result.status == GradeStatus.FAIL:
+                failure = ET.SubElement(testcase, "failure")
+                messages = [g.message for g in eval_result.failed_grades if g.message]
+                failure.set("message", "; ".join(messages) or "Test failed")
+
+            elif eval_result.status == GradeStatus.ERROR:
+                error = ET.SubElement(testcase, "error")
+                error.set("message", eval_result.error or "Unknown error")
+
+    tree = ET.ElementTree(testsuites)
+    ET.indent(tree, space="  ")
+    tree.write(path, encoding="unicode", xml_declaration=True)
+
+
+@main.command()
+@click.argument("name")
+@click.option("--input", "-i", "input_text", required=True, help="Test input")
+@click.option("--output-contains", "-c", multiple=True, help="Expected output contains")
+@click.option("--tools", "-t", multiple=True, help="Expected tool calls")
+def create(
+    name: str,
+    input_text: str,
+    output_contains: tuple[str, ...],
+    tools: tuple[str, ...],
+) -> None:
+    """Create a new test case."""
+    from evaldeck.test_case import EvalCase, ExpectedBehavior
+
+    expected = ExpectedBehavior(
+        output_contains=list(output_contains) if output_contains else None,
+        tools_called=list(tools) if tools else None,
+    )
+
+    test_case = EvalCase(
+        name=name,
+        input=input_text,
+        expected=expected,
+    )
+
+    console.print(test_case.to_yaml())
+
+
+@main.command()
+def validate() -> None:
+    """Validate configuration and test cases."""
+    try:
+        cfg = EvaldeckConfig.load()
+        console.print("[green]✓[/green] Config file is valid")
+    except FileNotFoundError:
+        console.print("[red]✗[/red] No config file found")
+        sys.exit(1)
+    except Exception as e:
+        console.print(f"[red]✗[/red] Config error: {e}")
+        sys.exit(1)
+
+    # Validate test cases
+    from evaldeck.evaluator import EvaluationRunner
+
+    runner = EvaluationRunner(config=cfg)
+
+    try:
+        suites = runner._discover_suites()
+        total_tests = sum(len(s.test_cases) for s in suites)
+        console.print(f"[green]✓[/green] Found {total_tests} valid test case(s)")
+    except Exception as e:
+        console.print(f"[red]✗[/red] Test case error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
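For reference, the command group above can be exercised in-process with Click's standard test runner; the sketch below is illustrative and not part of the package.

# Illustrative sketch, not shipped with the package: drive the CLI via click.testing.
from click.testing import CliRunner

from evaldeck.cli import main

runner = CliRunner()
with runner.isolated_filesystem():
    # Scaffold a project, then run in dry-run mode (no agent configured yet).
    init_result = runner.invoke(main, ["init"])
    assert init_result.exit_code == 0

    run_result = runner.invoke(main, ["run", "--verbose"])
    print(run_result.output)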
evaldeck/config.py
ADDED
@@ -0,0 +1,223 @@
+"""Configuration loading and management."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+from pydantic import BaseModel, Field
+
+
+class AgentConfig(BaseModel):
+    """Configuration for the agent to test."""
+
+    module: str | None = None
+    function: str | None = None
+    class_name: str | None = None
+
+
+class GraderDefaults(BaseModel):
+    """Default configuration for graders."""
+
+    llm_model: str = "gpt-4o-mini"
+    llm_provider: str | None = None
+    timeout: float = 30.0
+
+
+class ThresholdConfig(BaseModel):
+    """Threshold configuration for pass/fail."""
+
+    min_pass_rate: float = 0.0
+    max_failures: int | None = None
+
+
+class SuiteConfig(BaseModel):
+    """Configuration for a test suite."""
+
+    name: str
+    path: str
+    tags: list[str] = Field(default_factory=list)
+
+
+class ExecutionConfig(BaseModel):
+    """Configuration for test execution."""
+
+    workers: int = Field(
+        default=0,
+        ge=0,
+        description="Number of concurrent workers. 0 = unlimited (default).",
+    )
+    timeout: float = Field(default=30.0, gt=0)
+    retries: int = Field(default=0, ge=0)
+
+
+class EvaldeckConfig(BaseModel):
+    """Main evaldeck configuration."""
+
+    version: int = 1
+
+    # Agent configuration
+    agent: AgentConfig = Field(default_factory=AgentConfig)
+
+    # Test configuration
+    test_dir: str = "tests/evals"
+    suites: list[SuiteConfig] = Field(default_factory=list)
+
+    # Execution configuration
+    execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
+
+    # Legacy execution defaults (deprecated, use execution instead)
+    defaults: dict[str, Any] = Field(default_factory=lambda: {
+        "timeout": 30,
+        "retries": 0,
+    })
+
+    # Grader configuration
+    graders: GraderDefaults = Field(default_factory=GraderDefaults)
+
+    # Thresholds
+    thresholds: ThresholdConfig = Field(default_factory=ThresholdConfig)
+
+    # Output configuration
+    output_dir: str = ".evaldeck"
+
+    @classmethod
+    def load(cls, path: str | Path | None = None) -> EvaldeckConfig:
+        """Load configuration from file.
+
+        Searches for evaldeck.yaml, evaldeck.yml in order.
+        """
+        if path:
+            path = Path(path)
+            if not path.exists():
+                raise FileNotFoundError(f"Config file not found: {path}")
+            return cls._load_file(path)
+
+        # Search for config file
+        for name in ["evaldeck.yaml", "evaldeck.yml"]:
+            p = Path(name)
+            if p.exists():
+                return cls._load_file(p)
+
+        # Return default config
+        return cls()
+
+    @classmethod
+    def _load_file(cls, path: Path) -> EvaldeckConfig:
+        """Load configuration from a specific file."""
+        with open(path) as f:
+            data = yaml.safe_load(f) or {}
+
+        # Handle nested objects
+        if "agent" in data and isinstance(data["agent"], dict):
+            data["agent"] = AgentConfig(**data["agent"])
+
+        if "graders" in data and isinstance(data["graders"], dict):
+            # Handle 'llm' sub-key
+            if "llm" in data["graders"]:
+                llm_config = data["graders"]["llm"]
+                data["graders"] = GraderDefaults(
+                    llm_model=llm_config.get("model", "gpt-4o-mini"),
+                    llm_provider=llm_config.get("provider"),
+                )
+            else:
+                data["graders"] = GraderDefaults(**data["graders"])
+
+        if "thresholds" in data and isinstance(data["thresholds"], dict):
+            data["thresholds"] = ThresholdConfig(**data["thresholds"])
+
+        if "execution" in data and isinstance(data["execution"], dict):
+            data["execution"] = ExecutionConfig(**data["execution"])
+
+        if "suites" in data:
+            suites = []
+            for s in data["suites"]:
+                if isinstance(s, dict):
+                    suites.append(SuiteConfig(**s))
+                else:
+                    suites.append(s)
+            data["suites"] = suites
+
+        return cls(**data)
+
+    def save(self, path: str | Path) -> None:
+        """Save configuration to file."""
+        with open(path, "w") as f:
+            yaml.dump(self.model_dump(exclude_none=True), f, default_flow_style=False)
+
+
+def generate_default_config() -> str:
+    """Generate default configuration YAML."""
+    return """# Evaldeck Configuration
+version: 1
+
+# Agent configuration (optional - can also be specified per test)
+# agent:
+#   module: my_agent
+#   function: run_agent
+
+# Test directory
+test_dir: tests/evals
+
+# Test suites (optional - auto-discovers from test_dir if not specified)
+# suites:
+#   - name: core
+#     path: tests/evals/core
+#   - name: safety
+#     path: tests/evals/safety
+
+# Execution configuration
+execution:
+  workers: 0  # 0 = unlimited concurrent (default)
+  timeout: 30
+  retries: 0
+
+# Grader configuration
+graders:
+  llm:
+    model: gpt-4o-mini
+    # API key from OPENAI_API_KEY environment variable
+
+# Pass/fail thresholds
+thresholds:
+  min_pass_rate: 0.0
+  # max_failures: 5
+
+# Output directory for traces and results
+output_dir: .evaldeck
+"""
+
+
+def generate_example_test() -> str:
+    """Generate example test case YAML."""
+    return """# Example test case
+name: example_test
+description: An example test case to get you started
+
+# Input to send to the agent
+input: "Hello, can you help me with a simple task?"
+
+# Expected behavior
+expected:
+  # Tools that must be called (if any)
+  # tools_called:
+  #   - search
+  #   - calculate
+
+  # Output must contain these strings
+  output_contains:
+    - "help"
+
+  # Maximum steps allowed
+  max_steps: 10
+
+  # Task must complete successfully
+  task_completed: true
+
+  # Optional: Custom graders
+  # graders:
+  #   - type: llm
+  #     prompt: "Did the agent respond helpfully? Answer PASS or FAIL."
+  #     model: gpt-4o-mini
+"""
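To show how the configuration pieces fit together, here is an illustrative round-trip (not part of the package): write the generated default config to disk, load it with EvaldeckConfig.load(), and read back fields defined by the models above.

# Illustrative sketch, not shipped with the package: round-trip the default config.
from pathlib import Path

from evaldeck.config import EvaldeckConfig, generate_default_config

Path("evaldeck.yaml").write_text(generate_default_config())

cfg = EvaldeckConfig.load()      # picks up evaldeck.yaml from the working directory
print(cfg.test_dir)              # tests/evals
print(cfg.execution.workers)     # 0 (unlimited concurrency)
print(cfg.graders.llm_model)     # gpt-4o-mini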