evaldeck-0.1.0-py3-none-any.whl

This diff shows the contents of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.
evaldeck/__init__.py ADDED
@@ -0,0 +1,88 @@
+ """Evaldeck - The evaluation framework for AI agents.
+
+ Evaldeck helps you answer one question: "Is my agent actually working?"
+
+ Basic usage:
+     from evaldeck import Trace, Step, Evaluator, EvalCase, ExpectedBehavior
+
+     # Create a trace (or capture one with the LangChain adapter)
+     trace = Trace(
+         input="Book a flight to NYC",
+         steps=[
+             Step.tool_call("search_flights", {"to": "NYC"}),
+             Step.tool_call("book_flight", {"flight_id": "123"}),
+         ],
+         output="Booked flight 123 to NYC",
+     )
+
+     # Define a test case
+     test_case = EvalCase(
+         name="book_flight",
+         input="Book a flight to NYC",
+         expected=ExpectedBehavior(
+             tools_called=["search_flights", "book_flight"],
+             output_contains=["booked"],
+         ),
+     )
+
+     # Evaluate
+     evaluator = Evaluator()
+     result = evaluator.evaluate(trace, test_case)
+     print(f"Passed: {result.passed}")
+ """
+
+ from evaldeck.config import EvaldeckConfig
+ from evaldeck.evaluator import EvaluationRunner, Evaluator
+ from evaldeck.results import (
+     EvaluationResult,
+     GradeResult,
+     GradeStatus,
+     MetricResult,
+     RunResult,
+     SuiteResult,
+ )
+ from evaldeck.test_case import (
+     EvalCase,
+     EvalSuite,
+     ExpectedBehavior,
+     GraderConfig,
+ )
+ from evaldeck.trace import (
+     Step,
+     StepStatus,
+     StepType,
+     TokenUsage,
+     Trace,
+     TraceStatus,
+ )
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     # Version
+     "__version__",
+     # Trace
+     "Trace",
+     "Step",
+     "StepType",
+     "StepStatus",
+     "TraceStatus",
+     "TokenUsage",
+     # Test Case
+     "EvalCase",
+     "EvalSuite",
+     "ExpectedBehavior",
+     "GraderConfig",
+     # Results
+     "GradeResult",
+     "GradeStatus",
+     "MetricResult",
+     "EvaluationResult",
+     "SuiteResult",
+     "RunResult",
+     # Evaluator
+     "Evaluator",
+     "EvaluationRunner",
+     # Config
+     "EvaldeckConfig",
+ ]
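For quick reference, here is the docstring's flow as a standalone sketch. It uses only the names re-exported above plus result attributes the bundled CLI itself reads (`passed`, `failed_grades`, `grader_name`, `message`); grader behavior beyond that is not visible in this diff.

```python
from evaldeck import EvalCase, Evaluator, ExpectedBehavior, Step, Trace

# A trace from one agent run (built by hand here; adapters can capture it).
trace = Trace(
    input="Book a flight to NYC",
    steps=[
        Step.tool_call("search_flights", {"to": "NYC"}),
        Step.tool_call("book_flight", {"flight_id": "123"}),
    ],
    output="Booked flight 123 to NYC",
)

# What a passing run should look like.
case = EvalCase(
    name="book_flight",
    input="Book a flight to NYC",
    expected=ExpectedBehavior(
        tools_called=["search_flights", "book_flight"],
        output_contains=["booked"],
    ),
)

result = Evaluator().evaluate(trace, case)
print("passed:", result.passed)

# On failure, each failed grade carries a grader name and message,
# mirroring what `evaldeck run --verbose` prints.
if not result.passed:
    for grade in result.failed_grades:
        print(f"  {grade.grader_name}: {grade.message}")
```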
evaldeck/cli.py ADDED
@@ -0,0 +1,324 @@
+ """Command-line interface for evaldeck."""
+
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path
+
+ import click
+ from rich import box
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+
+ from evaldeck.config import EvaldeckConfig, generate_default_config, generate_example_test
+ from evaldeck.results import EvaluationResult, GradeStatus, RunResult
+
+ console = Console()
+
+
+ @click.group()
+ @click.version_option()
+ def main() -> None:
+     """Evaldeck - The evaluation framework for AI agents."""
+     pass
+
+
+ @main.command()
+ @click.option("--force", "-f", is_flag=True, help="Overwrite existing files")
+ def init(force: bool) -> None:
+     """Initialize a new evaldeck project."""
+     config_path = Path("evaldeck.yaml")
+     test_dir = Path("tests/evals")
+     example_test = test_dir / "example.yaml"
+
+     # Check for existing files
+     if config_path.exists() and not force:
+         console.print(f"[yellow]Config file already exists: {config_path}[/yellow]")
+         console.print("Use --force to overwrite")
+         return
+
+     # Create config
+     config_path.write_text(generate_default_config())
+     console.print(f"[green]Created:[/green] {config_path}")
+
+     # Create test directory
+     test_dir.mkdir(parents=True, exist_ok=True)
+     console.print(f"[green]Created:[/green] {test_dir}/")
+
+     # Create example test
+     if not example_test.exists() or force:
+         example_test.write_text(generate_example_test())
+         console.print(f"[green]Created:[/green] {example_test}")
+
+     # Create output directory
+     output_dir = Path(".evaldeck")
+     output_dir.mkdir(exist_ok=True)
+
+     console.print()
+     console.print(
+         Panel(
+             "[bold]Project initialized![/bold]\n\n"
+             "Next steps:\n"
+             "1. Edit [cyan]evaldeck.yaml[/cyan] to configure your agent\n"
+             "2. Add test cases to [cyan]tests/evals/[/cyan]\n"
+             "3. Run [cyan]evaldeck run[/cyan] to evaluate",
+             title="Evaldeck",
+             border_style="green",
+         )
+     )
+
+
+ @main.command()
+ @click.option("--config", "-c", type=click.Path(exists=True), help="Config file path")
+ @click.option("--suite", "-s", multiple=True, help="Run specific suite(s)")
+ @click.option("--tag", "-t", multiple=True, help="Filter by tag(s)")
+ @click.option("--output", "-o", type=click.Choice(["text", "json", "junit"]), default="text")
+ @click.option("--output-file", type=click.Path(), help="Output file path")
+ @click.option("--verbose", "-v", is_flag=True, help="Verbose output")
+ @click.option(
+     "--workers",
+     "-w",
+     type=int,
+     default=None,
+     help="Max concurrent tests (0=unlimited, 1=sequential). Default: from config or 0",
+ )
+ def run(
+     config: str | None,
+     suite: tuple[str, ...],
+     tag: tuple[str, ...],
+     output: str,
+     output_file: str | None,
+     verbose: bool,
+     workers: int | None,
+ ) -> None:
+     """Run evaluations."""
+     try:
+         # Load config
+         cfg = EvaldeckConfig.load(config)
+     except FileNotFoundError:
+         console.print("[red]No evaldeck.yaml found. Run 'evaldeck init' first.[/red]")
+         sys.exit(1)
+
+     console.print("[bold]Evaldeck[/bold] - Running evaluations...\n")
+
+     # Discover test suites
+     from evaldeck.evaluator import EvaluationRunner
+
+     runner = EvaluationRunner(config=cfg)
+
+     try:
+         suites = runner._discover_suites()
+     except Exception as e:
+         console.print(f"[red]Error discovering test suites: {e}[/red]")
+         sys.exit(1)
+
+     if not suites:
+         console.print("[yellow]No test suites found.[/yellow]")
+         console.print(f"Add test cases to: {cfg.test_dir}/")
+         sys.exit(0)
+
+     # Filter suites if specified
+     if suite:
+         suites = [s for s in suites if s.name in suite]
+
+     # Count total tests
+     total_tests = sum(len(s.test_cases) for s in suites)
+     console.print(
+         f"Found [cyan]{total_tests}[/cyan] test(s) in [cyan]{len(suites)}[/cyan] suite(s)\n"
+     )
+
+     # Check if agent is configured
+     if not cfg.agent.module or not cfg.agent.function:
+         console.print("[yellow]No agent configured in evaldeck.yaml[/yellow]")
+         console.print("Running in dry-run mode (no agent execution)\n")
+
+         # Show what would be run
+         for s in suites:
+             console.print(f"Suite: [bold]{s.name}[/bold]")
+             for tc in s.test_cases:
+                 console.print(f" - {tc.name}")
+         sys.exit(0)
+
+     # Run evaluations
+     def on_result(result: EvaluationResult) -> None:
+         """Print result as it completes."""
+         if result.passed:
+             icon = "[green]✓[/green]"
+         elif result.status == GradeStatus.ERROR:
+             icon = "[red]![/red]"
+         else:
+             icon = "[red]✗[/red]"
+
+         duration = f"({result.duration_ms:.1f}ms)" if result.duration_ms else ""
+         console.print(f" {icon} {result.test_case_name} {duration}")
+
+         if verbose and not result.passed:
+             for grade in result.failed_grades:
+                 console.print(f" [dim]└─ {grade.grader_name}: {grade.message}[/dim]")
+
+     # Show concurrency info
+     effective_workers = workers if workers is not None else cfg.execution.workers
+     if effective_workers == 0:
+         console.print("[dim]Running with unlimited concurrency[/dim]\n")
+     elif effective_workers == 1:
+         console.print("[dim]Running sequentially[/dim]\n")
+     else:
+         console.print(f"[dim]Running with max {effective_workers} concurrent tests[/dim]\n")
+
+     try:
+         run_result = runner.run(
+             suites=suites,
+             tags=list(tag) if tag else None,
+             on_result=on_result,
+             max_concurrent=workers,
+         )
+     except ValueError as e:
+         console.print(f"[red]Error: {e}[/red]")
+         sys.exit(1)
+     except Exception as e:
+         console.print(f"[red]Evaluation error: {e}[/red]")
+         if verbose:
+             import traceback
+
+             console.print(traceback.format_exc())
+         sys.exit(1)
+
+     # Print summary
+     console.print()
+     _print_summary(run_result)
+
+     # Output to file if requested
+     if output_file:
+         _write_output(run_result, output, output_file)
+
+     # Exit with appropriate code
+     if cfg.thresholds.min_pass_rate > 0:
+         if run_result.pass_rate < cfg.thresholds.min_pass_rate:
+             console.print(
+                 f"\n[red]Pass rate {run_result.pass_rate:.1%} < "
+                 f"threshold {cfg.thresholds.min_pass_rate:.1%}[/red]"
+             )
+             sys.exit(1)
+
+     sys.exit(0 if run_result.all_passed else 1)
+
+
+ def _print_summary(result: RunResult) -> None:
+     """Print evaluation summary."""
+     table = Table(box=box.SIMPLE)
+     table.add_column("Metric", style="bold")
+     table.add_column("Value", justify="right")
+
+     table.add_row("Total", str(result.total))
+     table.add_row("Passed", f"[green]{result.passed}[/green]")
+     table.add_row("Failed", f"[red]{result.failed}[/red]" if result.failed else "0")
+     table.add_row("Pass Rate", f"{result.pass_rate:.1%}")
+
+     console.print(Panel(table, title="Results", border_style="blue"))
+
+
+ def _write_output(result: RunResult, format: str, path: str) -> None:
+     """Write results to file."""
+     if format == "json":
+         import json
+
+         with open(path, "w") as f:
+             json.dump(result.model_dump(mode="json"), f, indent=2, default=str)
+     elif format == "junit":
+         _write_junit(result, path)
+     else:
+         # "text" has no file writer, so don't claim a file was written.
+         console.print(f"[yellow]Output format '{format}' cannot be written to a file[/yellow]")
+         return
+
+     console.print(f"[dim]Output written to: {path}[/dim]")
+
+
+ def _write_junit(result: RunResult, path: str) -> None:
+     """Write results in JUnit XML format."""
+     import xml.etree.ElementTree as ET
+
+     testsuites = ET.Element("testsuites")
+     testsuites.set("tests", str(result.total))
+     testsuites.set("failures", str(result.failed))
+
+     for suite_result in result.suites:
+         testsuite = ET.SubElement(testsuites, "testsuite")
+         testsuite.set("name", suite_result.suite_name)
+         testsuite.set("tests", str(suite_result.total))
+         testsuite.set("failures", str(suite_result.failed))
+         testsuite.set("errors", str(suite_result.errors))
+
+         for eval_result in suite_result.results:
+             testcase = ET.SubElement(testsuite, "testcase")
+             testcase.set("name", eval_result.test_case_name)
+             testcase.set("time", str((eval_result.duration_ms or 0) / 1000))
+
+             if eval_result.status == GradeStatus.FAIL:
+                 failure = ET.SubElement(testcase, "failure")
+                 messages = [g.message for g in eval_result.failed_grades if g.message]
+                 failure.set("message", "; ".join(messages) or "Test failed")
+
+             elif eval_result.status == GradeStatus.ERROR:
+                 error = ET.SubElement(testcase, "error")
+                 error.set("message", eval_result.error or "Unknown error")
+
+     tree = ET.ElementTree(testsuites)
+     ET.indent(tree, space=" ")
+     tree.write(path, encoding="unicode", xml_declaration=True)
+
+
+ @main.command()
+ @click.argument("name")
+ @click.option("--input", "-i", "input_text", required=True, help="Test input")
+ @click.option("--output-contains", "-c", multiple=True, help="Substring the output must contain (repeatable)")
+ @click.option("--tools", "-t", multiple=True, help="Tool the agent is expected to call (repeatable)")
+ def create(
+     name: str,
+     input_text: str,
+     output_contains: tuple[str, ...],
+     tools: tuple[str, ...],
+ ) -> None:
+     """Create a new test case and print it as YAML."""
+     from evaldeck.test_case import EvalCase, ExpectedBehavior
+
+     expected = ExpectedBehavior(
+         output_contains=list(output_contains) if output_contains else None,
+         tools_called=list(tools) if tools else None,
+     )
+
+     test_case = EvalCase(
+         name=name,
+         input=input_text,
+         expected=expected,
+     )
+
+     console.print(test_case.to_yaml())
+
+
+ @main.command()
+ def validate() -> None:
+     """Validate configuration and test cases."""
+     try:
+         cfg = EvaldeckConfig.load()
+         console.print("[green]✓[/green] Config file is valid")
+     except FileNotFoundError:
+         console.print("[red]✗[/red] No config file found")
+         sys.exit(1)
+     except Exception as e:
+         console.print(f"[red]✗[/red] Config error: {e}")
+         sys.exit(1)
+
+     # Validate test cases
+     from evaldeck.evaluator import EvaluationRunner
+
+     runner = EvaluationRunner(config=cfg)
+
+     try:
+         suites = runner._discover_suites()
+         total_tests = sum(len(s.test_cases) for s in suites)
+         console.print(f"[green]✓[/green] Found {total_tests} valid test case(s)")
+     except Exception as e:
+         console.print(f"[red]✗[/red] Test case error: {e}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
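The `run` command above is a thin wrapper around `EvaluationRunner`. The sketch below drives the same runner from Python; it mirrors `cli.run()` rather than a documented API, and suite discovery goes through the private `_discover_suites()` because no public discovery helper appears in this diff.

```python
from evaldeck.config import EvaldeckConfig
from evaldeck.evaluator import EvaluationRunner
from evaldeck.results import EvaluationResult

cfg = EvaldeckConfig.load()            # falls back to defaults if no evaldeck.yaml exists
runner = EvaluationRunner(config=cfg)

# Same discovery path the CLI uses (private, so subject to change).
suites = runner._discover_suites()

def on_result(result: EvaluationResult) -> None:
    # Called as each test case finishes, like the CLI's live output.
    print("PASS" if result.passed else "FAIL", result.test_case_name)

run_result = runner.run(
    suites=suites,
    tags=None,              # e.g. ["smoke"] to filter, as with --tag
    on_result=on_result,
    max_concurrent=4,       # same knob as --workers; None defers to config
)
print(f"{run_result.passed}/{run_result.total} passed ({run_result.pass_rate:.1%})")
```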
evaldeck/config.py ADDED
@@ -0,0 +1,223 @@
+ """Configuration loading and management."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+ from pydantic import BaseModel, Field
+
+
+ class AgentConfig(BaseModel):
+     """Configuration for the agent to test."""
+
+     module: str | None = None
+     function: str | None = None
+     class_name: str | None = None
+
+
+ class GraderDefaults(BaseModel):
+     """Default configuration for graders."""
+
+     llm_model: str = "gpt-4o-mini"
+     llm_provider: str | None = None
+     timeout: float = 30.0
+
+
+ class ThresholdConfig(BaseModel):
+     """Threshold configuration for pass/fail."""
+
+     min_pass_rate: float = 0.0
+     max_failures: int | None = None
+
+
+ class SuiteConfig(BaseModel):
+     """Configuration for a test suite."""
+
+     name: str
+     path: str
+     tags: list[str] = Field(default_factory=list)
+
+
+ class ExecutionConfig(BaseModel):
+     """Configuration for test execution."""
+
+     workers: int = Field(
+         default=0,
+         ge=0,
+         description="Number of concurrent workers. 0 = unlimited (default).",
+     )
+     timeout: float = Field(default=30.0, gt=0)
+     retries: int = Field(default=0, ge=0)
+
+
+ class EvaldeckConfig(BaseModel):
+     """Main evaldeck configuration."""
+
+     version: int = 1
+
+     # Agent configuration
+     agent: AgentConfig = Field(default_factory=AgentConfig)
+
+     # Test configuration
+     test_dir: str = "tests/evals"
+     suites: list[SuiteConfig] = Field(default_factory=list)
+
+     # Execution configuration
+     execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
+
+     # Legacy execution defaults (deprecated, use execution instead)
+     defaults: dict[str, Any] = Field(default_factory=lambda: {
+         "timeout": 30,
+         "retries": 0,
+     })
+
+     # Grader configuration
+     graders: GraderDefaults = Field(default_factory=GraderDefaults)
+
+     # Thresholds
+     thresholds: ThresholdConfig = Field(default_factory=ThresholdConfig)
+
+     # Output configuration
+     output_dir: str = ".evaldeck"
+
+     @classmethod
+     def load(cls, path: str | Path | None = None) -> EvaldeckConfig:
+         """Load configuration from file.
+
+         Searches for evaldeck.yaml, then evaldeck.yml; falls back to built-in defaults if neither exists.
+         """
+         if path:
+             path = Path(path)
+             if not path.exists():
+                 raise FileNotFoundError(f"Config file not found: {path}")
+             return cls._load_file(path)
+
+         # Search for config file
+         for name in ["evaldeck.yaml", "evaldeck.yml"]:
+             p = Path(name)
+             if p.exists():
+                 return cls._load_file(p)
+
+         # Return default config
+         return cls()
+
+     @classmethod
+     def _load_file(cls, path: Path) -> EvaldeckConfig:
+         """Load configuration from a specific file."""
+         with open(path) as f:
+             data = yaml.safe_load(f) or {}
+
+         # Handle nested objects
+         if "agent" in data and isinstance(data["agent"], dict):
+             data["agent"] = AgentConfig(**data["agent"])
+
+         if "graders" in data and isinstance(data["graders"], dict):
+             # Handle 'llm' sub-key
+             if "llm" in data["graders"]:
+                 llm_config = data["graders"]["llm"]
+                 data["graders"] = GraderDefaults(
+                     llm_model=llm_config.get("model", "gpt-4o-mini"),
+                     llm_provider=llm_config.get("provider"),
+                 )
+             else:
+                 data["graders"] = GraderDefaults(**data["graders"])
+
+         if "thresholds" in data and isinstance(data["thresholds"], dict):
+             data["thresholds"] = ThresholdConfig(**data["thresholds"])
+
+         if "execution" in data and isinstance(data["execution"], dict):
+             data["execution"] = ExecutionConfig(**data["execution"])
+
+         if "suites" in data:
+             suites = []
+             for s in data["suites"]:
+                 if isinstance(s, dict):
+                     suites.append(SuiteConfig(**s))
+                 else:
+                     suites.append(s)
+             data["suites"] = suites
+
+         return cls(**data)
+
+     def save(self, path: str | Path) -> None:
+         """Save configuration to file."""
+         with open(path, "w") as f:
+             yaml.dump(self.model_dump(exclude_none=True), f, default_flow_style=False)
+
+
+ def generate_default_config() -> str:
+     """Generate default configuration YAML."""
+     return """# Evaldeck Configuration
+ version: 1
+
+ # Agent configuration (optional - can also be specified per test)
+ # agent:
+ #   module: my_agent
+ #   function: run_agent
+
+ # Test directory
+ test_dir: tests/evals
+
+ # Test suites (optional - auto-discovers from test_dir if not specified)
+ # suites:
+ #   - name: core
+ #     path: tests/evals/core
+ #   - name: safety
+ #     path: tests/evals/safety
+
+ # Execution configuration
+ execution:
+   workers: 0  # 0 = unlimited concurrency (default)
+   timeout: 30
+   retries: 0
+
+ # Grader configuration
+ graders:
+   llm:
+     model: gpt-4o-mini
+     # API key from OPENAI_API_KEY environment variable
+
+ # Pass/fail thresholds
+ thresholds:
+   min_pass_rate: 0.0
+   # max_failures: 5
+
+ # Output directory for traces and results
+ output_dir: .evaldeck
+ """
+
+
+ def generate_example_test() -> str:
+     """Generate example test case YAML."""
+     return """# Example test case
+ name: example_test
+ description: An example test case to get you started
+
+ # Input to send to the agent
+ input: "Hello, can you help me with a simple task?"
+
+ # Expected behavior
+ expected:
+   # Tools that must be called (if any)
+   # tools_called:
+   #   - search
+   #   - calculate
+
+   # Output must contain these strings
+   output_contains:
+     - "help"
+
+   # Maximum steps allowed
+   max_steps: 10
+
+   # Task must complete successfully
+   task_completed: true
+
+ # Optional: Custom graders
+ # graders:
+ #   - type: llm
+ #     prompt: "Did the agent respond helpfully? Answer PASS or FAIL."
+ #     model: gpt-4o-mini
+ """