agenteval-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agenteval/cli.py ADDED
@@ -0,0 +1,93 @@
1
+ """CLI entry point for agenteval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pathlib
6
+ from typing import Annotated, Optional
7
+
8
+ import typer
9
+ from rich.console import Console
10
+
11
+ from agenteval.registry import TestRegistry
12
+ from agenteval.reporter import RichReporter
13
+ from agenteval.suite import run_suite
14
+
15
# Root Typer application; the commands defined below attach to it via
# @app.command. Invoking the program with no arguments prints the help text.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    help="Evaluation toolkit for LLM agents.",
    name="agenteval",
)
21
+
22
+
23
@app.command(name="run")
def run_cmd(
    paths: Annotated[
        list[str],
        typer.Argument(help="Test files or directories to discover (default: current dir)"),
    ] = [".", ],  # noqa: B006
    pattern: str = typer.Option("test_*.py", "--pattern", "-p", help="File glob pattern"),
    tags: Optional[list[str]] = typer.Option(None, "--tag", "-t", help="Only run tests with this tag (repeatable)"),
    # min=1: zero runs can never produce a meaningful pass rate.
    n: Optional[int] = typer.Option(None, "--n", min=1, help="Override number of runs per test"),
    # The range is enforced by the CLI parser so a typo such as "--threshold 80"
    # is rejected up front instead of silently failing every test.
    threshold: Optional[float] = typer.Option(
        None, "--threshold", min=0.0, max=1.0, help="Override pass rate threshold (0.0–1.0)"
    ),
    # min=1: a zero-permit semaphore would block every run forever.
    concurrency: int = typer.Option(4, "--concurrency", "-c", min=1, help="Max concurrent runs"),
    output: Optional[pathlib.Path] = typer.Option(None, "--output", "-o", help="Write JSON report to this file"),
    no_color: bool = typer.Option(False, "--no-color", help="Disable color output"),
    show_traces: bool = typer.Option(False, "--traces", help="Show per-trace details"),
    show_failures: bool = typer.Option(True, "--failures/--no-failures", help="Show failure reasons"),
) -> None:
    """Discover and run agenteval tests."""
    # NOTE: the docstring above is what Typer shows as the command's help text,
    # so implementation notes live in comments instead.
    #
    # Exit codes: 0 = every test met its threshold, 1 = at least one test
    # missed its threshold, 2 = the run itself (or writing the report) failed.
    from rich.markup import escape

    console = Console(no_color=no_color)
    reporter = RichReporter(console=console, show_traces=show_traces, show_failures=show_failures)

    # Reset registry so re-running the CLI in the same process doesn't double-count
    TestRegistry.reset()

    try:
        suite = run_suite(
            paths=paths,
            pattern=pattern,
            tags=tags or None,
            fail_under=threshold,
            n_override=n,
            concurrency=concurrency,
            reporter=reporter,
        )
    except Exception as e:
        # Escape the message: exception text may contain square brackets that
        # Rich would otherwise try to interpret as markup tags.
        console.print(f"[bold red]Error:[/bold red] {escape(str(e))}")
        raise typer.Exit(code=2) from e

    if output is not None:
        try:
            reporter.export_json(suite, output)
        except OSError as e:
            # An unwritable report path is reported like any other run error
            # rather than surfacing as a raw traceback.
            console.print(
                f"[bold red]Error:[/bold red] could not write report to "
                f"{escape(str(output))}: {escape(str(e))}"
            )
            raise typer.Exit(code=2) from e

    raise typer.Exit(code=0 if suite.all_passed else 1)
64
+
65
+
66
@app.command(name="report")
def report_cmd(
    json_file: Annotated[pathlib.Path, typer.Argument(help="JSON report file from a previous run")],
    show_traces: bool = typer.Option(False, "--traces", help="Show per-trace details"),
    no_color: bool = typer.Option(False, "--no-color"),
) -> None:
    """Pretty-print a saved JSON report."""
    # The docstring above doubles as the command's help text, so details are
    # kept in comments. Exit code 2 is used for a missing or unreadable report.
    import json as _json

    from agenteval.models import SuiteResult

    console = Console(no_color=no_color)

    if not json_file.exists():
        console.print(f"[bold red]File not found:[/bold red] {json_file}")
        raise typer.Exit(code=2)

    # Reading, JSON decoding and model validation share one failure path.
    try:
        raw_text = json_file.read_text(encoding="utf-8")
        suite = SuiteResult.model_validate(_json.loads(raw_text))
    except Exception as exc:
        console.print(f"[bold red]Failed to load report:[/bold red] {exc}")
        raise typer.Exit(code=2) from exc

    # Replay the report: one line per test, then the summary table.
    reporter = RichReporter(console=console, show_traces=show_traces)
    for test_result in suite.results:
        reporter.render_result(test_result)
    reporter.render_suite(suite)
@@ -0,0 +1,17 @@
1
+ """Custom exception hierarchy for agenteval."""
2
+
3
+
4
class AgentEvalError(Exception):
    """Root of the agenteval exception hierarchy.

    Catch this type to handle any error raised by agenteval itself.
    """
6
+
7
+
8
class AssertionFailure(AgentEvalError):
    """Signals that at least one assertion made against a trace did not hold."""
10
+
11
+
12
class DiscoveryError(AgentEvalError):
    """Signals a failure while discovering or importing test files."""
14
+
15
+
16
class TracerError(AgentEvalError):
    """Signals invalid tracer usage, e.g. reading a trace before its run has completed."""
agenteval/models.py ADDED
@@ -0,0 +1,123 @@
1
+ """Pydantic data models for agenteval traces and results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+
7
+ from pydantic import BaseModel, ConfigDict, computed_field, field_validator
8
+
9
+
10
class ToolCall(BaseModel):
    """Immutable record of one tool/function invocation made during an agent run."""

    # frozen=True: instances reject attribute assignment after construction.
    model_config = ConfigDict(frozen=True)

    # Name of the tool/function that was invoked.
    name: str
    # Keyword arguments the tool was called with.
    arguments: dict[str, Any]
    # Whatever the tool returned; None when nothing was recorded.
    result: Any = None
    # When the call was recorded (the clock used is not defined in this module).
    timestamp: float
    # How long the call took, in seconds.
    duration_seconds: float
    # Error description if the call failed, otherwise None.
    error: Optional[str] = None
21
+
22
+
23
class AgentTrace(BaseModel):
    """Everything recorded for a single agent run: input, output, tool calls and verdict."""

    # Identifier of this run.
    run_id: str
    # What the agent was given and what it produced.
    input: Any
    output: Any = None
    # Tool invocations recorded during the run.
    tool_calls: list[ToolCall] = []
    # Explicit step count; when None the number of tool calls is used instead.
    total_steps: Optional[int] = None
    duration_seconds: float = 0.0
    token_usage: Optional[dict[str, int]] = None
    # Run-level error description, if any.
    error: Optional[str] = None
    # Messages of failed assertions for this run.
    assertion_errors: list[str] = []
    # Overall verdict for this run.
    passed: bool = True
    # Free-form extra data attached to the run.
    metadata: dict[str, Any] = {}

    @computed_field  # type: ignore[misc]
    @property
    def effective_steps(self) -> int:
        """Step count: ``total_steps`` when provided, else the number of tool calls."""
        if self.total_steps is None:
            return len(self.tool_calls)
        return self.total_steps
43
+
44
+
45
class TestResult(BaseModel):
    """Outcome of executing one test function repeatedly, with derived statistics."""

    test_name: str
    # Number of runs requested and how many of them passed.
    n_runs: int
    n_passed: int
    # Fraction of passing runs; validated to lie in [0, 1].
    pass_rate: float
    # Minimum pass rate for the test to count as successful.
    threshold: float
    # One trace per executed run.
    traces: list[AgentTrace]
    tags: list[str] = []

    @field_validator("pass_rate")
    @classmethod
    def _validate_pass_rate(cls, v: float) -> float:
        """Reject pass rates outside the closed interval [0, 1]."""
        if 0.0 <= v <= 1.0:
            return v
        raise ValueError(f"pass_rate must be between 0 and 1, got {v}")

    @computed_field  # type: ignore[misc]
    @property
    def passed_traces(self) -> list[AgentTrace]:
        """Traces whose run passed, in their original order."""
        return list(filter(lambda trace: trace.passed, self.traces))

    @computed_field  # type: ignore[misc]
    @property
    def failed_traces(self) -> list[AgentTrace]:
        """Traces whose run did not pass, in their original order."""
        failures: list[AgentTrace] = []
        for trace in self.traces:
            if not trace.passed:
                failures.append(trace)
        return failures

    @computed_field  # type: ignore[misc]
    @property
    def avg_duration(self) -> float:
        """Mean run duration in seconds; 0.0 when there are no traces."""
        count = len(self.traces)
        if count == 0:
            return 0.0
        total = sum(trace.duration_seconds for trace in self.traces)
        return total / count

    @computed_field  # type: ignore[misc]
    @property
    def avg_steps(self) -> float:
        """Mean effective step count; 0.0 when there are no traces."""
        count = len(self.traces)
        if count == 0:
            return 0.0
        total = sum(trace.effective_steps for trace in self.traces)
        return total / count

    @computed_field  # type: ignore[misc]
    @property
    def met_threshold(self) -> bool:
        """True when the pass rate reaches the required threshold."""
        return self.threshold <= self.pass_rate
91
+
92
+
93
class SuiteResult(BaseModel):
    """Results of a whole suite run plus suite-level summary figures."""

    # Per-test results.
    results: list[TestResult]
    # Start and end markers of the suite run; their difference is the duration.
    start_time: float
    end_time: float

    @computed_field  # type: ignore[misc]
    @property
    def total_tests(self) -> int:
        """Number of tests in the suite."""
        return len(self.results)

    @computed_field  # type: ignore[misc]
    @property
    def passed_tests(self) -> int:
        """Number of tests that met their threshold."""
        return len([result for result in self.results if result.met_threshold])

    @computed_field  # type: ignore[misc]
    @property
    def failed_tests(self) -> int:
        """Number of tests that missed their threshold."""
        return self.total_tests - self.passed_tests

    @computed_field  # type: ignore[misc]
    @property
    def all_passed(self) -> bool:
        """True when no test missed its threshold (also true for an empty suite)."""
        return not self.failed_tests

    @computed_field  # type: ignore[misc]
    @property
    def duration_seconds(self) -> float:
        """Elapsed time of the suite run: ``end_time - start_time``."""
        return self.end_time - self.start_time
agenteval/py.typed ADDED
File without changes
agenteval/registry.py ADDED
@@ -0,0 +1,99 @@
1
+ """Global test registry and @agenteval.test decorator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Callable, Optional
7
+
8
+
9
@dataclass
class RegisteredTest:
    """One entry in the test registry: the test callable and its run settings."""

    # The decorated test function itself.
    fn: Callable[..., Any]
    # Display name of the test.
    name: str
    # How many times the test should be run.
    n: int
    # Pass rate required for the test to count as successful.
    threshold: float
    # Labels used for tag-based filtering.
    tags: list[str]
    # Name of the module the test function was defined in.
    module: str
19
+
20
+
21
class TestRegistry:
    """Process-wide collection of the functions declared with @agenteval.test."""

    # Shared instance; stays None until first requested or until reset() runs.
    _instance: Optional["TestRegistry"] = None

    def __init__(self) -> None:
        self._tests: list[RegisteredTest] = []

    @classmethod
    def global_registry(cls) -> "TestRegistry":
        """Return the shared registry, creating it on first use."""
        instance = cls._instance
        if instance is None:
            instance = cls._instance = cls()
        return instance

    @classmethod
    def reset(cls) -> None:
        """Swap the shared registry for a brand-new, empty one. Used in tests."""
        cls._instance = cls()

    def register(self, entry: RegisteredTest) -> None:
        """Add *entry* to the end of the registry."""
        self._tests.append(entry)

    def clear(self) -> None:
        """Remove every registered test, keeping the same underlying list."""
        del self._tests[:]

    def get_all(self, tags: Optional[list[str]] = None) -> list[RegisteredTest]:
        """Return registered tests; with *tags*, only those sharing at least one tag."""
        if not tags:
            return self._tests[:]

        matches: list[RegisteredTest] = []
        for entry in self._tests:
            for tag in tags:
                if tag in entry.tags:
                    matches.append(entry)
                    break
        return matches

    def snapshot(self) -> list[RegisteredTest]:
        """Return a shallow copy of the registered tests."""
        return self._tests.copy()
55
+
56
+
57
def test(
    fn: Optional[Callable[..., Any]] = None,
    *,
    n: int = 20,
    threshold: float = 0.8,
    tags: Optional[list[str]] = None,
) -> Any:
    """Decorator that registers a test function with the global TestRegistry.

    Supports both bare and parameterized forms::

        @agenteval.test
        async def test_basic(tracer: Tracer) -> None: ...

        @agenteval.test(n=10, threshold=0.9, tags=["slow"])
        async def test_complex(tracer: Tracer) -> None: ...

    The decorated function is returned unchanged; registration is the only
    side effect.

    Args:
        fn: The test function (when used as bare decorator).
        n: Number of runs. Default: 20.
        threshold: Required pass rate. Default: 0.8.
        tags: Optional tags for filtering. The sequence is copied, so later
            changes to the caller's list do not affect the registered entry.

    Returns:
        The original function in the bare form, otherwise a decorator that
        registers the function it is applied to and returns it.
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        # Each entry gets its own tag list: the same decorator (or the same
        # caller-owned list) may be applied to several functions, and sharing
        # one list object would let a change to one entry leak into the others.
        if not tags:
            own_tags: Any = []
        elif isinstance(tags, str):
            # Keep a mistakenly passed bare string as-is rather than exploding
            # it into single characters.
            own_tags = tags
        else:
            own_tags = list(tags)

        TestRegistry.global_registry().register(
            RegisteredTest(
                fn=func,
                name=func.__name__,
                n=n,
                threshold=threshold,
                tags=own_tags,
                module=func.__module__,
            )
        )
        return func

    if fn is not None:
        # Called as @agenteval.test (no parentheses)
        return decorator(fn)

    # Called as @agenteval.test(...) (with parentheses)
    return decorator
agenteval/reporter.py ADDED
@@ -0,0 +1,139 @@
1
+ """Rich terminal reporter and JSON exporter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import pathlib
7
+ from typing import Protocol, runtime_checkable
8
+
9
+ from rich import box
10
+ from rich.console import Console
11
+ from rich.table import Table
12
+ from rich.text import Text
13
+
14
+ from agenteval.models import SuiteResult, TestResult
15
+
16
+
17
@runtime_checkable
class Reporter(Protocol):
    """Structural interface for anything that can present and export results.

    Being runtime-checkable, ``isinstance(obj, Reporter)`` only verifies that
    the three methods exist, not their signatures.
    """

    def render_result(self, result: TestResult) -> None:
        """Present the outcome of a single test."""
        ...

    def render_suite(self, suite: SuiteResult) -> None:
        """Present the summary of a whole suite."""
        ...

    def export_json(self, suite: SuiteResult, path: pathlib.Path) -> None:
        """Write the suite results as JSON to *path*."""
        ...
22
+
23
+
24
+ def _pass_rate_display(result: TestResult) -> tuple[str, str]:
25
+ """Return (label, style) for a test result's pass rate."""
26
+ rate = result.pass_rate
27
+ threshold = result.threshold
28
+ fraction = f"{result.n_passed}/{result.n_runs}"
29
+
30
+ if rate >= threshold:
31
+ return f"✅ {fraction} ({rate:.0%})", "green"
32
+ elif rate >= threshold * 0.5:
33
+ return f"⚠️ {fraction} ({rate:.0%})", "yellow"
34
+ else:
35
+ return f"❌ {fraction} ({rate:.0%})", "red"
36
+
37
+
38
class RichReporter:
    """Rich terminal reporter with color-coded pass rates and summary tables.

    Per-test line format (as printed by ``render_result``)::

        test_basic_search ✅ 18/20 (90%) avg 1.20s 3.1 steps
        test_complex_reasoning ⚠️ 8/20 (40%) avg 4.70s 7.2 steps
        test_hallucination ❌ 3/20 (15%) avg 2.10s 5.0 steps

    Test names, failure messages, agent output and file paths are escaped
    before being placed inside Rich markup, so bracketed text in them is shown
    literally instead of being parsed as style tags.
    """

    def __init__(
        self,
        console: Console | None = None,
        *,
        show_failures: bool = True,
        show_traces: bool = False,
    ) -> None:
        """Configure the reporter.

        Args:
            console: Console to print to; a default ``Console()`` is created
                when omitted.
            show_failures: Print failure reasons under each test line.
            show_traces: Print one detail line per trace.
        """
        self.console = console or Console()
        self.show_failures = show_failures
        self.show_traces = show_traces

    @staticmethod
    def _escape(text: str) -> str:
        """Return *text* with Rich markup escaped so it prints literally."""
        # Imported here so the module's top-level import block is untouched.
        from rich.markup import escape

        return escape(text)

    def render_result(self, result: TestResult) -> None:
        """Print a single test result line. Called after each test completes."""
        label, style = _pass_rate_display(result)
        self.console.print(
            f" [bold]{self._escape(result.test_name)}[/bold]"
            f" [{style}]{label}[/{style}]"
            f" [dim]avg {result.avg_duration:.2f}s {result.avg_steps:.1f} steps[/dim]"
        )

        if self.show_failures and result.failed_traces:
            # Only the first three failed traces are shown to keep output short.
            for i, trace in enumerate(result.failed_traces[:3], 1):
                reasons: list[str] = []
                if trace.error:
                    reasons.append(f"error: {trace.error}")
                if trace.assertion_errors:
                    reasons.extend(trace.assertion_errors[:2])
                if reasons:
                    # Truncate first, then escape, so an escape sequence is
                    # never cut in half by the 120-character limit.
                    reason_text = " | ".join(self._escape(r[:120]) for r in reasons[:2])
                    self.console.print(f" [dim red]↳ failure {i}: {reason_text}[/dim red]")

        if self.show_traces:
            self._print_traces(result)

    def render_suite(self, suite: SuiteResult) -> None:
        """Print a summary table for the full suite."""
        if not suite.results:
            self.console.print("[dim]No tests found.[/dim]")
            return

        total_runs = sum(r.n_runs for r in suite.results)
        self.console.print()

        table = Table(
            box=box.ROUNDED,
            show_header=True,
            header_style="bold",
            title=f"agenteval results — {suite.total_tests} test(s) · {total_runs} total runs · {suite.duration_seconds:.1f}s",
        )
        table.add_column("Test", style="bold", no_wrap=True)
        table.add_column("Runs", justify="center")
        table.add_column("Pass Rate", justify="center")
        table.add_column("Avg Duration", justify="right")
        table.add_column("Avg Steps", justify="right")
        table.add_column("Threshold", justify="center")

        for result in suite.results:
            label, style = _pass_rate_display(result)
            table.add_row(
                # String cells are parsed as markup, so the name is escaped.
                self._escape(result.test_name),
                f"{result.n_passed}/{result.n_runs}",
                Text(label, style=style),
                f"{result.avg_duration:.2f}s",
                f"{result.avg_steps:.1f}",
                f"{result.threshold:.0%}",
            )

        self.console.print(table)

        # Footer summary
        if suite.all_passed:
            self.console.print(
                f"\n [bold green]✅ All {suite.total_tests} test(s) passed their threshold.[/bold green]"
            )
        else:
            self.console.print(
                f"\n [bold red]❌ {suite.failed_tests}/{suite.total_tests} test(s) failed their threshold.[/bold red]"
            )

    def export_json(self, suite: SuiteResult, path: pathlib.Path) -> None:
        """Write suite results to a JSON file and print a confirmation line."""
        data = suite.model_dump(mode="json")
        path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        self.console.print(f"\n [dim]JSON report saved to {self._escape(str(path))}[/dim]")

    def _print_traces(self, result: TestResult) -> None:
        """Print one dim detail line per trace (status, duration, steps, output preview)."""
        for i, trace in enumerate(result.traces, 1):
            status = "✅" if trace.passed else "❌"
            # Agent output is arbitrary text: show its first 60 characters as
            # a repr, escaped so it cannot break the surrounding markup.
            output_preview = self._escape(repr(str(trace.output)[:60]))
            self.console.print(
                f" [dim]{status} run {i} {trace.duration_seconds:.2f}s "
                f"{trace.effective_steps} steps output={output_preview}[/dim]"
            )
agenteval/runner.py ADDED
@@ -0,0 +1,119 @@
1
+ """Test runner: executes a test function N times with bounded async concurrency."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import inspect
7
+ import time
8
+ from typing import Any, Callable, Optional
9
+
10
+ import anyio
11
+
12
+ from agenteval.models import AgentTrace, TestResult
13
+ from agenteval.tracer import _ACTIVE_TRACER, Tracer
14
+
15
+
16
async def _run_single(
    test_fn: Callable[[Tracer], Any],
    run_index: int,
) -> AgentTrace:
    """Execute one run of test_fn with a fresh Tracer and return its trace.

    Exceptions raised by the test are recorded on the tracer rather than
    propagated: an ``AssertionError`` becomes the tracer's assertion errors,
    any other ``Exception`` becomes its run error (unless one is already set).

    Args:
        test_fn: Async test callable invoked with the fresh tracer.
        run_index: Position of this run within the batch (currently unused in
            the body; kept for the caller's interface).

    Returns:
        The trace built by the tracer once the run has finished.
    """
    tracer = Tracer()
    # Publish the tracer for the duration of the run; restored in ``finally``.
    token = _ACTIVE_TRACER.set(tracer)
    try:
        await test_fn(tracer)
    except AssertionError as e:
        # Assertion failures from tracer.assert_that().check().
        # Record one entry per non-empty message line, with bullet markers and
        # padding stripped. Previously this list was computed and immediately
        # overwritten by the unsplit message.
        message = str(e)
        stripped = (line.strip(" •") for line in message.splitlines())
        entries = [line for line in stripped if line]
        # Fall back to the raw message (possibly empty) so that a failed
        # assertion always leaves at least one recorded entry.
        tracer._assertion_errors = entries or [message]
    except Exception as e:
        if tracer._run_error is None:
            tracer._run_error = f"{type(e).__name__}: {e}"
    finally:
        _ACTIVE_TRACER.reset(token)
        # Close the run context if the test forgot to (e.g. exception before __aexit__)
        if tracer._start_time is not None and tracer._end_time is None:
            tracer._end_time = time.perf_counter()

    return tracer.build_trace()
39
+
40
+
41
async def _run_async(
    test_fn: Callable[[Tracer], Any],
    n: int,
    concurrency: int,
) -> list[AgentTrace]:
    """Run test_fn N times with bounded concurrency using anyio.

    Args:
        test_fn: Async test callable, invoked once per run.
        n: Number of runs; a value of zero or less yields an empty list.
        concurrency: Maximum number of runs in flight at once; must be >= 1.

    Returns:
        The traces ordered by run index, regardless of which run finished first.

    Raises:
        ValueError: If ``concurrency`` is below 1. A zero-permit semaphore
            would otherwise leave every run waiting forever.
    """
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # One slot per run so results keep their run order instead of their
    # completion order; each task writes only its own slot, so no lock is needed.
    slots: list[Optional[AgentTrace]] = [None] * max(n, 0)
    semaphore = anyio.Semaphore(concurrency)

    async def bounded_run(i: int) -> None:
        async with semaphore:
            slots[i] = await _run_single(test_fn, i)

    # The task group waits for every started run before the block exits.
    async with anyio.create_task_group() as tg:
        for i in range(n):
            tg.start_soon(bounded_run, i)

    return [trace for trace in slots if trace is not None]
62
+
63
+
64
def run(
    test_fn: Callable[[Tracer], Any],
    *,
    n: int = 20,
    concurrency: int = 4,
    name: Optional[str] = None,
    threshold: float = 0.8,
    tags: Optional[list[str]] = None,
) -> TestResult:
    """Run a test function N times and return aggregated results.

    Accepts both sync and async test functions. Sync functions are run in a
    thread pool via anyio.to_thread.run_sync so they don't block the event loop.
    If a non-coroutine callable returns an awaitable (for example an object
    with an async ``__call__``, or a lambda wrapping an async function), that
    awaitable is awaited so the test body actually executes.

    Args:
        test_fn: The test function. Signature: (tracer: Tracer) -> None.
        n: Number of times to run the test. Default: 20.
        concurrency: Maximum number of concurrent runs. Default: 4.
        name: Override the test name (defaults to test_fn.__name__).
        threshold: Pass rate required to consider the test successful. Default: 0.8.
        tags: Optional list of tags for filtering in run_suite().

    Returns:
        TestResult with pass rate, all traces, and statistics.

    Example::

        result = agenteval.run(test_my_agent, n=20, threshold=0.9)
        reporter.render_result(result)
    """
    actual_name = name or getattr(test_fn, "__name__", "unnamed_test")

    # Normalize: wrap sync test functions so the runner is purely async internally
    if inspect.iscoroutinefunction(test_fn):
        wrapped: Callable[[Tracer], Any] = test_fn
    else:
        original = test_fn

        async def async_wrapper(tracer: Tracer) -> None:
            outcome = await anyio.to_thread.run_sync(functools.partial(original, tracer))
            # A callable that is not itself a coroutine function may still
            # hand back a coroutine; without awaiting it the test body would
            # never run and the run would be counted as passed.
            if inspect.isawaitable(outcome):
                await outcome

        async_wrapper.__name__ = actual_name
        wrapped = async_wrapper

    traces = anyio.run(_run_async, wrapped, n, concurrency)

    n_passed = sum(1 for t in traces if t.passed)
    return TestResult(
        test_name=actual_name,
        n_runs=n,
        n_passed=n_passed,
        # Guard the division: with no runs the pass rate is reported as 0.0.
        pass_rate=n_passed / n if n > 0 else 0.0,
        threshold=threshold,
        traces=traces,
        tags=tags or [],
    )