agenteval-py 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenteval/__init__.py +46 -0
- agenteval/adapters/__init__.py +9 -0
- agenteval/adapters/anthropic_adapter.py +80 -0
- agenteval/adapters/langchain_adapter.py +135 -0
- agenteval/adapters/openai_adapter.py +80 -0
- agenteval/assertions.py +289 -0
- agenteval/cli.py +93 -0
- agenteval/exceptions.py +17 -0
- agenteval/models.py +123 -0
- agenteval/py.typed +0 -0
- agenteval/registry.py +99 -0
- agenteval/reporter.py +139 -0
- agenteval/runner.py +119 -0
- agenteval/suite.py +181 -0
- agenteval/tracer.py +303 -0
- agenteval_py-0.1.0.dist-info/METADATA +561 -0
- agenteval_py-0.1.0.dist-info/RECORD +20 -0
- agenteval_py-0.1.0.dist-info/WHEEL +4 -0
- agenteval_py-0.1.0.dist-info/entry_points.txt +2 -0
- agenteval_py-0.1.0.dist-info/licenses/LICENSE +21 -0
agenteval/cli.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""CLI entry point for agenteval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pathlib
|
|
6
|
+
from typing import Annotated, Optional
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from agenteval.registry import TestRegistry
|
|
12
|
+
from agenteval.reporter import RichReporter
|
|
13
|
+
from agenteval.suite import run_suite
|
|
14
|
+
|
|
15
|
+
# Typer application object; the commands in this module register on it
# through the @app.command(...) decorator.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    name="agenteval",
    help="Evaluation toolkit for LLM agents.",
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.command(name="run")
def run_cmd(
    paths: Annotated[
        list[str],
        typer.Argument(help="Test files or directories to discover (default: current dir)"),
    ] = [".", ],  # noqa: B006
    pattern: str = typer.Option("test_*.py", "--pattern", "-p", help="File glob pattern"),
    tags: Optional[list[str]] = typer.Option(None, "--tag", "-t", help="Only run tests with this tag (repeatable)"),
    n: Optional[int] = typer.Option(None, "--n", min=1, help="Override number of runs per test"),
    threshold: Optional[float] = typer.Option(
        None, "--threshold", min=0.0, max=1.0, help="Override pass rate threshold (0.0–1.0)"
    ),
    concurrency: int = typer.Option(4, "--concurrency", "-c", min=1, help="Max concurrent runs"),
    output: Optional[pathlib.Path] = typer.Option(None, "--output", "-o", help="Write JSON report to this file"),
    no_color: bool = typer.Option(False, "--no-color", help="Disable color output"),
    show_traces: bool = typer.Option(False, "--traces", help="Show per-trace details"),
    show_failures: bool = typer.Option(True, "--failures/--no-failures", help="Show failure reasons"),
) -> None:
    """Discover and run agenteval tests."""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept short.
    #
    # Exit codes raised via typer.Exit:
    #   0 - suite.all_passed is true
    #   1 - at least one test missed its threshold
    #   2 - run_suite raised, or the JSON report could not be written
    #
    # Numeric options are range-checked by the CLI layer (min/max) so that
    # values such as --concurrency 0 or --threshold 2 are rejected up front
    # instead of being forwarded to run_suite.
    console = Console(no_color=no_color)
    reporter = RichReporter(console=console, show_traces=show_traces, show_failures=show_failures)

    # Reset registry so re-running the CLI in the same process doesn't double-count
    TestRegistry.reset()

    try:
        suite = run_suite(
            paths=paths,
            pattern=pattern,
            tags=tags or None,  # an empty tag list is forwarded as None
            fail_under=threshold,
            n_override=n,
            concurrency=concurrency,
            reporter=reporter,
        )
    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {e}")
        raise typer.Exit(code=2) from e

    if output is not None:
        try:
            reporter.export_json(suite, output)
        except OSError as e:
            # Report an unwritable path the same way as other failures
            # rather than letting a traceback escape the command.
            console.print(f"[bold red]Error:[/bold red] could not write report to {output}: {e}")
            raise typer.Exit(code=2) from e

    raise typer.Exit(code=0 if suite.all_passed else 1)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@app.command(name="report")
def report_cmd(
    json_file: Annotated[pathlib.Path, typer.Argument(help="JSON report file from a previous run")],
    show_traces: bool = typer.Option(False, "--traces", help="Show per-trace details"),
    no_color: bool = typer.Option(False, "--no-color"),
) -> None:
    """Pretty-print a saved JSON report."""
    # Imports are kept local to this command, as in the original layout.
    import json as _json

    from agenteval.models import SuiteResult

    out = Console(no_color=no_color)

    # A missing file gets its own message; every other load problem is
    # handled by the generic branch below. Both paths exit with code 2.
    if not json_file.exists():
        out.print(f"[bold red]File not found:[/bold red] {json_file}")
        raise typer.Exit(code=2)

    try:
        raw_text = json_file.read_text(encoding="utf-8")
        loaded = SuiteResult.model_validate(_json.loads(raw_text))
    except Exception as exc:
        out.print(f"[bold red]Failed to load report:[/bold red] {exc}")
        raise typer.Exit(code=2) from exc

    # Replay the stored results: one line per test, then the summary table.
    printer = RichReporter(console=out, show_traces=show_traces)
    for entry in loaded.results:
        printer.render_result(entry)
    printer.render_suite(loaded)
|
agenteval/exceptions.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Custom exception hierarchy for agenteval."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class AgentEvalError(Exception):
    """Root of the agenteval exception hierarchy.

    The other exception classes in this module derive from it.
    """
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AssertionFailure(AgentEvalError):
    """Signals that at least one assertion made against a trace did not hold."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DiscoveryError(AgentEvalError):
    """Signals a failure while discovering or importing test files."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TracerError(AgentEvalError):
    """Signals incorrect use of a tracer, e.g. reading the trace before the run has finished."""
|
agenteval/models.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Pydantic data models for agenteval traces and results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, computed_field, field_validator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ToolCall(BaseModel):
    """A single tool/function call recorded during an agent run."""

    # frozen=True: the model is configured as immutable.
    model_config = ConfigDict(frozen=True)

    # Name of the tool/function that was called.
    name: str
    # Arguments of the call, keyed by string.
    arguments: dict[str, Any]
    # Value produced by the call; None when not recorded.
    result: Any = None
    # NOTE(review): clock and reference point of this value are not visible
    # here — confirm against the tracer that records it.
    timestamp: float
    # Duration of the call, in seconds per the field name.
    duration_seconds: float
    # Error text for the call; None when absent.
    error: Optional[str] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AgentTrace(BaseModel):
    """Full trace of one agent run, including all tool calls and outcome."""

    # Identifier of the run this trace belongs to.
    run_id: str
    # Input given to the agent and the output it produced (any type).
    input: Any
    output: Any = None
    # Tool calls recorded for the run.
    tool_calls: list[ToolCall] = []
    # Explicit step count; when None, effective_steps falls back to the
    # number of tool calls.
    total_steps: Optional[int] = None
    duration_seconds: float = 0.0
    # NOTE(review): key names of this mapping are not defined in this module.
    token_usage: Optional[dict[str, int]] = None
    # Run-level error text and assertion failure messages for the run.
    error: Optional[str] = None
    assertion_errors: list[str] = []
    # Pass/fail flag for the run; defaults to True.
    passed: bool = True
    metadata: dict[str, Any] = {}

    @computed_field  # type: ignore[misc]
    @property
    def effective_steps(self) -> int:
        """Number of steps: explicit total_steps if set, otherwise len(tool_calls)."""
        if self.total_steps is None:
            return len(self.tool_calls)
        return self.total_steps
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TestResult(BaseModel):
    """Aggregated results from running a test function N times."""

    test_name: str
    # Number of runs executed and how many of them passed.
    n_runs: int
    n_passed: int
    # Fraction of passing runs; validated below to lie in [0, 1].
    pass_rate: float
    # Pass rate the test has to reach (compared in met_threshold).
    threshold: float
    # One trace per run.
    traces: list[AgentTrace]
    tags: list[str] = []

    @field_validator("pass_rate")
    @classmethod
    def _validate_pass_rate(cls, v: float) -> float:
        # Accept only values inside the closed interval [0, 1].
        if 0.0 <= v <= 1.0:
            return v
        raise ValueError(f"pass_rate must be between 0 and 1, got {v}")

    # Traces whose `passed` flag is truthy.
    @computed_field  # type: ignore[misc]
    @property
    def passed_traces(self) -> list[AgentTrace]:
        return [trace for trace in self.traces if trace.passed]

    # Traces whose `passed` flag is falsy.
    @computed_field  # type: ignore[misc]
    @property
    def failed_traces(self) -> list[AgentTrace]:
        return [trace for trace in self.traces if not trace.passed]

    # Mean of duration_seconds over all traces; 0.0 when there are none.
    @computed_field  # type: ignore[misc]
    @property
    def avg_duration(self) -> float:
        runs = self.traces
        if runs:
            return sum(trace.duration_seconds for trace in runs) / len(runs)
        return 0.0

    # Mean of effective_steps over all traces; 0.0 when there are none.
    @computed_field  # type: ignore[misc]
    @property
    def avg_steps(self) -> float:
        runs = self.traces
        if runs:
            return sum(trace.effective_steps for trace in runs) / len(runs)
        return 0.0

    # True when the pass rate reaches the threshold (inclusive).
    @computed_field  # type: ignore[misc]
    @property
    def met_threshold(self) -> bool:
        return self.threshold <= self.pass_rate
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class SuiteResult(BaseModel):
    """Aggregated results for an entire test suite."""

    # Per-test results.
    results: list[TestResult]
    # Start and end markers; duration_seconds is their difference.
    start_time: float
    end_time: float

    # Number of test results in the suite.
    @computed_field  # type: ignore[misc]
    @property
    def total_tests(self) -> int:
        return len(self.results)

    # Number of results whose met_threshold is truthy.
    @computed_field  # type: ignore[misc]
    @property
    def passed_tests(self) -> int:
        return sum(1 for outcome in self.results if outcome.met_threshold)

    # Everything that is not counted as passed.
    @computed_field  # type: ignore[misc]
    @property
    def failed_tests(self) -> int:
        total = self.total_tests
        return total - self.passed_tests

    # True when no test failed (also true for an empty suite).
    @computed_field  # type: ignore[misc]
    @property
    def all_passed(self) -> bool:
        return 0 == self.failed_tests

    # end_time minus start_time.
    @computed_field  # type: ignore[misc]
    @property
    def duration_seconds(self) -> float:
        return self.end_time - self.start_time
|
agenteval/py.typed
ADDED
|
File without changes
|
agenteval/registry.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Global test registry and @agenteval.test decorator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Callable, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class RegisteredTest:
    """Record describing one function registered through @agenteval.test."""

    # The registered callable itself.
    fn: Callable[..., Any]
    # Name of the test.
    name: str
    # Number of runs configured for the test.
    n: int
    # Pass rate configured for the test.
    threshold: float
    # Tags attached to the test; the registry filters on them.
    tags: list[str]
    # Module name associated with the test.
    module: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestRegistry:
    """Collects RegisteredTest entries; one shared instance is kept on the class."""

    # Shared instance, created on first use by global_registry().
    _instance: Optional["TestRegistry"] = None

    def __init__(self) -> None:
        self._tests: list[RegisteredTest] = []

    @classmethod
    def global_registry(cls) -> "TestRegistry":
        """Return the shared registry, creating it on first access."""
        if cls._instance is not None:
            return cls._instance
        cls._instance = cls()
        return cls._instance

    @classmethod
    def reset(cls) -> None:
        """Discard the shared registry and install a new, empty one."""
        cls._instance = cls()

    def register(self, entry: RegisteredTest) -> None:
        """Append *entry* to the registry."""
        self._tests.append(entry)

    def clear(self) -> None:
        """Remove every registered entry from this registry."""
        del self._tests[:]

    def get_all(self, tags: Optional[list[str]] = None) -> list[RegisteredTest]:
        """Return registered tests, keeping only those sharing a tag with *tags*.

        With no tags (None or empty) a copy of the full list is returned.
        """
        if not tags:
            return list(self._tests)

        selected = []
        for entry in self._tests:
            # An entry qualifies as soon as one requested tag matches.
            for tag in tags:
                if tag in entry.tags:
                    selected.append(entry)
                    break
        return selected

    def snapshot(self) -> list[RegisteredTest]:
        """Return a new list holding the currently registered tests."""
        return self._tests.copy()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test(
    fn: Optional[Callable[..., Any]] = None,
    *,
    n: int = 20,
    threshold: float = 0.8,
    tags: Optional[list[str]] = None,
) -> Any:
    """Register a function with the global TestRegistry and return it unchanged.

    Usable with or without arguments::

        @agenteval.test
        async def test_basic(tracer: Tracer) -> None: ...

        @agenteval.test(n=10, threshold=0.9, tags=["slow"])
        async def test_complex(tracer: Tracer) -> None: ...

    Args:
        fn: The function being decorated when the bare form is used.
        n: Number of runs stored on the registry entry. Default: 20.
        threshold: Pass rate stored on the registry entry. Default: 0.8.
        tags: Tags stored on the registry entry; an empty list when omitted.

    Returns:
        The decorated function (bare form) or a decorator (parameterized form).
    """
    def _register(target: Callable[..., Any]) -> Callable[..., Any]:
        shared = TestRegistry.global_registry()
        entry = RegisteredTest(
            fn=target,
            name=target.__name__,
            n=n,
            threshold=threshold,
            tags=tags or [],
            module=target.__module__,
        )
        shared.register(entry)
        return target

    if fn is None:
        # Parameterized form: @agenteval.test(...)
        return _register

    # Bare form: @agenteval.test
    return _register(fn)
|
agenteval/reporter.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Rich terminal reporter and JSON exporter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import pathlib
|
|
7
|
+
from typing import Protocol, runtime_checkable
|
|
8
|
+
|
|
9
|
+
from rich import box
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
from rich.text import Text
|
|
13
|
+
|
|
14
|
+
from agenteval.models import SuiteResult, TestResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@runtime_checkable
class Reporter(Protocol):
    """Structural interface for result reporters."""

    def render_result(self, result: TestResult) -> None:
        """Present the result of a single test."""
        ...

    def render_suite(self, suite: SuiteResult) -> None:
        """Present the results of a whole suite."""
        ...

    def export_json(self, suite: SuiteResult, path: pathlib.Path) -> None:
        """Export the suite results to *path*."""
        ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _pass_rate_display(result: TestResult) -> tuple[str, str]:
|
|
25
|
+
"""Return (label, style) for a test result's pass rate."""
|
|
26
|
+
rate = result.pass_rate
|
|
27
|
+
threshold = result.threshold
|
|
28
|
+
fraction = f"{result.n_passed}/{result.n_runs}"
|
|
29
|
+
|
|
30
|
+
if rate >= threshold:
|
|
31
|
+
return f"✅ {fraction} ({rate:.0%})", "green"
|
|
32
|
+
elif rate >= threshold * 0.5:
|
|
33
|
+
return f"⚠️ {fraction} ({rate:.0%})", "yellow"
|
|
34
|
+
else:
|
|
35
|
+
return f"❌ {fraction} ({rate:.0%})", "red"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class RichReporter:
    """Rich terminal reporter with color-coded pass rates and summary tables.

    Output format::

        test_basic_search 18/20 ✅ 90% avg 1.2s 3.1 steps
        test_complex_reasoning 8/20 ⚠️ 40% avg 4.7s 7.2 steps
        test_hallucination 3/20 ❌ 15% avg 2.1s 5.0 steps

    Free-form text (test names, error messages, agent output, file paths) is
    escaped before being placed inside console markup, so square brackets in
    that text are printed literally instead of being parsed as markup tags.
    """

    def __init__(
        self,
        console: Console | None = None,
        *,
        show_failures: bool = True,
        show_traces: bool = False,
    ) -> None:
        """Store the console and display flags.

        Args:
            console: Console to print to; a default Console is created when omitted.
            show_failures: Print failure reasons under each failing test.
            show_traces: Print one detail line per trace.
        """
        self.console = console or Console()
        self.show_failures = show_failures
        self.show_traces = show_traces

    @staticmethod
    def _escape(text: object) -> str:
        """Return str(text) with Rich markup escaped."""
        # Local import keeps this class self-contained with respect to the
        # module's import block.
        from rich.markup import escape

        return escape(str(text))

    def render_result(self, result: TestResult) -> None:
        """Print a single test result line. Called after each test completes."""
        label, style = _pass_rate_display(result)
        self.console.print(
            f" [bold]{self._escape(result.test_name)}[/bold]"
            f" [{style}]{label}[/{style}]"
            f" [dim]avg {result.avg_duration:.2f}s {result.avg_steps:.1f} steps[/dim]"
        )

        if self.show_failures and result.failed_traces:
            # At most three failing traces are listed, two reasons each.
            for i, trace in enumerate(result.failed_traces[:3], 1):
                reasons: list[str] = []
                if trace.error:
                    reasons.append(f"error: {trace.error}")
                if trace.assertion_errors:
                    reasons.extend(trace.assertion_errors[:2])
                if reasons:
                    # Truncate first, then escape, so an escape sequence is never cut in half.
                    reason_text = " | ".join(self._escape(r[:120]) for r in reasons[:2])
                    self.console.print(f" [dim red]↳ failure {i}: {reason_text}[/dim red]")

        if self.show_traces:
            self._print_traces(result)

    def render_suite(self, suite: SuiteResult) -> None:
        """Print a summary table for the full suite."""
        if not suite.results:
            self.console.print("[dim]No tests found.[/dim]")
            return

        total_runs = sum(r.n_runs for r in suite.results)
        self.console.print()

        table = Table(
            box=box.ROUNDED,
            show_header=True,
            header_style="bold",
            title=f"agenteval results — {suite.total_tests} test(s) · {total_runs} total runs · {suite.duration_seconds:.1f}s",
        )
        table.add_column("Test", style="bold", no_wrap=True)
        table.add_column("Runs", justify="center")
        table.add_column("Pass Rate", justify="center")
        table.add_column("Avg Duration", justify="right")
        table.add_column("Avg Steps", justify="right")
        table.add_column("Threshold", justify="center")

        for result in suite.results:
            label, style = _pass_rate_display(result)
            table.add_row(
                self._escape(result.test_name),
                f"{result.n_passed}/{result.n_runs}",
                Text(label, style=style),
                f"{result.avg_duration:.2f}s",
                f"{result.avg_steps:.1f}",
                f"{result.threshold:.0%}",
            )

        self.console.print(table)

        # Footer summary
        if suite.all_passed:
            self.console.print(
                f"\n [bold green]✅ All {suite.total_tests} test(s) passed their threshold.[/bold green]"
            )
        else:
            self.console.print(
                f"\n [bold red]❌ {suite.failed_tests}/{suite.total_tests} test(s) failed their threshold.[/bold red]"
            )

    def export_json(self, suite: SuiteResult, path: pathlib.Path) -> None:
        """Write suite results to a JSON file."""
        data = suite.model_dump(mode="json")
        path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        self.console.print(f"\n [dim]JSON report saved to {self._escape(path)}[/dim]")

    def _print_traces(self, result: TestResult) -> None:
        """Print one line per trace: status, duration, steps and truncated output."""
        for i, trace in enumerate(result.traces, 1):
            status = "✅" if trace.passed else "❌"
            # Output is cut to 60 characters, repr()-quoted, then escaped.
            shown_output = self._escape(repr(str(trace.output)[:60]))
            self.console.print(
                f" [dim]{status} run {i} {trace.duration_seconds:.2f}s "
                f"{trace.effective_steps} steps output={shown_output}[/dim]"
            )
|
agenteval/runner.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Test runner: executes a test function N times with bounded async concurrency."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
import inspect
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any, Callable, Optional
|
|
9
|
+
|
|
10
|
+
import anyio
|
|
11
|
+
|
|
12
|
+
from agenteval.models import AgentTrace, TestResult
|
|
13
|
+
from agenteval.tracer import _ACTIVE_TRACER, Tracer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def _run_single(
    test_fn: Callable[[Tracer], Any],
    run_index: int,
) -> AgentTrace:
    """Execute one run of test_fn with a fresh Tracer and return its trace.

    Args:
        test_fn: Async callable that receives the Tracer for this run.
        run_index: Index of the run; accepted for the caller's benefit, not
            used inside this function.

    Returns:
        The AgentTrace produced by ``tracer.build_trace()``.

    Exceptions derived from ``Exception`` raised by test_fn are recorded on
    the tracer rather than propagated.
    """
    tracer = Tracer()
    # Make this tracer the active one for the duration of the run.
    token = _ACTIVE_TRACER.set(tracer)
    try:
        await test_fn(tracer)
    except AssertionError as e:
        # Assertion failures from tracer.assert_that().check().
        # The full message is stored as a single entry. An earlier per-line
        # split of the message was overwritten immediately by this assignment
        # and has been dropped as dead code.
        tracer._assertion_errors = [str(e)]
    except Exception as e:
        # Keep an error the tracer already recorded; otherwise record this one.
        if tracer._run_error is None:
            tracer._run_error = f"{type(e).__name__}: {e}"
    finally:
        _ACTIVE_TRACER.reset(token)
        # Close the run context if the test forgot to (e.g. exception before __aexit__)
        if tracer._start_time is not None and tracer._end_time is None:
            tracer._end_time = time.perf_counter()

    return tracer.build_trace()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def _run_async(
    test_fn: Callable[[Tracer], Any],
    n: int,
    concurrency: int,
) -> list[AgentTrace]:
    """Run test_fn N times with bounded concurrency using anyio.

    Args:
        test_fn: Async callable executed once per run.
        n: Number of runs to start.
        concurrency: Maximum number of runs in flight at once; must be >= 1.

    Returns:
        The traces of all runs, ordered by run index.

    Raises:
        ValueError: If concurrency is smaller than 1. A semaphore with no
            permits would leave every run waiting forever.
    """
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # One slot per run, so the result order does not depend on which run
    # happens to finish first.
    slots: list[Optional[AgentTrace]] = [None] * max(n, 0)
    semaphore = anyio.Semaphore(concurrency)

    async def bounded_run(i: int) -> None:
        async with semaphore:
            slots[i] = await _run_single(test_fn, i)

    async with anyio.create_task_group() as tg:
        for i in range(n):
            tg.start_soon(bounded_run, i)

    # Every slot is filled once the task group has exited normally.
    return [trace for trace in slots if trace is not None]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def run(
    test_fn: Callable[[Tracer], Any],
    *,
    n: int = 20,
    concurrency: int = 4,
    name: Optional[str] = None,
    threshold: float = 0.8,
    tags: Optional[list[str]] = None,
) -> TestResult:
    """Run a test function N times and return aggregated results.

    Accepts both sync and async test functions. Sync functions are run in a
    thread pool via anyio.to_thread.run_sync so they don't block the event loop.
    If such a call hands back an awaitable (for example a callable object whose
    ``__call__`` is async), that awaitable is awaited so the test body
    actually executes instead of being silently discarded.

    Args:
        test_fn: The test function. Signature: (tracer: Tracer) -> None.
        n: Number of times to run the test. Default: 20.
        concurrency: Maximum number of concurrent runs. Default: 4.
        name: Override the test name (defaults to test_fn.__name__).
        threshold: Pass rate required to consider the test successful. Default: 0.8.
        tags: Optional list of tags for filtering in run_suite().

    Returns:
        TestResult with pass rate, all traces, and statistics.

    Example::

        result = agenteval.run(test_my_agent, n=20, threshold=0.9)
        reporter.render_result(result)
    """
    actual_name = name or getattr(test_fn, "__name__", "unnamed_test")

    # Normalize: wrap sync test functions so the runner is purely async internally
    if inspect.iscoroutinefunction(test_fn):
        wrapped: Callable[[Tracer], Any] = test_fn
    else:
        original = test_fn

        async def async_wrapper(tracer: Tracer) -> None:
            outcome = await anyio.to_thread.run_sync(functools.partial(original, tracer))
            # A callable not detected as a coroutine function may still
            # return a coroutine; without this await it would never run
            # and the test would pass vacuously.
            if inspect.isawaitable(outcome):
                await outcome

        async_wrapper.__name__ = actual_name
        wrapped = async_wrapper

    traces = anyio.run(_run_async, wrapped, n, concurrency)

    n_passed = sum(1 for t in traces if t.passed)
    return TestResult(
        test_name=actual_name,
        n_runs=n,
        n_passed=n_passed,
        # Guard the division: no runs means a pass rate of 0.0.
        pass_rate=n_passed / n if n > 0 else 0.0,
        threshold=threshold,
        traces=traces,
        tags=tags or [],
    )
|