agenttest-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agenttest/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ """agenttest — The pytest of AI agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .record import record
6
+ from .assertions import (
7
+ assert_compare,
8
+ assert_conciseness,
9
+ assert_contains_action,
10
+ assert_faithfulness,
11
+ assert_no_hallucination,
12
+ assert_no_toxicity,
13
+ assert_relevance,
14
+ assert_score,
15
+ assert_tone,
16
+ )
17
+ from .judge import Judge, judge
18
+ from .reporter import load_last_report, report_results
19
+ from .runner import discover_tests, run_tests
20
+
21
# Public names exported by `from agenttest import *`.
# "eval" and "EvalResult" are defined below in this module; everything else
# is re-exported from the submodule imports above. Note "eval" deliberately
# shadows the builtin for users of the package namespace.
__all__ = [
    "eval",
    "judge",
    "record",
    "EvalResult",
    "Judge",
    "assert_tone",
    "assert_contains_action",
    "assert_no_hallucination",
    "assert_relevance",
    "assert_no_toxicity",
    "assert_faithfulness",
    "assert_conciseness",
    "assert_score",
    "assert_compare",
    "discover_tests",
    "run_tests",
    "load_last_report",
    "report_results",
]
41
+
42
+
43
def eval(fn: object) -> object:
    """
    Decorator to mark a function as an agent eval test.
    Discovered by agenttest run.

    Runtime no-op: returns *fn* unchanged. Shadowing the builtin `eval`
    is intentional for this package's public API.
    """
    # Passthrough - runner discovers by parsing source for @eval
    return fn
50
+
51
+
52
class EvalResult:
    """Result of a single eval run.

    Attributes:
        test_name: Name of the eval function that ran.
        status: Outcome label assigned by the runner.
        duration: Wall-clock runtime in seconds.
        error_message: Failure detail, or None on success.
        scores: Per-metric scores recorded during the run (never None).
    """

    def __init__(
        self,
        test_name: str,
        status: str,
        duration: float = 0,
        error_message: str | None = None,
        scores: dict[str, float] | None = None,
    ):
        self.test_name = test_name
        self.status = status
        self.duration = duration
        self.error_message = error_message
        # Normalize None to an empty dict so callers can iterate safely.
        self.scores = scores or {}

    def __repr__(self) -> str:
        """Unambiguous debug representation (added for debuggability)."""
        return (
            f"{type(self).__name__}(test_name={self.test_name!r}, "
            f"status={self.status!r}, duration={self.duration!r}, "
            f"error_message={self.error_message!r}, scores={self.scores!r})"
        )
agenttest/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow running as python -m agenttest."""
2
+
3
+ from .cli import main
4
+
5
# Support `python -m agenttest` by delegating to the CLI entry point.
if __name__ == "__main__":
    main()
@@ -0,0 +1,154 @@
1
+ """Assertion helpers for agent evaluations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from .judge import Judge, judge
8
+
9
+ __all__ = [
10
+ "assert_tone",
11
+ "assert_contains_action",
12
+ "assert_no_hallucination",
13
+ "assert_relevance",
14
+ "assert_no_toxicity",
15
+ "assert_faithfulness",
16
+ "assert_conciseness",
17
+ "assert_score",
18
+ "assert_compare",
19
+ ]
20
+
21
+
22
+ def _format_explanation(data: dict[str, Any]) -> str:
23
+ """Extract explanation from judge JSON response."""
24
+ for key in ("explanation", "reason"):
25
+ if key in data and data[key]:
26
+ return str(data[key])
27
+ return ""
28
+
29
+
30
def assert_tone(
    response: str,
    expected: str,
    judge_instance: Judge | None = None,
) -> None:
    """
    Assert the response has the expected tone (empathetic, professional, aggressive, neutral).

    Raises:
        AssertionError: If the judged tone differs from *expected*.
    """
    evaluator = judge_instance or judge
    observed = evaluator.tone(response)
    if observed == expected:
        return
    raise AssertionError(f"Expected tone '{expected}', got '{observed}'")
44
+
45
+
46
def assert_contains_action(
    response: str,
    action: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response contains or implies the given action.

    Raises:
        AssertionError: If the judge does not detect *action* in *response*.
    """
    evaluator = judge_instance or judge
    present = evaluator.contains_action(response, action)
    if not present:
        raise AssertionError(
            f"Expected response to contain action '{action}', but it did not"
        )
57
+
58
+
59
def assert_no_hallucination(
    response: str,
    context: str | None = None,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response contains no hallucinations.

    Raises:
        AssertionError: If the judge flags unsupported claims.
    """
    evaluator = judge_instance or judge
    if evaluator.no_hallucination(response, context):
        return
    raise AssertionError(
        "Response appears to contain hallucinations (claims not supported by context)"
    )
70
+
71
+
72
def assert_relevance(
    response: str,
    query: str,
    min_score: float = 0.7,
    judge_instance: Judge | None = None,
) -> None:
    """Assert relevance score of response to query is at least min_score.

    Raises:
        AssertionError: If the judged relevance is below *min_score*.
    """
    evaluator = judge_instance or judge
    relevance = evaluator.relevance(response, query)
    if relevance >= min_score:
        return
    raise AssertionError(f"Relevance score {relevance:.2f} below threshold {min_score}")
85
+
86
+
87
def assert_no_toxicity(
    response: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response is not toxic.

    Raises:
        AssertionError: If the judge flags the response as toxic.
    """
    evaluator = judge_instance or judge
    if not evaluator.toxicity(response):
        return
    raise AssertionError("Response contains toxic content")
95
+
96
+
97
def assert_faithfulness(
    response: str,
    source: str,
    min_score: float = 0.8,
    judge_instance: Judge | None = None,
) -> None:
    """Assert faithfulness score of response to source is at least min_score.

    Raises:
        AssertionError: If the judged faithfulness is below *min_score*.
    """
    evaluator = judge_instance or judge
    faithfulness = evaluator.faithfulness(response, source)
    if faithfulness >= min_score:
        return
    raise AssertionError(
        f"Faithfulness score {faithfulness:.2f} below threshold {min_score}"
    )
110
+
111
+
112
def assert_conciseness(
    response: str,
    expected: str = "good",
    judge_instance: Judge | None = None,
) -> None:
    """Assert conciseness is expected (too_short, good, too_long).

    Raises:
        AssertionError: If the judged verdict differs from *expected*.
    """
    evaluator = judge_instance or judge
    verdict = evaluator.conciseness(response)
    if verdict == expected:
        return
    raise AssertionError(f"Expected conciseness '{expected}', got '{verdict}'")
124
+
125
+
126
def assert_score(
    response: str,
    criteria: str,
    min_score: float = 0.7,
    judge_instance: Judge | None = None,
) -> None:
    """Assert custom score meets minimum.

    Raises:
        AssertionError: If the judged score for *criteria* is below *min_score*.
    """
    evaluator = judge_instance or judge
    result = evaluator.score(response, criteria)
    if result >= min_score:
        return
    raise AssertionError(
        f"Score {result:.2f} below threshold {min_score} for criteria: {criteria}"
    )
139
+
140
+
141
def assert_compare(
    response_a: str,
    response_b: str,
    criteria: str,
    expected_winner: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert expected_winner ('a', 'b', or 'tie') wins the comparison.

    Raises:
        AssertionError: If the judged winner differs from *expected_winner*.
    """
    evaluator = judge_instance or judge
    verdict = evaluator.compare(response_a, response_b, criteria)
    if verdict == expected_winner:
        return
    raise AssertionError(f"Expected winner '{expected_winner}', got '{verdict}'")
agenttest/cli.py ADDED
@@ -0,0 +1,249 @@
1
+ """CLI for agenttest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import click
11
+ from rich.console import Console
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn
13
+ from rich.panel import Panel
14
+
15
+ from .config import load_config
16
+ from .reporter import RESULTS_FILE, load_last_report, report_compare, report_diff, report_results
17
+ from .runner import discover_tests, run_tests
18
+
19
+ console = Console()
20
+
21
+
22
+ def _results_path(tag: str | None = None) -> Path:
23
+ """Path for results file, optionally tagged."""
24
+ if tag:
25
+ return Path(f".agenttest_results_{tag}.json")
26
+ return RESULTS_FILE
27
+
28
+
29
# Root click command group; subcommands attach via @app.command().
@click.group()
@click.version_option(version="0.1.0")  # NOTE(review): keep in sync with the package version
def app() -> None:
    """The pytest of AI agents. Eval-driven testing for LLM applications."""
    pass
34
+
35
+
36
@app.command()
@click.option("--path", "-p", default=".", type=click.Path(exists=True, file_okay=False, dir_okay=True))
@click.option("--filter", "-k", "filter_pattern", default=None)
@click.option("--workers", "-w", default=4, type=int)
@click.option("--tag", "-t", default=None)
def run(path: str, filter_pattern: Optional[str], workers: int, tag: Optional[str]) -> None:
    """Discover and run all agent eval tests."""
    # Local import keeps config concerns out of module import time.
    from .config import get_api_key

    path_obj = Path(path)
    config = load_config(path_obj)
    # Fail fast with a readable message when no API key is configured.
    try:
        get_api_key(config)
    except ValueError as e:
        console.print(str(e), style="red")
        sys.exit(1)

    timeout = config.get("timeout_seconds", 30)
    fail_threshold = config.get("fail_threshold", 0.8)
    # NOTE(review): the config "workers" value is never read here — the CLI
    # option default (4) always wins unless -w is passed. Confirm intended.

    tests = discover_tests(path_obj, filter_pattern)
    if not tests:
        console.print(Panel(
            "No tests found. Create files matching agent_test_*.py or *_agent_test.py with @eval decorated functions.",
            title="[yellow]No Tests[/yellow]",
            border_style="yellow",
        ))
        sys.exit(1)

    console.print(f"[cyan]Running {len(tests)} test(s)...[/cyan]\n")

    # Spinner-only progress: run_tests executes the whole batch in a single
    # call, so the task is only marked complete after it returns.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        task = progress.add_task("Executing evals...", total=len(tests))
        results = run_tests(path_obj, filter_pattern, workers=workers, timeout_seconds=timeout)
        progress.update(task, completed=len(tests))

    report = report_results(results, fail_threshold=fail_threshold)

    # Persist tagged runs so `agenttest compare` / `agenttest diff` can load
    # them later; the untagged default file is handled by the reporter.
    if tag:
        out = _results_path(tag)
        out.write_text(json.dumps(report, indent=2))
        console.print(f"\n[dim]Results saved to {out}[/dim]")

    # CI gating: non-zero exit when the pass ratio falls below the threshold.
    total = report.get("total", 0)
    passed = report.get("passed", 0)
    if total > 0 and (passed / total) < fail_threshold:
        sys.exit(1)
87
+
88
+
89
@app.command()
@click.option("--path", "-p", default=".", type=click.Path())
def init(path: str) -> None:
    """Scaffold agent_test_example.py, agenttest.toml, and GitHub Actions workflow."""
    path_obj = Path(path).resolve()
    path_obj.mkdir(parents=True, exist_ok=True)

    # Example test module written verbatim into the target directory.
    example_content = '''"""Example agent eval tests. Replace my_agent with your agent."""

from agenttest import eval, judge, record


def my_agent(query: str) -> str:
    """Mock agent - replace with your actual agent call."""
    if "refund" in query.lower():
        return (
            "I'm sorry to hear you're unhappy. Our refund policy allows "
            "returns within 30 days. Would you like me to start the refund process?"
        )
    return "How can I help you today?"


@eval
def test_customer_support_refund():
    query = "I want a refund"
    response = my_agent(query)
    record(query, response)
    assert judge.tone(response) == "empathetic"
    assert judge.contains_action(response, "refund_policy")
    assert judge.no_hallucination(response)


@eval
def test_customer_support_generic():
    query = "What are your hours?"
    response = my_agent(query)
    record(query, response)
    assert judge.relevance(response, "business hours") >= 0.3  # May be generic
    assert not judge.toxicity(response)


@eval
def test_conciseness():
    query = "Hi"
    response = my_agent(query)
    record(query, response)
    assert judge.conciseness(response) in ("good", "too_short", "too_long")


@eval
def test_custom_score():
    query = "I want a refund"
    response = my_agent(query)
    score = judge.score(
        response,
        criteria="Does the response show empathy and offer a clear next step?",
    )
    record(query, response, score)
    assert score >= 0.5
'''

    # Default configuration scaffold; values mirror DEFAULT_CONFIG in config.py.
    toml_content = '''[agenttest]
model = "claude-3-5-haiku-latest"
timeout_seconds = 30
workers = 4
fail_threshold = 0.8
cache = true

[agenttest.env]
ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
'''

    # CI workflow that runs the evals on pushes and pull requests to main.
    workflow_content = '''name: Agent Evals
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install agenttest-py
      - run: agenttest run
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
'''

    # NOTE(review): existing files are overwritten without prompting.
    (path_obj / "agent_test_example.py").write_text(example_content)
    (path_obj / "agenttest.toml").write_text(toml_content)
    workflows_dir = path_obj / ".github" / "workflows"
    workflows_dir.mkdir(parents=True, exist_ok=True)
    (workflows_dir / "agenttest.yml").write_text(workflow_content)

    console.print("[green]✓[/green] Created agent_test_example.py")
    console.print("[green]✓[/green] Created agenttest.toml")
    console.print("[green]✓[/green] Created .github/workflows/agenttest.yml")
    console.print("\n[dim]Run: agenttest run[/dim]")
    console.print("[dim]Set ANTHROPIC_API_KEY in your environment or agenttest.toml[/dim]")
193
+
194
+
195
@app.command()
def report() -> None:
    """Show last run results from .agenttest_results.json."""
    payload = load_last_report()
    if payload:
        # Re-render the stored results without re-running anything.
        report_results(payload.get("results", []), live=False)
        return
    console.print("[yellow]No previous run found. Run 'agenttest run' first.[/yellow]")
    sys.exit(1)
204
+
205
+
206
def _load_tagged_run(tag: str) -> dict:
    """Load the saved results for *tag*, exiting with an error if absent."""
    p = _results_path(tag)
    if not p.is_file():
        console.print(f"[red]No results found for tag '{tag}'. Run: agenttest run --tag {tag}[/red]")
        sys.exit(1)
    return json.loads(p.read_text())


@app.command()
@click.argument("v1")
@click.argument("v2")
def compare(v1: str, v2: str) -> None:
    """Compare pass/fail metrics between two tagged runs."""
    # v1 is validated first, matching the original error-message ordering.
    r1 = _load_tagged_run(v1)
    r2 = _load_tagged_run(v2)
    report_compare(r1, r2, tag_a=v1, tag_b=v2)


@app.command()
@click.argument("v1")
@click.argument("v2")
def diff(v1: str, v2: str) -> None:
    """Show side-by-side diff of agent responses between two runs. The git diff for agent behavior."""
    r1 = _load_tagged_run(v1)
    r2 = _load_tagged_run(v2)
    report_diff(r1, r2, tag_a=v1, tag_b=v2)
242
+
243
+
244
def main() -> None:
    """Console-script entry point; delegates to the click command group."""
    app()


if __name__ == "__main__":
    main()
agenttest/config.py ADDED
@@ -0,0 +1,123 @@
1
+ """Configuration loader for agenttest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ try:
10
+ import toml
11
+ except ImportError:
12
+ toml = None # type: ignore[assignment]
13
+
14
# Baseline settings used when no config file overrides them.
DEFAULT_CONFIG = {
    "model": "claude-3-5-haiku-latest",
    "timeout_seconds": 30,  # passed to run_tests as the per-run timeout
    "workers": 4,  # NOTE(review): the CLI currently uses its own -w default instead
    "fail_threshold": 0.8,  # min pass ratio before `agenttest run` exits non-zero
    "cache": True,
}

# Candidate config files, checked in order while walking up from the start dir.
CONFIG_FILENAMES = ("agenttest.toml", "pyproject.toml")
23
+
24
+
25
def _find_config_path(start: Path) -> Path | None:
    """Find agenttest.toml or [tool.agenttest] in pyproject.toml.

    Walks upward from *start*, at most 10 directory levels. agenttest.toml
    always counts; pyproject.toml only counts when it declares a
    [tool.agenttest] table.

    Returns:
        The config file path, or None if nothing suitable was found.
    """
    current = start.resolve()
    for _ in range(10):  # Max 10 levels up
        for name in CONFIG_FILENAMES:
            path = current / name
            if not path.is_file():
                continue
            if name == "agenttest.toml":
                return path
            # Fix: previously, a missing toml package raised AttributeError
            # inside the try block and was silently swallowed. Make the
            # "no parser available" case an explicit skip instead.
            if name == "pyproject.toml" and toml is not None:
                try:
                    data = toml.load(path)
                    if "tool" in data and "agenttest" in data["tool"]:
                        return path
                except Exception:
                    # Unparseable pyproject.toml is treated as "no config here".
                    pass
        parent = current.parent
        if parent == current:
            # Reached the filesystem root.
            break
        current = parent
    return None
46
+
47
+
48
def _load_toml(path: Path) -> dict[str, Any]:
    """Load TOML file.

    Raises:
        ImportError: If the optional toml package is not installed.
    """
    if toml is not None:
        return toml.load(path)
    raise ImportError(
        "toml package is required for config loading. Install with: pip install toml"
    )
55
+
56
+
57
+ def _resolve_env(value: str) -> str:
58
+ """Resolve $VAR or ${VAR} in string. Missing vars become empty string."""
59
+ if not isinstance(value, str):
60
+ return value
61
+ if value.startswith("$") and len(value) > 1:
62
+ var = value[1:]
63
+ if var.startswith("{"):
64
+ var = var[1:-1]
65
+ return os.environ.get(var, "")
66
+ return value
67
+
68
+
69
def load_config(start_dir: Path | str | None = None) -> dict[str, Any]:
    """
    Load agenttest configuration from agenttest.toml or pyproject.toml.

    Searches upward from start_dir (default: cwd) for config files.
    Environment variables in config values (e.g. $ANTHROPIC_API_KEY) are resolved.

    Returns:
        Merged config dict with defaults.
    """
    base = Path(start_dir or os.getcwd())
    merged: dict[str, Any] = {**DEFAULT_CONFIG}

    config_path = _find_config_path(base)
    if config_path is None:
        return merged

    raw = _load_toml(config_path)
    # The agenttest table lives under [tool.agenttest] in pyproject.toml,
    # or at the top level of a dedicated agenttest.toml.
    if config_path.name == "pyproject.toml":
        section = raw.get("tool", {}).get("agenttest", {})
    else:
        section = raw.get("agenttest", {})

    # Merge scalar settings; the "env" table gets special handling below.
    for key, value in section.items():
        if key != "env" and value is not None:
            merged[key] = value

    # Resolve $VAR references in [agenttest.env]; non-strings pass through.
    for key, val in section.get("env", {}).items():
        merged.setdefault("env", {})[key] = (
            _resolve_env(val) if isinstance(val, str) else val
        )

    return merged
106
+
107
+
108
+ def get_api_key(config: dict[str, Any] | None = None) -> str:
109
+ """
110
+ Get Anthropic API key from config env or ANTHROPIC_API_KEY.
111
+
112
+ Raises:
113
+ ValueError: If no API key is found.
114
+ """
115
+ if config is None:
116
+ config = load_config()
117
+ env = config.get("env", {})
118
+ key = env.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_API_KEY")
119
+ if not key:
120
+ raise ValueError(
121
+ "ANTHROPIC_API_KEY not set. Set it in your environment or in agenttest.toml under agenttest.env."
122
+ )
123
+ return str(key)