agenttest-py 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenttest_py-0.1.0/.github/workflows/agenttest.yml +19 -0
- agenttest_py-0.1.0/.github/workflows/publish.yml +29 -0
- agenttest_py-0.1.0/.gitignore +44 -0
- agenttest_py-0.1.0/CONTRIBUTING.md +38 -0
- agenttest_py-0.1.0/LICENSE +21 -0
- agenttest_py-0.1.0/PKG-INFO +174 -0
- agenttest_py-0.1.0/README.md +155 -0
- agenttest_py-0.1.0/agenttest/__init__.py +67 -0
- agenttest_py-0.1.0/agenttest/__main__.py +6 -0
- agenttest_py-0.1.0/agenttest/assertions.py +154 -0
- agenttest_py-0.1.0/agenttest/cli.py +249 -0
- agenttest_py-0.1.0/agenttest/config.py +123 -0
- agenttest_py-0.1.0/agenttest/judge.py +237 -0
- agenttest_py-0.1.0/agenttest/record.py +35 -0
- agenttest_py-0.1.0/agenttest/reporter.py +222 -0
- agenttest_py-0.1.0/agenttest/runner.py +228 -0
- agenttest_py-0.1.0/agenttest.toml +9 -0
- agenttest_py-0.1.0/pyproject.toml +31 -0
- agenttest_py-0.1.0/tests/__init__.py +1 -0
- agenttest_py-0.1.0/tests/agent_test_example.py +85 -0
- agenttest_py-0.1.0/tests/test_runner.py +46 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: Agent Evals
|
|
2
|
+
on:
|
|
3
|
+
pull_request:
|
|
4
|
+
branches: [main]
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
eval:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.11"
|
|
16
|
+
- run: pip install -e .
|
|
17
|
+
- run: agenttest run
|
|
18
|
+
env:
|
|
19
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: release
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
|
|
22
|
+
- name: Install hatch
|
|
23
|
+
run: pip install hatch
|
|
24
|
+
|
|
25
|
+
- name: Build
|
|
26
|
+
run: hatch build
|
|
27
|
+
|
|
28
|
+
- name: Publish to PyPI
|
|
29
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Agenttest
|
|
2
|
+
.agenttest_cache/
|
|
3
|
+
.agenttest_results.json
|
|
4
|
+
|
|
5
|
+
# Python
|
|
6
|
+
__pycache__/
|
|
7
|
+
*.py[cod]
|
|
8
|
+
*$py.class
|
|
9
|
+
*.so
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# Virtual environments
|
|
28
|
+
.venv/
|
|
29
|
+
venv/
|
|
30
|
+
ENV/
|
|
31
|
+
env/
|
|
32
|
+
|
|
33
|
+
# IDE
|
|
34
|
+
.idea/
|
|
35
|
+
.vscode/
|
|
36
|
+
*.swp
|
|
37
|
+
*.swo
|
|
38
|
+
*~
|
|
39
|
+
|
|
40
|
+
# Testing
|
|
41
|
+
.coverage
|
|
42
|
+
htmlcov/
|
|
43
|
+
.pytest_cache/
|
|
44
|
+
.tox/
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Contributing to agenttest
|
|
2
|
+
|
|
3
|
+
Thank you for considering contributing to agenttest! This document provides guidelines for contributing.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/your-org/agenttest.git
|
|
9
|
+
cd agenttest
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Running Tests
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Unit tests (pytest)
|
|
17
|
+
pytest tests/
|
|
18
|
+
|
|
19
|
+
# Agent evals (requires ANTHROPIC_API_KEY)
|
|
20
|
+
agenttest run
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Code Style
|
|
24
|
+
|
|
25
|
+
- Use type hints on all public functions
|
|
26
|
+
- Add docstrings to public APIs
|
|
27
|
+
- Follow existing patterns in the codebase
|
|
28
|
+
|
|
29
|
+
## Pull Request Process
|
|
30
|
+
|
|
31
|
+
1. Fork the repo and create a branch
|
|
32
|
+
2. Make your changes with tests
|
|
33
|
+
3. Ensure `pytest tests/` and `agenttest run` pass
|
|
34
|
+
4. Submit a PR with a clear description
|
|
35
|
+
|
|
36
|
+
## Questions?
|
|
37
|
+
|
|
38
|
+
Open an issue for discussion.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ShashStudios
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agenttest-py
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The pytest of AI agents. Eval-driven testing for LLM applications.
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: agents,ai,anthropic,evals,llm,pytest,testing
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Software Development :: Testing
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Requires-Dist: anthropic>=0.20.0
|
|
13
|
+
Requires-Dist: click>=8.0.0
|
|
14
|
+
Requires-Dist: pydantic>=2.0.0
|
|
15
|
+
Requires-Dist: pytest>=7.0.0
|
|
16
|
+
Requires-Dist: rich>=13.0.0
|
|
17
|
+
Requires-Dist: toml>=0.10.0
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# agenttest
|
|
21
|
+
|
|
22
|
+
**The pytest of AI agents.** Catch regressions before they reach prod.
|
|
23
|
+
|
|
24
|
+
[](https://pypi.org/project/agenttest-py/)
|
|
25
|
+
[](https://opensource.org/licenses/MIT)
|
|
26
|
+
[](https://www.python.org/downloads/)
|
|
27
|
+
[](https://github.com/ShashStudios/agenttest/actions)
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## You ship an agent. You change a prompt. Did it get better or worse?
|
|
32
|
+
|
|
33
|
+
**You have no idea.** No test suite. No CI. No diff. Just deploy and hope.
|
|
34
|
+
|
|
35
|
+
Every team shipping AI agents hits the same wall: your "eval" is manually pasting examples into a playground. One prompt tweak could break everything—or fix everything—and you won't know until a user complains.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 30-Second Quickstart
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install agenttest-py
|
|
43
|
+
export ANTHROPIC_API_KEY=your_key
|
|
44
|
+
agenttest init
|
|
45
|
+
agenttest run
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Or from scratch:**
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
# agent_test_example.py
|
|
52
|
+
from agenttest import eval, judge
|
|
53
|
+
|
|
54
|
+
def my_agent(query: str) -> str:
|
|
55
|
+
return "Your agent's response" # Replace with real agent
|
|
56
|
+
|
|
57
|
+
@eval
|
|
58
|
+
def test_customer_support():
|
|
59
|
+
response = my_agent("I want a refund")
|
|
60
|
+
assert judge.tone(response) == "empathetic"
|
|
61
|
+
assert judge.no_hallucination(response)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
agenttest run
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
- **Code-first** — Tests are just Python. No YAML. No config hell.
|
|
73
|
+
- **LLM-as-judge** — 9 built-in scorers: tone, hallucination, relevance, toxicity, faithfulness, conciseness, custom criteria, A/B compare.
|
|
74
|
+
- **Local & CI** — Runs anywhere. Add 4 lines to GitHub Actions. No account. No dashboard.
|
|
75
|
+
- **`agenttest diff`** — Side-by-side view of how your agent's responses changed between two runs. The git diff for agent behavior.
|
|
76
|
+
- **Caching** — Judge results cached in `.agenttest_cache/` to avoid redundant API calls.
|
|
77
|
+
- **Parallel** — `--workers 4` for faster runs.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## agenttest diff — The Git Diff for Agent Behavior
|
|
82
|
+
|
|
83
|
+
See exactly how your agent's responses changed between two runs:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
agenttest run --tag v1 # Before your prompt change
|
|
87
|
+
agenttest run --tag v2 # After your prompt change
|
|
88
|
+
agenttest diff v1 v2
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
test_customer_support_refund:
|
|
93
|
+
BEFORE: "I cannot help with refunds" pass
|
|
94
|
+
AFTER: "I'd be happy to process that for you" pass
|
|
95
|
+
DELTA: ✓ improved
|
|
96
|
+
|
|
97
|
+
test_helpful_tone:
|
|
98
|
+
BEFORE: "Our policy states no returns" fail
|
|
99
|
+
AFTER: "I'm sorry to hear that. Let me help" pass
|
|
100
|
+
DELTA: +1 ↑
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Judge Functions
|
|
106
|
+
|
|
107
|
+
| Function | Returns | Description |
|
|
108
|
+
|----------|---------|-------------|
|
|
109
|
+
| `judge.tone(response)` | `str` | empathetic, professional, aggressive, neutral |
|
|
110
|
+
| `judge.no_hallucination(response, context?)` | `bool` | True if no hallucination |
|
|
111
|
+
| `judge.contains_action(response, action)` | `bool` | Response mentions/implies the action |
|
|
112
|
+
| `judge.relevance(response, query)` | `float` | 0.0–1.0 relevance |
|
|
113
|
+
| `judge.toxicity(response)` | `bool` | True if toxic |
|
|
114
|
+
| `judge.faithfulness(response, source)` | `float` | 0.0–1.0 faithfulness |
|
|
115
|
+
| `judge.conciseness(response)` | `str` | too_short, good, too_long |
|
|
116
|
+
| `judge.score(response, criteria)` | `float` | Custom 0.0–1.0 score |
|
|
117
|
+
| `judge.compare(a, b, criteria)` | `str` | "a", "b", or "tie" |
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## CI in 4 Lines
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
# .github/workflows/agenttest.yml
|
|
125
|
+
- run: pip install agenttest-py
|
|
126
|
+
- run: agenttest run
|
|
127
|
+
env:
|
|
128
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Every PR shows whether your agent got better or worse.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## agenttest vs Braintrust / DeepEval / Promptfoo
|
|
136
|
+
|
|
137
|
+
| | agenttest | Braintrust | DeepEval | Promptfoo |
|
|
138
|
+
|---|:---:|:---:|:---:|:---:|
|
|
139
|
+
| **No account required** | ✅ | ❌ | ❌ | ❌ |
|
|
140
|
+
| **No vendor lock-in** | ✅ | ❌ | ❌ | ❌ |
|
|
141
|
+
| **Lives in your codebase** | ✅ | ❌ | ❌ | ❌ |
|
|
142
|
+
| **Behavior diff (before/after)** | ✅ | ❌ | ❌ | ❌ |
|
|
143
|
+
| **Runs locally** | ✅ | ✅ | ✅ | ✅ |
|
|
144
|
+
| **MIT License** | ✅ | ❌ | ❌ | ✅ |
|
|
145
|
+
| **Code-first API** | ✅ | ⚠️ | ⚠️ | ⚠️ |
|
|
146
|
+
|
|
147
|
+
**agenttest** = pytest for agents. No dashboards. No SaaS. Your tests, your repo, your CI.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Config
|
|
152
|
+
|
|
153
|
+
```toml
|
|
154
|
+
# agenttest.toml
|
|
155
|
+
[agenttest]
|
|
156
|
+
model = "claude-3-5-haiku-latest"
|
|
157
|
+
timeout_seconds = 30
|
|
158
|
+
workers = 4
|
|
159
|
+
fail_threshold = 0.8
|
|
160
|
+
cache = true
|
|
161
|
+
|
|
162
|
+
[agenttest.env]
|
|
163
|
+
ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Contributing
|
|
169
|
+
|
|
170
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
MIT
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# agenttest
|
|
2
|
+
|
|
3
|
+
**The pytest of AI agents.** Catch regressions before they reach prod.
|
|
4
|
+
|
|
5
|
+
[](https://pypi.org/project/agenttest-py/)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
[](https://www.python.org/downloads/)
|
|
8
|
+
[](https://github.com/ShashStudios/agenttest/actions)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## You ship an agent. You change a prompt. Did it get better or worse?
|
|
13
|
+
|
|
14
|
+
**You have no idea.** No test suite. No CI. No diff. Just deploy and hope.
|
|
15
|
+
|
|
16
|
+
Every team shipping AI agents hits the same wall: your "eval" is manually pasting examples into a playground. One prompt tweak could break everything—or fix everything—and you won't know until a user complains.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## 30-Second Quickstart
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install agenttest-py
|
|
24
|
+
export ANTHROPIC_API_KEY=your_key
|
|
25
|
+
agenttest init
|
|
26
|
+
agenttest run
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Or from scratch:**
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
# agent_test_example.py
|
|
33
|
+
from agenttest import eval, judge
|
|
34
|
+
|
|
35
|
+
def my_agent(query: str) -> str:
|
|
36
|
+
return "Your agent's response" # Replace with real agent
|
|
37
|
+
|
|
38
|
+
@eval
|
|
39
|
+
def test_customer_support():
|
|
40
|
+
response = my_agent("I want a refund")
|
|
41
|
+
assert judge.tone(response) == "empathetic"
|
|
42
|
+
assert judge.no_hallucination(response)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
agenttest run
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Code-first** — Tests are just Python. No YAML. No config hell.
|
|
54
|
+
- **LLM-as-judge** — 9 built-in scorers: tone, hallucination, relevance, toxicity, faithfulness, conciseness, custom criteria, A/B compare.
|
|
55
|
+
- **Local & CI** — Runs anywhere. Add 4 lines to GitHub Actions. No account. No dashboard.
|
|
56
|
+
- **`agenttest diff`** — Side-by-side view of how your agent's responses changed between two runs. The git diff for agent behavior.
|
|
57
|
+
- **Caching** — Judge results cached in `.agenttest_cache/` to avoid redundant API calls.
|
|
58
|
+
- **Parallel** — `--workers 4` for faster runs.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## agenttest diff — The Git Diff for Agent Behavior
|
|
63
|
+
|
|
64
|
+
See exactly how your agent's responses changed between two runs:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
agenttest run --tag v1 # Before your prompt change
|
|
68
|
+
agenttest run --tag v2 # After your prompt change
|
|
69
|
+
agenttest diff v1 v2
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
test_customer_support_refund:
|
|
74
|
+
BEFORE: "I cannot help with refunds" pass
|
|
75
|
+
AFTER: "I'd be happy to process that for you" pass
|
|
76
|
+
DELTA: ✓ improved
|
|
77
|
+
|
|
78
|
+
test_helpful_tone:
|
|
79
|
+
BEFORE: "Our policy states no returns" fail
|
|
80
|
+
AFTER: "I'm sorry to hear that. Let me help" pass
|
|
81
|
+
DELTA: +1 ↑
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Judge Functions
|
|
87
|
+
|
|
88
|
+
| Function | Returns | Description |
|
|
89
|
+
|----------|---------|-------------|
|
|
90
|
+
| `judge.tone(response)` | `str` | empathetic, professional, aggressive, neutral |
|
|
91
|
+
| `judge.no_hallucination(response, context?)` | `bool` | True if no hallucination |
|
|
92
|
+
| `judge.contains_action(response, action)` | `bool` | Response mentions/implies the action |
|
|
93
|
+
| `judge.relevance(response, query)` | `float` | 0.0–1.0 relevance |
|
|
94
|
+
| `judge.toxicity(response)` | `bool` | True if toxic |
|
|
95
|
+
| `judge.faithfulness(response, source)` | `float` | 0.0–1.0 faithfulness |
|
|
96
|
+
| `judge.conciseness(response)` | `str` | too_short, good, too_long |
|
|
97
|
+
| `judge.score(response, criteria)` | `float` | Custom 0.0–1.0 score |
|
|
98
|
+
| `judge.compare(a, b, criteria)` | `str` | "a", "b", or "tie" |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## CI in 4 Lines
|
|
103
|
+
|
|
104
|
+
```yaml
|
|
105
|
+
# .github/workflows/agenttest.yml
|
|
106
|
+
- run: pip install agenttest-py
|
|
107
|
+
- run: agenttest run
|
|
108
|
+
env:
|
|
109
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Every PR shows whether your agent got better or worse.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## agenttest vs Braintrust / DeepEval / Promptfoo
|
|
117
|
+
|
|
118
|
+
| | agenttest | Braintrust | DeepEval | Promptfoo |
|
|
119
|
+
|---|:---:|:---:|:---:|:---:|
|
|
120
|
+
| **No account required** | ✅ | ❌ | ❌ | ❌ |
|
|
121
|
+
| **No vendor lock-in** | ✅ | ❌ | ❌ | ❌ |
|
|
122
|
+
| **Lives in your codebase** | ✅ | ❌ | ❌ | ❌ |
|
|
123
|
+
| **Behavior diff (before/after)** | ✅ | ❌ | ❌ | ❌ |
|
|
124
|
+
| **Runs locally** | ✅ | ✅ | ✅ | ✅ |
|
|
125
|
+
| **MIT License** | ✅ | ❌ | ❌ | ✅ |
|
|
126
|
+
| **Code-first API** | ✅ | ⚠️ | ⚠️ | ⚠️ |
|
|
127
|
+
|
|
128
|
+
**agenttest** = pytest for agents. No dashboards. No SaaS. Your tests, your repo, your CI.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Config
|
|
133
|
+
|
|
134
|
+
```toml
|
|
135
|
+
# agenttest.toml
|
|
136
|
+
[agenttest]
|
|
137
|
+
model = "claude-3-5-haiku-latest"
|
|
138
|
+
timeout_seconds = 30
|
|
139
|
+
workers = 4
|
|
140
|
+
fail_threshold = 0.8
|
|
141
|
+
cache = true
|
|
142
|
+
|
|
143
|
+
[agenttest.env]
|
|
144
|
+
ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Contributing
|
|
150
|
+
|
|
151
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""agenttest — The pytest of AI agents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .record import record
|
|
6
|
+
from .assertions import (
|
|
7
|
+
assert_compare,
|
|
8
|
+
assert_conciseness,
|
|
9
|
+
assert_contains_action,
|
|
10
|
+
assert_faithfulness,
|
|
11
|
+
assert_no_hallucination,
|
|
12
|
+
assert_no_toxicity,
|
|
13
|
+
assert_relevance,
|
|
14
|
+
assert_score,
|
|
15
|
+
assert_tone,
|
|
16
|
+
)
|
|
17
|
+
from .judge import Judge, judge
|
|
18
|
+
from .reporter import load_last_report, report_results
|
|
19
|
+
from .runner import discover_tests, run_tests
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"eval",
|
|
23
|
+
"judge",
|
|
24
|
+
"record",
|
|
25
|
+
"EvalResult",
|
|
26
|
+
"Judge",
|
|
27
|
+
"assert_tone",
|
|
28
|
+
"assert_contains_action",
|
|
29
|
+
"assert_no_hallucination",
|
|
30
|
+
"assert_relevance",
|
|
31
|
+
"assert_no_toxicity",
|
|
32
|
+
"assert_faithfulness",
|
|
33
|
+
"assert_conciseness",
|
|
34
|
+
"assert_score",
|
|
35
|
+
"assert_compare",
|
|
36
|
+
"discover_tests",
|
|
37
|
+
"run_tests",
|
|
38
|
+
"load_last_report",
|
|
39
|
+
"report_results",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def eval(fn: object) -> object:
    """Mark *fn* as an agent eval test.

    This decorator is a pure passthrough: the wrapped function is
    returned unchanged. ``agenttest run`` discovers tests by parsing
    source files for the ``@eval`` marker, so no wrapping or
    registration happens at import time.
    """
    return fn
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class EvalResult:
    """Result of a single eval run.

    Attributes:
        test_name: Name of the eval test function.
        status: Outcome string (presumably "pass"/"fail" — see reporter).
        duration: Wall-clock run time in seconds.
        error_message: Failure detail, or ``None`` on success.
        scores: Judge scores keyed by metric name; always a dict
            (``None`` is normalized to ``{}``).
    """

    def __init__(
        self,
        test_name: str,
        status: str,
        duration: float = 0,
        error_message: str | None = None,
        scores: dict[str, float] | None = None,
    ):
        self.test_name = test_name
        self.status = status
        self.duration = duration
        self.error_message = error_message
        # Normalize None to an empty dict so callers can always iterate.
        self.scores = scores or {}

    def __repr__(self) -> str:
        # Added for debuggability: results show up readably in logs/REPL.
        return (
            f"{type(self).__name__}(test_name={self.test_name!r}, "
            f"status={self.status!r}, duration={self.duration!r}, "
            f"error_message={self.error_message!r}, scores={self.scores!r})"
        )