agenttest-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ name: Agent Evals
2
+ on:
3
+ pull_request:
4
+ branches: [main]
5
+ push:
6
+ branches: [main]
7
+
8
+ jobs:
9
+ eval:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.11"
16
+ - run: pip install -e .
17
+ - run: agenttest run
18
+ env:
19
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ environment: release
12
+ permissions:
13
+ id-token: write
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.11"
21
+
22
+ - name: Install hatch
23
+ run: pip install hatch
24
+
25
+ - name: Build
26
+ run: hatch build
27
+
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,44 @@
1
+ # Agenttest
2
+ .agenttest_cache/
3
+ .agenttest_results.json
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # Virtual environments
28
+ .venv/
29
+ venv/
30
+ ENV/
31
+ env/
32
+
33
+ # IDE
34
+ .idea/
35
+ .vscode/
36
+ *.swp
37
+ *.swo
38
+ *~
39
+
40
+ # Testing
41
+ .coverage
42
+ htmlcov/
43
+ .pytest_cache/
44
+ .tox/
@@ -0,0 +1,38 @@
1
+ # Contributing to agenttest
2
+
3
+ Thank you for considering contributing to agenttest! This document provides guidelines for contributing.
4
+
5
+ ## Development Setup
6
+
7
+ ```bash
8
+ git clone https://github.com/your-org/agenttest.git
9
+ cd agenttest
10
+ pip install -e ".[dev]"
11
+ ```
12
+
13
+ ## Running Tests
14
+
15
+ ```bash
16
+ # Unit tests (pytest)
17
+ pytest tests/
18
+
19
+ # Agent evals (requires ANTHROPIC_API_KEY)
20
+ agenttest run
21
+ ```
22
+
23
+ ## Code Style
24
+
25
+ - Use type hints on all public functions
26
+ - Add docstrings to public APIs
27
+ - Follow existing patterns in the codebase
28
+
29
+ ## Pull Request Process
30
+
31
+ 1. Fork the repo and create a branch
32
+ 2. Make your changes with tests
33
+ 3. Ensure `pytest tests/` and `agenttest run` pass
34
+ 4. Submit a PR with a clear description
35
+
36
+ ## Questions?
37
+
38
+ Open an issue for discussion.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ShashStudios
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: agenttest-py
3
+ Version: 0.1.0
4
+ Summary: The pytest of AI agents. Eval-driven testing for LLM applications.
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: agents,ai,anthropic,evals,llm,pytest,testing
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Topic :: Software Development :: Testing
11
+ Requires-Python: >=3.9
12
+ Requires-Dist: anthropic>=0.20.0
13
+ Requires-Dist: click>=8.0.0
14
+ Requires-Dist: pydantic>=2.0.0
15
+ Requires-Dist: pytest>=7.0.0
16
+ Requires-Dist: rich>=13.0.0
17
+ Requires-Dist: toml>=0.10.0
18
+ Description-Content-Type: text/markdown
19
+
20
+ # agenttest
21
+
22
+ **The pytest of AI agents.** Catch regressions before they reach prod.
23
+
24
+ [![PyPI version](https://img.shields.io/pypi/v/agenttest-py.svg)](https://pypi.org/project/agenttest-py/)
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
26
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
27
+ [![CI](https://github.com/ShashStudios/agenttest/actions/workflows/agenttest.yml/badge.svg)](https://github.com/ShashStudios/agenttest/actions)
28
+
29
+ ---
30
+
31
+ ## You ship an agent. You change a prompt. Did it get better or worse?
32
+
33
+ **You have no idea.** No test suite. No CI. No diff. Just deploy and hope.
34
+
35
+ Every team shipping AI agents hits the same wall: your "eval" is manually pasting examples into a playground. One prompt tweak could break everything—or fix everything—and you won't know until a user complains.
36
+
37
+ ---
38
+
39
+ ## 30-Second Quickstart
40
+
41
+ ```bash
42
+ pip install agenttest-py
43
+ export ANTHROPIC_API_KEY=your_key
44
+ agenttest init
45
+ agenttest run
46
+ ```
47
+
48
+ **Or from scratch:**
49
+
50
+ ```python
51
+ # agent_test_example.py
52
+ from agenttest import eval, judge
53
+
54
+ def my_agent(query: str) -> str:
55
+ return "Your agent's response" # Replace with real agent
56
+
57
+ @eval
58
+ def test_customer_support():
59
+ response = my_agent("I want a refund")
60
+ assert judge.tone(response) == "empathetic"
61
+ assert judge.no_hallucination(response)
62
+ ```
63
+
64
+ ```bash
65
+ agenttest run
66
+ ```
67
+
68
+ ---
69
+
70
+ ## Features
71
+
72
+ - **Code-first** — Tests are just Python. No YAML. No config hell.
73
+ - **LLM-as-judge** — 9 built-in scorers: tone, hallucination, relevance, toxicity, faithfulness, conciseness, custom criteria, A/B compare.
74
+ - **Local & CI** — Runs anywhere. Add 4 lines to GitHub Actions. No account. No dashboard.
75
+ - **`agenttest diff`** — Side-by-side view of how your agent's responses changed between two runs. The git diff for agent behavior.
76
+ - **Caching** — Judge results cached in `.agenttest_cache/` to avoid redundant API calls.
77
+ - **Parallel** — `--workers 4` for faster runs.
78
+
79
+ ---
80
+
81
+ ## agenttest diff — The Git Diff for Agent Behavior
82
+
83
+ See exactly how your agent's responses changed between two runs:
84
+
85
+ ```bash
86
+ agenttest run --tag v1 # Before your prompt change
87
+ agenttest run --tag v2 # After your prompt change
88
+ agenttest diff v1 v2
89
+ ```
90
+
91
+ ```
92
+ test_customer_support_refund:
93
+ BEFORE: "I cannot help with refunds" pass
94
+ AFTER: "I'd be happy to process that for you" pass
95
+ DELTA: ✓ improved
96
+
97
+ test_helpful_tone:
98
+ BEFORE: "Our policy states no returns" fail
99
+ AFTER: "I'm sorry to hear that. Let me help" pass
100
+ DELTA: +1 ↑
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Judge Functions
106
+
107
+ | Function | Returns | Description |
108
+ |----------|---------|-------------|
109
+ | `judge.tone(response)` | `str` | empathetic, professional, aggressive, neutral |
110
+ | `judge.no_hallucination(response, context?)` | `bool` | True if no hallucination |
111
+ | `judge.contains_action(response, action)` | `bool` | Response mentions/implies the action |
112
+ | `judge.relevance(response, query)` | `float` | 0.0–1.0 relevance |
113
+ | `judge.toxicity(response)` | `bool` | True if toxic |
114
+ | `judge.faithfulness(response, source)` | `float` | 0.0–1.0 faithfulness |
115
+ | `judge.conciseness(response)` | `str` | too_short, good, too_long |
116
+ | `judge.score(response, criteria)` | `float` | Custom 0.0–1.0 score |
117
+ | `judge.compare(a, b, criteria)` | `str` | "a", "b", or "tie" |
118
+
119
+ ---
120
+
121
+ ## CI in 4 Lines
122
+
123
+ ```yaml
124
+ # .github/workflows/agenttest.yml
125
+ - run: pip install agenttest-py
126
+ - run: agenttest run
127
+ env:
128
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
129
+ ```
130
+
131
+ Every PR shows whether your agent got better or worse.
132
+
133
+ ---
134
+
135
+ ## agenttest vs Braintrust / DeepEval / Promptfoo
136
+
137
+ | | agenttest | Braintrust | DeepEval | Promptfoo |
138
+ |---|:---:|:---:|:---:|:---:|
139
+ | **No account required** | ✅ | ❌ | ❌ | ❌ |
140
+ | **No vendor lock-in** | ✅ | ❌ | ❌ | ❌ |
141
+ | **Lives in your codebase** | ✅ | ❌ | ❌ | ❌ |
142
+ | **Behavior diff (before/after)** | ✅ | ❌ | ❌ | ❌ |
143
+ | **Runs locally** | ✅ | ✅ | ✅ | ✅ |
144
+ | **MIT License** | ✅ | ❌ | ❌ | ✅ |
145
+ | **Code-first API** | ✅ | ⚠️ | ⚠️ | ⚠️ |
146
+
147
+ **agenttest** = pytest for agents. No dashboards. No SaaS. Your tests, your repo, your CI.
148
+
149
+ ---
150
+
151
+ ## Config
152
+
153
+ ```toml
154
+ # agenttest.toml
155
+ [agenttest]
156
+ model = "claude-3-5-haiku-latest"
157
+ timeout_seconds = 30
158
+ workers = 4
159
+ fail_threshold = 0.8
160
+ cache = true
161
+
162
+ [agenttest.env]
163
+ ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Contributing
169
+
170
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
171
+
172
+ ## License
173
+
174
+ MIT
@@ -0,0 +1,155 @@
1
+ # agenttest
2
+
3
+ **The pytest of AI agents.** Catch regressions before they reach prod.
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/agenttest-py.svg)](https://pypi.org/project/agenttest-py/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
8
+ [![CI](https://github.com/ShashStudios/agenttest/actions/workflows/agenttest.yml/badge.svg)](https://github.com/ShashStudios/agenttest/actions)
9
+
10
+ ---
11
+
12
+ ## You ship an agent. You change a prompt. Did it get better or worse?
13
+
14
+ **You have no idea.** No test suite. No CI. No diff. Just deploy and hope.
15
+
16
+ Every team shipping AI agents hits the same wall: your "eval" is manually pasting examples into a playground. One prompt tweak could break everything—or fix everything—and you won't know until a user complains.
17
+
18
+ ---
19
+
20
+ ## 30-Second Quickstart
21
+
22
+ ```bash
23
+ pip install agenttest-py
24
+ export ANTHROPIC_API_KEY=your_key
25
+ agenttest init
26
+ agenttest run
27
+ ```
28
+
29
+ **Or from scratch:**
30
+
31
+ ```python
32
+ # agent_test_example.py
33
+ from agenttest import eval, judge
34
+
35
+ def my_agent(query: str) -> str:
36
+ return "Your agent's response" # Replace with real agent
37
+
38
+ @eval
39
+ def test_customer_support():
40
+ response = my_agent("I want a refund")
41
+ assert judge.tone(response) == "empathetic"
42
+ assert judge.no_hallucination(response)
43
+ ```
44
+
45
+ ```bash
46
+ agenttest run
47
+ ```
48
+
49
+ ---
50
+
51
+ ## Features
52
+
53
+ - **Code-first** — Tests are just Python. No YAML. No config hell.
54
+ - **LLM-as-judge** — 9 built-in scorers: tone, hallucination, relevance, toxicity, faithfulness, conciseness, custom criteria, A/B compare.
55
+ - **Local & CI** — Runs anywhere. Add 4 lines to GitHub Actions. No account. No dashboard.
56
+ - **`agenttest diff`** — Side-by-side view of how your agent's responses changed between two runs. The git diff for agent behavior.
57
+ - **Caching** — Judge results cached in `.agenttest_cache/` to avoid redundant API calls.
58
+ - **Parallel** — `--workers 4` for faster runs.
59
+
60
+ ---
61
+
62
+ ## agenttest diff — The Git Diff for Agent Behavior
63
+
64
+ See exactly how your agent's responses changed between two runs:
65
+
66
+ ```bash
67
+ agenttest run --tag v1 # Before your prompt change
68
+ agenttest run --tag v2 # After your prompt change
69
+ agenttest diff v1 v2
70
+ ```
71
+
72
+ ```
73
+ test_customer_support_refund:
74
+ BEFORE: "I cannot help with refunds" pass
75
+ AFTER: "I'd be happy to process that for you" pass
76
+ DELTA: ✓ improved
77
+
78
+ test_helpful_tone:
79
+ BEFORE: "Our policy states no returns" fail
80
+ AFTER: "I'm sorry to hear that. Let me help" pass
81
+ DELTA: +1 ↑
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Judge Functions
87
+
88
+ | Function | Returns | Description |
89
+ |----------|---------|-------------|
90
+ | `judge.tone(response)` | `str` | empathetic, professional, aggressive, neutral |
91
+ | `judge.no_hallucination(response, context?)` | `bool` | True if no hallucination |
92
+ | `judge.contains_action(response, action)` | `bool` | Response mentions/implies the action |
93
+ | `judge.relevance(response, query)` | `float` | 0.0–1.0 relevance |
94
+ | `judge.toxicity(response)` | `bool` | True if toxic |
95
+ | `judge.faithfulness(response, source)` | `float` | 0.0–1.0 faithfulness |
96
+ | `judge.conciseness(response)` | `str` | too_short, good, too_long |
97
+ | `judge.score(response, criteria)` | `float` | Custom 0.0–1.0 score |
98
+ | `judge.compare(a, b, criteria)` | `str` | "a", "b", or "tie" |
99
+
100
+ ---
101
+
102
+ ## CI in 4 Lines
103
+
104
+ ```yaml
105
+ # .github/workflows/agenttest.yml
106
+ - run: pip install agenttest-py
107
+ - run: agenttest run
108
+ env:
109
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
110
+ ```
111
+
112
+ Every PR shows whether your agent got better or worse.
113
+
114
+ ---
115
+
116
+ ## agenttest vs Braintrust / DeepEval / Promptfoo
117
+
118
+ | | agenttest | Braintrust | DeepEval | Promptfoo |
119
+ |---|:---:|:---:|:---:|:---:|
120
+ | **No account required** | ✅ | ❌ | ❌ | ❌ |
121
+ | **No vendor lock-in** | ✅ | ❌ | ❌ | ❌ |
122
+ | **Lives in your codebase** | ✅ | ❌ | ❌ | ❌ |
123
+ | **Behavior diff (before/after)** | ✅ | ❌ | ❌ | ❌ |
124
+ | **Runs locally** | ✅ | ✅ | ✅ | ✅ |
125
+ | **MIT License** | ✅ | ❌ | ❌ | ✅ |
126
+ | **Code-first API** | ✅ | ⚠️ | ⚠️ | ⚠️ |
127
+
128
+ **agenttest** = pytest for agents. No dashboards. No SaaS. Your tests, your repo, your CI.
129
+
130
+ ---
131
+
132
+ ## Config
133
+
134
+ ```toml
135
+ # agenttest.toml
136
+ [agenttest]
137
+ model = "claude-3-5-haiku-latest"
138
+ timeout_seconds = 30
139
+ workers = 4
140
+ fail_threshold = 0.8
141
+ cache = true
142
+
143
+ [agenttest.env]
144
+ ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Contributing
150
+
151
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
152
+
153
+ ## License
154
+
155
+ MIT
@@ -0,0 +1,67 @@
1
+ """agenttest — The pytest of AI agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .record import record
6
+ from .assertions import (
7
+ assert_compare,
8
+ assert_conciseness,
9
+ assert_contains_action,
10
+ assert_faithfulness,
11
+ assert_no_hallucination,
12
+ assert_no_toxicity,
13
+ assert_relevance,
14
+ assert_score,
15
+ assert_tone,
16
+ )
17
+ from .judge import Judge, judge
18
+ from .reporter import load_last_report, report_results
19
+ from .runner import discover_tests, run_tests
20
+
21
+ __all__ = [
22
+ "eval",
23
+ "judge",
24
+ "record",
25
+ "EvalResult",
26
+ "Judge",
27
+ "assert_tone",
28
+ "assert_contains_action",
29
+ "assert_no_hallucination",
30
+ "assert_relevance",
31
+ "assert_no_toxicity",
32
+ "assert_faithfulness",
33
+ "assert_conciseness",
34
+ "assert_score",
35
+ "assert_compare",
36
+ "discover_tests",
37
+ "run_tests",
38
+ "load_last_report",
39
+ "report_results",
40
+ ]
41
+
42
+
43
def eval(fn: object) -> object:
    """Mark *fn* as an agenttest eval test.

    A pure passthrough: ``agenttest run`` discovers eval tests by
    scanning source files for the ``@eval`` marker, so no wrapping or
    runtime registration is needed here.
    """
    return fn
50
+
51
+
52
class EvalResult:
    """Result of a single eval run.

    Attributes:
        test_name: Name of the eval test function that produced this result.
        status: Result status string as reported by the runner.
        duration: Wall-clock runtime in seconds (0.0 when not measured).
        error_message: Failure detail, or ``None`` when there was no error.
        scores: Judge scores keyed by metric name (empty dict when none).
    """

    def __init__(
        self,
        test_name: str,
        status: str,
        duration: float = 0.0,
        error_message: str | None = None,
        scores: dict[str, float] | None = None,
    ):
        self.test_name = test_name
        self.status = status
        self.duration = duration
        self.error_message = error_message
        # Normalize a missing/empty mapping to a fresh dict so callers can
        # always iterate or update scores without a None check.
        self.scores = scores or {}

    def __repr__(self) -> str:
        """Unambiguous representation for debugging and reports."""
        return (
            f"{type(self).__name__}(test_name={self.test_name!r}, "
            f"status={self.status!r}, duration={self.duration!r}, "
            f"error_message={self.error_message!r}, scores={self.scores!r})"
        )
@@ -0,0 +1,6 @@
1
+ """Allow running as python -m agenttest."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()