agenteval-py 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenteval_py-0.1.0/.github/workflows/ci.yml +36 -0
- agenteval_py-0.1.0/.github/workflows/publish.yml +30 -0
- agenteval_py-0.1.0/.gitignore +104 -0
- agenteval_py-0.1.0/.pre-commit-config.yaml +12 -0
- agenteval_py-0.1.0/CHANGELOG.md +31 -0
- agenteval_py-0.1.0/CONTRIBUTING.md +130 -0
- agenteval_py-0.1.0/LICENSE +21 -0
- agenteval_py-0.1.0/PKG-INFO +561 -0
- agenteval_py-0.1.0/README.md +520 -0
- agenteval_py-0.1.0/docs/adapters.md +182 -0
- agenteval_py-0.1.0/docs/assertions.md +202 -0
- agenteval_py-0.1.0/docs/cli.md +164 -0
- agenteval_py-0.1.0/docs/quickstart.md +147 -0
- agenteval_py-0.1.0/docs/tracer.md +151 -0
- agenteval_py-0.1.0/examples/README.md +59 -0
- agenteval_py-0.1.0/examples/live_llm/README.md +104 -0
- agenteval_py-0.1.0/examples/live_llm/eval_anthropic_support_agent.py +577 -0
- agenteval_py-0.1.0/examples/live_llm/eval_openai_support_agent.py +260 -0
- agenteval_py-0.1.0/examples/test_demo_agent.py +32 -0
- agenteval_py-0.1.0/examples/test_realistic_agents.py +240 -0
- agenteval_py-0.1.0/pyproject.toml +76 -0
- agenteval_py-0.1.0/src/agenteval/__init__.py +46 -0
- agenteval_py-0.1.0/src/agenteval/adapters/__init__.py +9 -0
- agenteval_py-0.1.0/src/agenteval/adapters/anthropic_adapter.py +80 -0
- agenteval_py-0.1.0/src/agenteval/adapters/langchain_adapter.py +135 -0
- agenteval_py-0.1.0/src/agenteval/adapters/openai_adapter.py +80 -0
- agenteval_py-0.1.0/src/agenteval/assertions.py +289 -0
- agenteval_py-0.1.0/src/agenteval/cli.py +93 -0
- agenteval_py-0.1.0/src/agenteval/exceptions.py +17 -0
- agenteval_py-0.1.0/src/agenteval/models.py +123 -0
- agenteval_py-0.1.0/src/agenteval/py.typed +0 -0
- agenteval_py-0.1.0/src/agenteval/registry.py +99 -0
- agenteval_py-0.1.0/src/agenteval/reporter.py +139 -0
- agenteval_py-0.1.0/src/agenteval/runner.py +119 -0
- agenteval_py-0.1.0/src/agenteval/suite.py +181 -0
- agenteval_py-0.1.0/src/agenteval/tracer.py +303 -0
- agenteval_py-0.1.0/tests/__init__.py +0 -0
- agenteval_py-0.1.0/tests/adapters/__init__.py +0 -0
- agenteval_py-0.1.0/tests/adapters/test_anthropic_adapter.py +57 -0
- agenteval_py-0.1.0/tests/adapters/test_openai_adapter.py +65 -0
- agenteval_py-0.1.0/tests/conftest.py +13 -0
- agenteval_py-0.1.0/tests/test_assertions.py +297 -0
- agenteval_py-0.1.0/tests/test_models.py +154 -0
- agenteval_py-0.1.0/tests/test_registry.py +113 -0
- agenteval_py-0.1.0/tests/test_reporter.py +118 -0
- agenteval_py-0.1.0/tests/test_runner.py +122 -0
- agenteval_py-0.1.0/tests/test_suite.py +154 -0
- agenteval_py-0.1.0/tests/test_tracer.py +222 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
|
|
23
|
+
- name: Install uv
|
|
24
|
+
uses: astral-sh/setup-uv@v3
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv pip install -e ".[dev,all]" --system
|
|
28
|
+
|
|
29
|
+
- name: Lint
|
|
30
|
+
run: ruff check src/ tests/ --select F,I
|
|
31
|
+
|
|
32
|
+
- name: Test
|
|
33
|
+
run: pytest tests/ -v --tb=short
|
|
34
|
+
|
|
35
|
+
- name: Build package
|
|
36
|
+
run: python -m build
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
id-token: write
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.11"
|
|
22
|
+
|
|
23
|
+
- name: Install build
|
|
24
|
+
run: python -m pip install --upgrade build
|
|
25
|
+
|
|
26
|
+
- name: Build package
|
|
27
|
+
run: python -m build
|
|
28
|
+
|
|
29
|
+
- name: Publish to PyPI
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# ----------------------
|
|
2
|
+
# Python basics
|
|
3
|
+
# ----------------------
|
|
4
|
+
__pycache__/
|
|
5
|
+
*.py[cod]
|
|
6
|
+
*$py.class
|
|
7
|
+
|
|
8
|
+
# C extensions
|
|
9
|
+
*.so
|
|
10
|
+
|
|
11
|
+
# ----------------------
|
|
12
|
+
# Virtual environments
|
|
13
|
+
# ----------------------
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
ENV/
|
|
18
|
+
|
|
19
|
+
# uv-specific (lock + cache)
|
|
20
|
+
.uv/
|
|
21
|
+
uv.lock
|
|
22
|
+
|
|
23
|
+
# ----------------------
|
|
24
|
+
# Distribution / packaging
|
|
25
|
+
# ----------------------
|
|
26
|
+
build/
|
|
27
|
+
dist/
|
|
28
|
+
*.egg-info/
|
|
29
|
+
.eggs/
|
|
30
|
+
pip-wheel-metadata/
|
|
31
|
+
|
|
32
|
+
# ----------------------
|
|
33
|
+
# Logs & runtime files
|
|
34
|
+
# ----------------------
|
|
35
|
+
*.log
|
|
36
|
+
logs/
|
|
37
|
+
*.pid
|
|
38
|
+
*.seed
|
|
39
|
+
|
|
40
|
+
# ----------------------
|
|
41
|
+
# Environment variables
|
|
42
|
+
# ----------------------
|
|
43
|
+
.env
|
|
44
|
+
.env.*
|
|
45
|
+
*.env
|
|
46
|
+
|
|
47
|
+
# ----------------------
|
|
48
|
+
# IDE / Editor
|
|
49
|
+
# ----------------------
|
|
50
|
+
.vscode/
|
|
51
|
+
.idea/
|
|
52
|
+
*.swp
|
|
53
|
+
*.swo
|
|
54
|
+
*~
|
|
55
|
+
|
|
56
|
+
# ----------------------
|
|
57
|
+
# macOS
|
|
58
|
+
# ----------------------
|
|
59
|
+
.DS_Store
|
|
60
|
+
|
|
61
|
+
# ----------------------
|
|
62
|
+
# Testing
|
|
63
|
+
# ----------------------
|
|
64
|
+
.pytest_cache/
|
|
65
|
+
.coverage
|
|
66
|
+
coverage.xml
|
|
67
|
+
htmlcov/
|
|
68
|
+
|
|
69
|
+
# ----------------------
|
|
70
|
+
# Type checking
|
|
71
|
+
# ----------------------
|
|
72
|
+
.mypy_cache/
|
|
73
|
+
.pyre/
|
|
74
|
+
|
|
75
|
+
# ----------------------
|
|
76
|
+
# Ruff / linting
|
|
77
|
+
# ----------------------
|
|
78
|
+
.ruff_cache/
|
|
79
|
+
|
|
80
|
+
# ----------------------
|
|
81
|
+
# Jupyter
|
|
82
|
+
# ----------------------
|
|
83
|
+
.ipynb_checkpoints/
|
|
84
|
+
|
|
85
|
+
# ----------------------
|
|
86
|
+
# CLI / Agent artifacts
|
|
87
|
+
# ----------------------
|
|
88
|
+
outputs/
|
|
89
|
+
runs/
|
|
90
|
+
artifacts/
|
|
91
|
+
cache/
|
|
92
|
+
tmp/
|
|
93
|
+
|
|
94
|
+
# ----------------------
|
|
95
|
+
# Secrets / API keys
|
|
96
|
+
# ----------------------
|
|
97
|
+
secrets.json
|
|
98
|
+
config.local.json
|
|
99
|
+
|
|
100
|
+
# ----------------------
|
|
101
|
+
# Misc
|
|
102
|
+
# ----------------------
|
|
103
|
+
*.sqlite3
|
|
104
|
+
*.db
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to agenteval are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project uses [Semantic Versioning](https://semver.org/).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
Nothing yet.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## [0.1.0] — 2024-01-01
|
|
14
|
+
|
|
15
|
+
Initial release.
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
|
|
19
|
+
- **Tracer** — records tool calls (name, arguments, result, duration, error) and run boundaries via `tracer.wrap()`, `@tracer.tool`, and `async with tracer.run()`
|
|
20
|
+
- **AssertionSet** — fluent, chainable assertion API with collected failures: `called_tool`, `never_called_tool`, `tool_call_count`, `tool_called_before`, `tool_called_with_args`, `completed_within_steps`, `completed_within_seconds`, `response_contains`, `response_matches_schema`, `no_errors`, `custom`
|
|
21
|
+
- **Runner** — executes a test function N times concurrently using `anyio`; supports both sync and async test functions
|
|
22
|
+
- **`@agenteval.test` decorator** — registers test functions with `n`, `threshold`, and `tags` parameters; supports both bare and parameterized forms
|
|
23
|
+
- **Suite runner** — discovers `test_*.py` files, imports them, and runs all registered tests
|
|
24
|
+
- **RichReporter** — color-coded terminal output (✅ ⚠️ ❌) with pass rate, timing, step count, and optional per-run trace details; JSON export for CI
|
|
25
|
+
- **CLI** — `agenteval run` and `agenteval report` commands via Typer
|
|
26
|
+
- **OpenAI adapter** — `wrap_tools()` and `extract_token_usage()` for OpenAI function calling
|
|
27
|
+
- **Anthropic adapter** — `wrap_tools()` and `extract_token_usage()` for Anthropic tool use (including cache token fields)
|
|
28
|
+
- **LangChain adapter** — `AgentEvalCallbackHandler` that auto-connects to the active `Tracer` via `ContextVar`, enabling concurrent runs with full isolation
|
|
29
|
+
- **Data models** — `AgentTrace`, `ToolCall`, `TestResult`, `SuiteResult` — all Pydantic v2, fully serializable to JSON
|
|
30
|
+
- **Typed** — `py.typed` marker included for downstream type checkers
|
|
31
|
+
- **CI** — GitHub Actions workflow testing Python 3.11, 3.12, and 3.13
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Contributing to agenteval
|
|
2
|
+
|
|
3
|
+
Thanks for taking the time to contribute. This document covers everything you need to get the development environment running, the conventions the codebase follows, and what a good pull request looks like.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Getting set up
|
|
8
|
+
|
|
9
|
+
You'll need Python 3.11 or later. The project uses `pip` for dependency management and `hatchling` as the build backend.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
git clone https://github.com/awesome-pro/agenteval
|
|
13
|
+
cd agenteval
|
|
14
|
+
|
|
15
|
+
# Install in editable mode with all dev dependencies
|
|
16
|
+
pip install -e ".[dev]"
|
|
17
|
+
|
|
18
|
+
# Install pre-commit hooks (runs ruff on every commit automatically)
|
|
19
|
+
pre-commit install
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
That's it. Run the tests to make sure everything is working:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pytest tests/ -v
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
All tests pass without any API keys. The framework is tested against mock agents.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Project layout
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
src/agenteval/ — library source code
|
|
36
|
+
tests/ — test suite (mirrors src/agenteval/ structure)
|
|
37
|
+
docs/ — topic-based documentation
|
|
38
|
+
.github/workflows/ — CI (lint + tests + build on 3.11/3.12/3.13) and PyPI publish workflows
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
The source lives under `src/` to keep it cleanly separated from tests and config. When you add a new module, mirror it in `tests/` with a `test_` prefix.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Code conventions
|
|
46
|
+
|
|
47
|
+
**Type annotations** — the package ships with type information and uses Pyright
|
|
48
|
+
for type checking. If you're working on typing-heavy changes, run `pyright src/`
|
|
49
|
+
before pushing.
|
|
50
|
+
|
|
51
|
+
**Formatting and linting** — Ruff handles both. The pre-commit hook runs it automatically on changed files. You can also run it manually:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
ruff check src/ tests/ --select F,I
|
|
55
|
+
ruff format src/
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**Async first** — the runner, tracer, and suite are all async-native. If you're adding functionality that touches the execution path, prefer `async def` and `anyio` primitives over `asyncio` directly. This keeps things backend-agnostic.
|
|
59
|
+
|
|
60
|
+
**No framework dependencies in core** — `src/agenteval/` (outside `adapters/`) should not import `openai`, `anthropic`, or `langchain`. Framework-specific code belongs in `src/agenteval/adapters/`.
|
|
61
|
+
|
|
62
|
+
**Keep the public API surface small** — `__init__.py` exports only what users actually need. If something is an implementation detail, don't add it to `__all__`.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Running specific test groups
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Run everything
|
|
70
|
+
pytest tests/ -v
|
|
71
|
+
|
|
72
|
+
# Run a specific file
|
|
73
|
+
pytest tests/test_tracer.py -v
|
|
74
|
+
|
|
75
|
+
# Run with a keyword filter
|
|
76
|
+
pytest tests/ -k "assertion" -v
|
|
77
|
+
|
|
78
|
+
# Run adapter tests only
|
|
79
|
+
pytest tests/adapters/ -v
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Making changes
|
|
85
|
+
|
|
86
|
+
**For bug fixes:** open an issue first if the behavior is surprising or the fix is non-obvious. Small, clear bugs can go straight to a PR.
|
|
87
|
+
|
|
88
|
+
**For new features:** please open an issue or discussion first before building anything significant. This saves everyone time if the design needs to change.
|
|
89
|
+
|
|
90
|
+
**For documentation:** PRs that improve clarity, fix typos, or add examples are always welcome without prior discussion.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Pull request checklist
|
|
95
|
+
|
|
96
|
+
Before submitting, make sure:
|
|
97
|
+
|
|
98
|
+
- [ ] `pytest tests/ -v` passes
|
|
99
|
+
- [ ] `ruff check src/ tests/ --select F,I` passes with no errors
|
|
100
|
+
- [ ] `python -m build` completes successfully
|
|
101
|
+
- [ ] New behavior has corresponding tests
|
|
102
|
+
- [ ] If you added a public API, it's documented in `docs/` and/or the relevant docstring
|
|
103
|
+
- [ ] Commit messages are descriptive (what changed and roughly why, not just "fix bug")
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Commit style
|
|
108
|
+
|
|
109
|
+
No strict convention here, but a good commit message answers two questions: what changed, and why. One-liners are fine for small changes:
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
fix: tool wrapper now records arguments for positional-only params
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
For anything that took real thought, add a short body:
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
feat: add response_matches_schema assertion
|
|
119
|
+
|
|
120
|
+
Agents returning structured JSON are common enough that a schema
|
|
121
|
+
validation assertion makes sense as a first-class feature. Uses
|
|
122
|
+
Pydantic v2 model_validate under the hood, which also handles
|
|
123
|
+
JSON string inputs automatically.
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Questions
|
|
129
|
+
|
|
130
|
+
If something in the codebase is confusing or the docs are unclear, opening an issue to ask is completely fine. Clear documentation is a feature.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Abhinandan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|