agenteval-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. agenteval_py-0.1.0/.github/workflows/ci.yml +36 -0
  2. agenteval_py-0.1.0/.github/workflows/publish.yml +30 -0
  3. agenteval_py-0.1.0/.gitignore +104 -0
  4. agenteval_py-0.1.0/.pre-commit-config.yaml +12 -0
  5. agenteval_py-0.1.0/CHANGELOG.md +31 -0
  6. agenteval_py-0.1.0/CONTRIBUTING.md +130 -0
  7. agenteval_py-0.1.0/LICENSE +21 -0
  8. agenteval_py-0.1.0/PKG-INFO +561 -0
  9. agenteval_py-0.1.0/README.md +520 -0
  10. agenteval_py-0.1.0/docs/adapters.md +182 -0
  11. agenteval_py-0.1.0/docs/assertions.md +202 -0
  12. agenteval_py-0.1.0/docs/cli.md +164 -0
  13. agenteval_py-0.1.0/docs/quickstart.md +147 -0
  14. agenteval_py-0.1.0/docs/tracer.md +151 -0
  15. agenteval_py-0.1.0/examples/README.md +59 -0
  16. agenteval_py-0.1.0/examples/live_llm/README.md +104 -0
  17. agenteval_py-0.1.0/examples/live_llm/eval_anthropic_support_agent.py +577 -0
  18. agenteval_py-0.1.0/examples/live_llm/eval_openai_support_agent.py +260 -0
  19. agenteval_py-0.1.0/examples/test_demo_agent.py +32 -0
  20. agenteval_py-0.1.0/examples/test_realistic_agents.py +240 -0
  21. agenteval_py-0.1.0/pyproject.toml +76 -0
  22. agenteval_py-0.1.0/src/agenteval/__init__.py +46 -0
  23. agenteval_py-0.1.0/src/agenteval/adapters/__init__.py +9 -0
  24. agenteval_py-0.1.0/src/agenteval/adapters/anthropic_adapter.py +80 -0
  25. agenteval_py-0.1.0/src/agenteval/adapters/langchain_adapter.py +135 -0
  26. agenteval_py-0.1.0/src/agenteval/adapters/openai_adapter.py +80 -0
  27. agenteval_py-0.1.0/src/agenteval/assertions.py +289 -0
  28. agenteval_py-0.1.0/src/agenteval/cli.py +93 -0
  29. agenteval_py-0.1.0/src/agenteval/exceptions.py +17 -0
  30. agenteval_py-0.1.0/src/agenteval/models.py +123 -0
  31. agenteval_py-0.1.0/src/agenteval/py.typed +0 -0
  32. agenteval_py-0.1.0/src/agenteval/registry.py +99 -0
  33. agenteval_py-0.1.0/src/agenteval/reporter.py +139 -0
  34. agenteval_py-0.1.0/src/agenteval/runner.py +119 -0
  35. agenteval_py-0.1.0/src/agenteval/suite.py +181 -0
  36. agenteval_py-0.1.0/src/agenteval/tracer.py +303 -0
  37. agenteval_py-0.1.0/tests/__init__.py +0 -0
  38. agenteval_py-0.1.0/tests/adapters/__init__.py +0 -0
  39. agenteval_py-0.1.0/tests/adapters/test_anthropic_adapter.py +57 -0
  40. agenteval_py-0.1.0/tests/adapters/test_openai_adapter.py +65 -0
  41. agenteval_py-0.1.0/tests/conftest.py +13 -0
  42. agenteval_py-0.1.0/tests/test_assertions.py +297 -0
  43. agenteval_py-0.1.0/tests/test_models.py +154 -0
  44. agenteval_py-0.1.0/tests/test_registry.py +113 -0
  45. agenteval_py-0.1.0/tests/test_reporter.py +118 -0
  46. agenteval_py-0.1.0/tests/test_runner.py +122 -0
  47. agenteval_py-0.1.0/tests/test_suite.py +154 -0
  48. agenteval_py-0.1.0/tests/test_tracer.py +222 -0
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.11", "3.12", "3.13"]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+
23
+ - name: Install uv
24
+ uses: astral-sh/setup-uv@v3
25
+
26
+ - name: Install dependencies
27
+ run: uv pip install -e ".[dev,all]" --system
28
+
29
+ - name: Lint
30
+ run: ruff check src/ tests/ --select F,I
31
+
32
+ - name: Test
33
+ run: pytest tests/ -v --tb=short
34
+
35
+ - name: Build package
36
+ run: python -m build
@@ -0,0 +1,30 @@
1
+ name: Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ contents: read
13
+ id-token: write
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.11"
22
+
23
+ - name: Install build
24
+ run: python -m pip install --upgrade build
25
+
26
+ - name: Build package
27
+ run: python -m build
28
+
29
+ - name: Publish to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,104 @@
1
+ # ----------------------
2
+ # Python basics
3
+ # ----------------------
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # ----------------------
12
+ # Virtual environments
13
+ # ----------------------
14
+ .venv/
15
+ venv/
16
+ env/
17
+ ENV/
18
+
19
+ # uv-specific (lock + cache)
20
+ .uv/
21
+ uv.lock
22
+
23
+ # ----------------------
24
+ # Distribution / packaging
25
+ # ----------------------
26
+ build/
27
+ dist/
28
+ *.egg-info/
29
+ .eggs/
30
+ pip-wheel-metadata/
31
+
32
+ # ----------------------
33
+ # Logs & runtime files
34
+ # ----------------------
35
+ *.log
36
+ logs/
37
+ *.pid
38
+ *.seed
39
+
40
+ # ----------------------
41
+ # Environment variables
42
+ # ----------------------
43
+ .env
44
+ .env.*
45
+ *.env
46
+
47
+ # ----------------------
48
+ # IDE / Editor
49
+ # ----------------------
50
+ .vscode/
51
+ .idea/
52
+ *.swp
53
+ *.swo
54
+ *~
55
+
56
+ # ----------------------
57
+ # macOS
58
+ # ----------------------
59
+ .DS_Store
60
+
61
+ # ----------------------
62
+ # Testing
63
+ # ----------------------
64
+ .pytest_cache/
65
+ .coverage
66
+ coverage.xml
67
+ htmlcov/
68
+
69
+ # ----------------------
70
+ # Type checking
71
+ # ----------------------
72
+ .mypy_cache/
73
+ .pyre/
74
+
75
+ # ----------------------
76
+ # Ruff / linting
77
+ # ----------------------
78
+ .ruff_cache/
79
+
80
+ # ----------------------
81
+ # Jupyter
82
+ # ----------------------
83
+ .ipynb_checkpoints/
84
+
85
+ # ----------------------
86
+ # CLI / Agent artifacts
87
+ # ----------------------
88
+ outputs/
89
+ runs/
90
+ artifacts/
91
+ cache/
92
+ tmp/
93
+
94
+ # ----------------------
95
+ # Secrets / API keys
96
+ # ----------------------
97
+ secrets.json
98
+ config.local.json
99
+
100
+ # ----------------------
101
+ # Misc
102
+ # ----------------------
103
+ *.sqlite3
104
+ *.db
@@ -0,0 +1,12 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.4.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: https://github.com/RobertCraigie/pyright-python
9
+ rev: v1.1.360
10
+ hooks:
11
+ - id: pyright
12
+ stages: [manual]
@@ -0,0 +1,31 @@
1
+ # Changelog
2
+
3
+ All notable changes to agenteval are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project uses [Semantic Versioning](https://semver.org/).
4
+
5
+ ---
6
+
7
+ ## [Unreleased]
8
+
9
+ Nothing yet.
10
+
11
+ ---
12
+
13
+ ## [0.1.0] — 2024-01-01
14
+
15
+ Initial release.
16
+
17
+ ### Added
18
+
19
+ - **Tracer** — records tool calls (name, arguments, result, duration, error) and run boundaries via `tracer.wrap()`, `@tracer.tool`, and `async with tracer.run()`
20
+ - **AssertionSet** — fluent, chainable assertion API with collected failures: `called_tool`, `never_called_tool`, `tool_call_count`, `tool_called_before`, `tool_called_with_args`, `completed_within_steps`, `completed_within_seconds`, `response_contains`, `response_matches_schema`, `no_errors`, `custom`
21
+ - **Runner** — executes a test function N times concurrently using `anyio`, supports both sync and async test functions
22
+ - **`@agenteval.test` decorator** — registers test functions with `n`, `threshold`, and `tags` parameters; supports both bare and parameterized forms
23
+ - **Suite runner** — discovers `test_*.py` files, imports them, and runs all registered tests
24
+ - **RichReporter** — color-coded terminal output (✅ ⚠️ ❌) with pass rate, timing, step count, and optional per-run trace details; JSON export for CI
25
+ - **CLI** — `agenteval run` and `agenteval report` commands via Typer
26
+ - **OpenAI adapter** — `wrap_tools()` and `extract_token_usage()` for OpenAI function calling
27
+ - **Anthropic adapter** — `wrap_tools()` and `extract_token_usage()` for Anthropic tool use (including cache token fields)
28
+ - **LangChain adapter** — `AgentEvalCallbackHandler` that auto-connects to the active `Tracer` via `ContextVar`, enabling concurrent runs with full isolation
29
+ - **Data models** — `AgentTrace`, `ToolCall`, `TestResult`, `SuiteResult` — all Pydantic v2, fully serializable to JSON
30
+ - **Typed** — `py.typed` marker included for downstream type checkers
31
+ - **CI** — GitHub Actions workflow testing Python 3.11, 3.12, and 3.13
@@ -0,0 +1,130 @@
1
+ # Contributing to agenteval
2
+
3
+ Thanks for taking the time to contribute. This document covers everything you need to get the development environment running, the conventions the codebase follows, and what a good pull request looks like.
4
+
5
+ ---
6
+
7
+ ## Getting set up
8
+
9
+ You'll need Python 3.11 or later. The project uses `pip` for dependency management and `hatchling` as the build backend.
10
+
11
+ ```bash
12
+ git clone https://github.com/awesome-pro/agenteval
13
+ cd agenteval
14
+
15
+ # Install in editable mode with all dev dependencies
16
+ pip install -e ".[dev]"
17
+
18
+ # Install pre-commit hooks (runs ruff on every commit automatically)
19
+ pre-commit install
20
+ ```
21
+
22
+ That's it. Run the tests to make sure everything is working:
23
+
24
+ ```bash
25
+ pytest tests/ -v
26
+ ```
27
+
28
+ All tests pass without any API keys. The framework is tested against mock agents.
29
+
30
+ ---
31
+
32
+ ## Project layout
33
+
34
+ ```
35
+ src/agenteval/ — library source code
36
+ tests/ — test suite (mirrors src/agenteval/ structure)
37
+ docs/ — topic-based documentation
38
+ .github/workflows/ — CI configuration (lint + tests + build on 3.11/3.12/3.13) and the PyPI publish workflow
39
+ ```
40
+
41
+ The source lives under `src/` to keep it cleanly separated from tests and config. When you add a new module, mirror it in `tests/` with a `test_` prefix.
42
+
43
+ ---
44
+
45
+ ## Code conventions
46
+
47
+ **Type annotations** — the package ships with type information and uses Pyright
48
+ for type checking. If you're working on typing-heavy changes, run `pyright src/`
49
+ before pushing.
50
+
51
+ **Formatting and linting** — Ruff handles both. The pre-commit hook runs it automatically on changed files. You can also run it manually:
52
+
53
+ ```bash
54
+ ruff check src/ tests/ --select F,I
55
+ ruff format src/ tests/
56
+ ```
57
+
58
+ **Async first** — the runner, tracer, and suite are all async-native. If you're adding functionality that touches the execution path, prefer `async def` and `anyio` primitives over `asyncio` directly. This keeps things backend-agnostic.
59
+
60
+ **No framework dependencies in core** — `src/agenteval/` (outside `adapters/`) should not import `openai`, `anthropic`, or `langchain`. Framework-specific code belongs in `src/agenteval/adapters/`.
61
+
62
+ **Keep the public API surface small** — `__init__.py` exports only what users actually need. If something is implementation detail, don't add it to `__all__`.
63
+
64
+ ---
65
+
66
+ ## Running specific test groups
67
+
68
+ ```bash
69
+ # Run everything
70
+ pytest tests/ -v
71
+
72
+ # Run a specific file
73
+ pytest tests/test_tracer.py -v
74
+
75
+ # Run with a keyword filter
76
+ pytest tests/ -k "assertion" -v
77
+
78
+ # Run adapter tests only
79
+ pytest tests/adapters/ -v
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Making changes
85
+
86
+ **For bug fixes:** open an issue first if the behavior is surprising or the fix is non-obvious. Small, clear bugs can go straight to a PR.
87
+
88
+ **For new features:** please open an issue or discussion before building anything significant. This saves everyone time if the design needs to change.
89
+
90
+ **For documentation:** PRs that improve clarity, fix typos, or add examples are always welcome without prior discussion.
91
+
92
+ ---
93
+
94
+ ## Pull request checklist
95
+
96
+ Before submitting, make sure:
97
+
98
+ - [ ] `pytest tests/ -v` passes
99
+ - [ ] `ruff check src/ tests/ --select F,I` passes with no errors
100
+ - [ ] `python -m build` completes successfully
101
+ - [ ] New behavior has corresponding tests
102
+ - [ ] If you added a public API, it's documented in `docs/` and/or the relevant docstring
103
+ - [ ] Commit messages are descriptive (what changed and roughly why, not just "fix bug")
104
+
105
+ ---
106
+
107
+ ## Commit style
108
+
109
+ No strict convention here, but a good commit message answers two questions: what changed, and why. One-liners are fine for small changes:
110
+
111
+ ```
112
+ fix: tool wrapper now records arguments for positional-only params
113
+ ```
114
+
115
+ For anything that took real thought, add a short body:
116
+
117
+ ```
118
+ feat: add response_matches_schema assertion
119
+
120
+ Agents returning structured JSON are common enough that a schema
121
+ validation assertion makes sense as a first-class feature. Uses
122
+ Pydantic v2 model_validate under the hood, which also handles
123
+ JSON string inputs automatically.
124
+ ```
125
+
126
+ ---
127
+
128
+ ## Questions
129
+
130
+ If something in the codebase is confusing or the docs are unclear, opening an issue to ask is completely fine. Clear documentation is a feature.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Abhinandan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.