aevals 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aevals-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +26 -0
- aevals-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +13 -0
- aevals-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +13 -0
- aevals-0.1.0/.github/workflows/ci.yml +50 -0
- aevals-0.1.0/.github/workflows/release.yml +83 -0
- aevals-0.1.0/.gitignore +219 -0
- aevals-0.1.0/.pre-commit-config.yaml +16 -0
- aevals-0.1.0/CHANGELOG.md +22 -0
- aevals-0.1.0/CLAUDE.md +74 -0
- aevals-0.1.0/CONTRIBUTING.md +62 -0
- aevals-0.1.0/LICENSE +21 -0
- aevals-0.1.0/PKG-INFO +190 -0
- aevals-0.1.0/README.md +140 -0
- aevals-0.1.0/cliff.toml +48 -0
- aevals-0.1.0/examples/booking-agent/README.md +32 -0
- aevals-0.1.0/examples/booking-agent/aevals.yaml +57 -0
- aevals-0.1.0/examples/booking-agent/agent.py +131 -0
- aevals-0.1.0/examples/booking-agent/requirements.txt +3 -0
- aevals-0.1.0/examples/booking-agent/tools.py +51 -0
- aevals-0.1.0/examples/sdr-agent/README.md +40 -0
- aevals-0.1.0/examples/sdr-agent/aevals.yaml +127 -0
- aevals-0.1.0/examples/sdr-agent/agent.py +258 -0
- aevals-0.1.0/examples/sdr-agent/requirements.txt +3 -0
- aevals-0.1.0/examples/sdr-agent/tools.py +396 -0
- aevals-0.1.0/pyproject.toml +95 -0
- aevals-0.1.0/scripts/release.sh +85 -0
- aevals-0.1.0/src/aevals/__init__.py +3 -0
- aevals-0.1.0/src/aevals/__main__.py +6 -0
- aevals-0.1.0/src/aevals/capture/__init__.py +0 -0
- aevals-0.1.0/src/aevals/capture/otel.py +236 -0
- aevals-0.1.0/src/aevals/capture/schema.py +38 -0
- aevals-0.1.0/src/aevals/cli/__init__.py +20 -0
- aevals-0.1.0/src/aevals/cli/init.py +80 -0
- aevals-0.1.0/src/aevals/cli/mcp.py +13 -0
- aevals-0.1.0/src/aevals/cli/run.py +240 -0
- aevals-0.1.0/src/aevals/config/__init__.py +0 -0
- aevals-0.1.0/src/aevals/config/loader.py +51 -0
- aevals-0.1.0/src/aevals/config/schema.py +77 -0
- aevals-0.1.0/src/aevals/constants.py +20 -0
- aevals-0.1.0/src/aevals/evals/__init__.py +0 -0
- aevals-0.1.0/src/aevals/evals/constraints.py +111 -0
- aevals-0.1.0/src/aevals/evals/judge.py +178 -0
- aevals-0.1.0/src/aevals/evals/runner.py +62 -0
- aevals-0.1.0/src/aevals/evals/types.py +37 -0
- aevals-0.1.0/src/aevals/mcp/__init__.py +0 -0
- aevals-0.1.0/src/aevals/mcp/server.py +204 -0
- aevals-0.1.0/src/aevals/output/__init__.py +0 -0
- aevals-0.1.0/src/aevals/output/json.py +58 -0
- aevals-0.1.0/src/aevals/output/terminal.py +106 -0
- aevals-0.1.0/src/aevals/py.typed +0 -0
- aevals-0.1.0/src/aevals/scanner/__init__.py +0 -0
- aevals-0.1.0/src/aevals/scanner/detect.py +136 -0
- aevals-0.1.0/src/aevals/scanner/wrapper.py +53 -0
- aevals-0.1.0/tests/__init__.py +0 -0
- aevals-0.1.0/tests/conftest.py +206 -0
- aevals-0.1.0/tests/test_cli_init.py +81 -0
- aevals-0.1.0/tests/test_cli_run.py +101 -0
- aevals-0.1.0/tests/test_cli_run_integration.py +207 -0
- aevals-0.1.0/tests/test_config_loader.py +89 -0
- aevals-0.1.0/tests/test_config_schema.py +126 -0
- aevals-0.1.0/tests/test_constants.py +42 -0
- aevals-0.1.0/tests/test_constraints.py +196 -0
- aevals-0.1.0/tests/test_extract_output.py +54 -0
- aevals-0.1.0/tests/test_judge.py +224 -0
- aevals-0.1.0/tests/test_mcp_server.py +212 -0
- aevals-0.1.0/tests/test_otel.py +126 -0
- aevals-0.1.0/tests/test_otel_exporter.py +122 -0
- aevals-0.1.0/tests/test_output_json.py +70 -0
- aevals-0.1.0/tests/test_output_terminal.py +64 -0
- aevals-0.1.0/tests/test_runner.py +94 -0
- aevals-0.1.0/tests/test_scanner.py +122 -0
- aevals-0.1.0/tests/test_wrapper.py +57 -0
- aevals-0.1.0/website/.gitignore +33 -0
- aevals-0.1.0/website/app/apple-icon.png +0 -0
- aevals-0.1.0/website/app/favicon.ico +0 -0
- aevals-0.1.0/website/app/globals.css +948 -0
- aevals-0.1.0/website/app/layout.tsx +56 -0
- aevals-0.1.0/website/app/page.tsx +10 -0
- aevals-0.1.0/website/components/AsciiCube.tsx +0 -0
- aevals-0.1.0/website/components/CodeEditor.tsx +206 -0
- aevals-0.1.0/website/components/CopyButton.tsx +90 -0
- aevals-0.1.0/website/components/InstallTabs.tsx +76 -0
- aevals-0.1.0/website/components/LandingPage.tsx +350 -0
- aevals-0.1.0/website/components/MobileMenu.tsx +56 -0
- aevals-0.1.0/website/components/ScrambleLogo.tsx +66 -0
- aevals-0.1.0/website/next.config.ts +7 -0
- aevals-0.1.0/website/package.json +22 -0
- aevals-0.1.0/website/pnpm-lock.yaml +599 -0
- aevals-0.1.0/website/public/aevals.svg +9 -0
- aevals-0.1.0/website/public/android-chrome-192x192.png +0 -0
- aevals-0.1.0/website/public/android-chrome-512x512.png +0 -0
- aevals-0.1.0/website/public/favicon-16x16.png +0 -0
- aevals-0.1.0/website/public/favicon-32x32.png +0 -0
- aevals-0.1.0/website/public/llms.txt +42 -0
- aevals-0.1.0/website/tsconfig.json +21 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
labels: bug
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
**Describe the bug**
|
|
7
|
+
A clear description of what's broken.
|
|
8
|
+
|
|
9
|
+
**To reproduce**
|
|
10
|
+
Steps to reproduce:
|
|
11
|
+
1. ...
|
|
12
|
+
2. ...
|
|
13
|
+
|
|
14
|
+
**Expected behavior**
|
|
15
|
+
What you expected to happen.
|
|
16
|
+
|
|
17
|
+
**Environment**
|
|
18
|
+
- OS:
|
|
19
|
+
- Python version:
|
|
20
|
+
- aevals version:
|
|
21
|
+
- LLM SDK + version:
|
|
22
|
+
|
|
23
|
+
**Logs / traceback**
|
|
24
|
+
```
|
|
25
|
+
paste here
|
|
26
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
labels: enhancement
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
**Problem**
|
|
7
|
+
What are you trying to do that isn't possible or is harder than it should be?
|
|
8
|
+
|
|
9
|
+
**Proposed solution**
|
|
10
|
+
How you'd like it to work.
|
|
11
|
+
|
|
12
|
+
**Alternatives considered**
|
|
13
|
+
Other approaches you've thought about.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
workflow_call:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.12"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Lint
|
|
28
|
+
run: ruff check src/ tests/
|
|
29
|
+
|
|
30
|
+
- name: Test
|
|
31
|
+
run: pytest --cov=aevals --cov-report=term-missing --cov-fail-under=90
|
|
32
|
+
|
|
33
|
+
lint:
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@v4
|
|
37
|
+
|
|
38
|
+
- name: Set up Python
|
|
39
|
+
uses: actions/setup-python@v5
|
|
40
|
+
with:
|
|
41
|
+
python-version: "3.12"
|
|
42
|
+
|
|
43
|
+
- name: Install dependencies
|
|
44
|
+
run: pip install -e ".[dev]"
|
|
45
|
+
|
|
46
|
+
- name: Ruff check
|
|
47
|
+
run: ruff check src/ tests/
|
|
48
|
+
|
|
49
|
+
- name: Ruff format check
|
|
50
|
+
run: ruff format --check src/ tests/
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
# Gate: ensure CI passes before releasing
|
|
14
|
+
ci:
|
|
15
|
+
uses: ./.github/workflows/ci.yml
|
|
16
|
+
|
|
17
|
+
build:
|
|
18
|
+
needs: ci
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: "3.12"
|
|
27
|
+
|
|
28
|
+
- name: Install build tools
|
|
29
|
+
run: pip install build
|
|
30
|
+
|
|
31
|
+
- name: Verify tag matches package version
|
|
32
|
+
run: |
|
|
33
|
+
TAG_VERSION="${GITHUB_REF#refs/tags/v}"
|
|
34
|
+
PKG_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
35
|
+
if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
|
|
36
|
+
echo "::error::Tag version ($TAG_VERSION) does not match package version ($PKG_VERSION)"
|
|
37
|
+
exit 1
|
|
38
|
+
fi
|
|
39
|
+
|
|
40
|
+
- name: Build sdist and wheel
|
|
41
|
+
run: python -m build
|
|
42
|
+
|
|
43
|
+
- name: Upload dist artifacts
|
|
44
|
+
uses: actions/upload-artifact@v4
|
|
45
|
+
with:
|
|
46
|
+
name: dist
|
|
47
|
+
path: dist/
|
|
48
|
+
|
|
49
|
+
publish-pypi:
|
|
50
|
+
needs: build
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
environment: pypi
|
|
53
|
+
permissions:
|
|
54
|
+
id-token: write
|
|
55
|
+
steps:
|
|
56
|
+
- name: Download dist artifacts
|
|
57
|
+
uses: actions/download-artifact@v4
|
|
58
|
+
with:
|
|
59
|
+
name: dist
|
|
60
|
+
path: dist/
|
|
61
|
+
|
|
62
|
+
- name: Publish to PyPI
|
|
63
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
64
|
+
|
|
65
|
+
github-release:
|
|
66
|
+
needs: build
|
|
67
|
+
runs-on: ubuntu-latest
|
|
68
|
+
permissions:
|
|
69
|
+
contents: write
|
|
70
|
+
steps:
|
|
71
|
+
- uses: actions/checkout@v4
|
|
72
|
+
|
|
73
|
+
- name: Download dist artifacts
|
|
74
|
+
uses: actions/download-artifact@v4
|
|
75
|
+
with:
|
|
76
|
+
name: dist
|
|
77
|
+
path: dist/
|
|
78
|
+
|
|
79
|
+
- name: Create GitHub Release
|
|
80
|
+
uses: softprops/action-gh-release@v2
|
|
81
|
+
with:
|
|
82
|
+
generate_release_notes: true
|
|
83
|
+
files: dist/*
|
aevals-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other info into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
uv.lock
|
|
99
|
+
|
|
100
|
+
# poetry
|
|
101
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
102
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
103
|
+
# commonly ignored for libraries.
|
|
104
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
105
|
+
# poetry.lock
|
|
106
|
+
# poetry.toml
|
|
107
|
+
|
|
108
|
+
# pdm
|
|
109
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
110
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
111
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
112
|
+
# pdm.lock
|
|
113
|
+
# pdm.toml
|
|
114
|
+
.pdm-python
|
|
115
|
+
.pdm-build/
|
|
116
|
+
|
|
117
|
+
# pixi
|
|
118
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
119
|
+
# pixi.lock
|
|
120
|
+
# Pixi creates a virtual environment in the .pixi directory, just like the venv module creates one
|
|
121
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
122
|
+
.pixi
|
|
123
|
+
|
|
124
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
125
|
+
__pypackages__/
|
|
126
|
+
|
|
127
|
+
# Celery stuff
|
|
128
|
+
celerybeat-schedule
|
|
129
|
+
celerybeat.pid
|
|
130
|
+
|
|
131
|
+
# Redis
|
|
132
|
+
*.rdb
|
|
133
|
+
*.aof
|
|
134
|
+
*.pid
|
|
135
|
+
|
|
136
|
+
# RabbitMQ
|
|
137
|
+
mnesia/
|
|
138
|
+
rabbitmq/
|
|
139
|
+
rabbitmq-data/
|
|
140
|
+
|
|
141
|
+
# ActiveMQ
|
|
142
|
+
activemq-data/
|
|
143
|
+
|
|
144
|
+
# SageMath parsed files
|
|
145
|
+
*.sage.py
|
|
146
|
+
|
|
147
|
+
# Environments
|
|
148
|
+
.env
|
|
149
|
+
.envrc
|
|
150
|
+
.venv
|
|
151
|
+
env/
|
|
152
|
+
venv/
|
|
153
|
+
ENV/
|
|
154
|
+
env.bak/
|
|
155
|
+
venv.bak/
|
|
156
|
+
|
|
157
|
+
# Spyder project settings
|
|
158
|
+
.spyderproject
|
|
159
|
+
.spyproject
|
|
160
|
+
|
|
161
|
+
# Rope project settings
|
|
162
|
+
.ropeproject
|
|
163
|
+
|
|
164
|
+
# mkdocs documentation
|
|
165
|
+
/site
|
|
166
|
+
|
|
167
|
+
# mypy
|
|
168
|
+
.mypy_cache/
|
|
169
|
+
.dmypy.json
|
|
170
|
+
dmypy.json
|
|
171
|
+
|
|
172
|
+
# Pyre type checker
|
|
173
|
+
.pyre/
|
|
174
|
+
|
|
175
|
+
# pytype static type analyzer
|
|
176
|
+
.pytype/
|
|
177
|
+
|
|
178
|
+
# Cython debug symbols
|
|
179
|
+
cython_debug/
|
|
180
|
+
|
|
181
|
+
# PyCharm
|
|
182
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
183
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
184
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
185
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
186
|
+
# .idea/
|
|
187
|
+
|
|
188
|
+
# Abstra
|
|
189
|
+
# Abstra is an AI-powered process automation framework.
|
|
190
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
191
|
+
# Learn more at https://abstra.io/docs
|
|
192
|
+
.abstra/
|
|
193
|
+
|
|
194
|
+
# Visual Studio Code
|
|
195
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
196
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
197
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
198
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
199
|
+
# .vscode/
|
|
200
|
+
|
|
201
|
+
# Ruff stuff:
|
|
202
|
+
.ruff_cache/
|
|
203
|
+
|
|
204
|
+
# PyPI configuration file
|
|
205
|
+
.pypirc
|
|
206
|
+
|
|
207
|
+
# Marimo
|
|
208
|
+
marimo/_static/
|
|
209
|
+
marimo/_lsp/
|
|
210
|
+
__marimo__/
|
|
211
|
+
|
|
212
|
+
# Streamlit
|
|
213
|
+
.streamlit/secrets.toml
|
|
214
|
+
|
|
215
|
+
# OS
|
|
216
|
+
.DS_Store
|
|
217
|
+
|
|
218
|
+
# aevals local data (traces, results, wrapper)
|
|
219
|
+
.aevals/
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.15.5
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix, --exit-non-zero-on-fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
|
|
9
|
+
- repo: local
|
|
10
|
+
hooks:
|
|
11
|
+
- id: pyright
|
|
12
|
+
name: pyright
|
|
13
|
+
entry: pyright src/
|
|
14
|
+
language: system
|
|
15
|
+
types: [python]
|
|
16
|
+
pass_filenames: false
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
This changelog is automatically generated by [git-cliff](https://git-cliff.org/).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] - 2026-03-14
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- Core eval framework: subprocess isolation, OTel trace capture, two-track evaluation
|
|
12
|
+
- Five deterministic constraints: `max_duration_ms`, `max_steps`, `tool_sequence`, `no_repeat_calls`, `output_contains`
|
|
13
|
+
- LLM-as-judge rubric evaluation via litellm
|
|
14
|
+
- CLI commands: `aevals init`, `aevals run`, `aevals mcp-serve`
|
|
15
|
+
- MCP server with five tools: scan, init, run, results, trajectory
|
|
16
|
+
- AST-based SDK and entrypoint detection (scanner)
|
|
17
|
+
- Auto-instrumentation for 6 LLM SDKs via OpenLLMetry
|
|
18
|
+
- Rich terminal output and JSON output formats
|
|
19
|
+
- CI workflow with 90% coverage threshold
|
|
20
|
+
- Release workflow with PyPI publishing
|
|
21
|
+
- Example agents: booking-agent, sdr-agent
|
|
22
|
+
- Pre-commit hooks: ruff, pyright
|
aevals-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install
|
|
9
|
+
uv pip install -e ".[dev]"
|
|
10
|
+
|
|
11
|
+
# Test
|
|
12
|
+
pytest # all tests
|
|
13
|
+
pytest tests/test_constraints.py # single file
|
|
14
|
+
pytest tests/test_constraints.py::TestMaxDuration # single class
|
|
15
|
+
pytest tests/test_constraints.py::TestMaxDuration::test_passes_under_limit # single test
|
|
16
|
+
pytest --cov=aevals --cov-report=term-missing --cov-fail-under=90 # with coverage (CI threshold: 90%)
|
|
17
|
+
|
|
18
|
+
# Lint & format
|
|
19
|
+
ruff check src/ tests/
|
|
20
|
+
ruff check --fix src/ tests/
|
|
21
|
+
ruff format src/ tests/
|
|
22
|
+
ruff format --check src/ tests/ # CI format check (no changes)
|
|
23
|
+
|
|
24
|
+
# Type check
|
|
25
|
+
pyright
|
|
26
|
+
|
|
27
|
+
# CLI
|
|
28
|
+
aevals init # scan project, generate aevals.yaml + .aevals/
|
|
29
|
+
aevals run # run all scenarios
|
|
30
|
+
aevals run --scenario <name> # run specific scenario
|
|
31
|
+
aevals run --json # machine-readable output
|
|
32
|
+
aevals mcp-serve # start MCP server for Claude Code
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Architecture
|
|
36
|
+
|
|
37
|
+
aevals is an eval framework for LLM-based agents. It runs agent code in subprocesses, captures OpenTelemetry traces, and evaluates results against deterministic constraints and LLM-judged rubrics.
|
|
38
|
+
|
|
39
|
+
### Data flow
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
aevals.yaml → scenario selection → subprocess (run_wrapper.py)
|
|
43
|
+
→ OTel instruments LLM calls → trace JSON file
|
|
44
|
+
→ parse into Trajectory → evaluate constraints + rubric → ScenarioResult
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Key modules
|
|
48
|
+
|
|
49
|
+
- **`config/`** — Pydantic models for `aevals.yaml`. `AevalsConfig.resolve_constraints()` merges defaults with scenario overrides.
|
|
50
|
+
- **`capture/otel.py`** — `activate()` sets up OTel tracing in agent subprocess. `parse_trace_file()` converts OTel span JSON into `LLMCall` objects by reading `gen_ai.*` attributes. Auto-instruments 6 LLM SDKs.
|
|
51
|
+
- **`scanner/`** — AST-based detection of SDK imports and entrypoint candidates. Generates `run_wrapper.py` template with marker constants from `constants.py`.
|
|
52
|
+
- **`evals/constraints.py`** — Five deterministic checks (max_duration_ms, max_steps, tool_sequence, no_repeat_calls, output_contains). Zero LLM cost.
|
|
53
|
+
- **`evals/judge.py`** — LLM-as-judge via litellm. Sends trajectory + rubric, parses JSON response with index-based matching and positional fallback.
|
|
54
|
+
- **`evals/runner.py`** — Shared `filter_scenarios()` and `summarize_results()` used by both CLI and MCP server.
|
|
55
|
+
- **`cli/run.py`** — Spawns agent subprocesses concurrently via `asyncio.create_subprocess_exec`. Output is extracted from between the `RESULT_MARKER_START/END` sentinels.
|
|
56
|
+
- **`mcp/server.py`** — Primary interface for Claude Code. Five tools: scan, init, run, results, trajectory.
|
|
57
|
+
- **`constants.py`** — Single source of truth for directory paths and output marker strings.
|
|
58
|
+
|
|
59
|
+
### Subprocess isolation model
|
|
60
|
+
|
|
61
|
+
Each scenario runs in its own subprocess with env vars (`AEVALS_RUN_ID`, `AEVALS_ENTRY`, `AEVALS_TRACE_DIR`). The generated `run_wrapper.py` dynamically imports the agent entry point, pipes input via stdin, and captures output between marker strings in stdout. OTel spans are written to per-run JSON files in `.aevals/traces/`.
|
|
62
|
+
|
|
63
|
+
### Two-track evaluation
|
|
64
|
+
|
|
65
|
+
Pass/fail is `constraints_pass AND rubric_pass`. Constraints are synchronous and deterministic. Rubric evaluation is async via litellm and optional: a missing judge config sets `rubric_pending=True` instead of failing the scenario. Results are Pydantic models throughout.
|
|
66
|
+
|
|
67
|
+
## Conventions
|
|
68
|
+
|
|
69
|
+
- Python 3.12+. Line length 100. Ruff rules: E, F, I, UP, B, SIM, RUF, PT, PIE, C4, RET, PERF.
|
|
70
|
+
- Pyright strict mode. Type hints everywhere.
|
|
71
|
+
- Async tests use `pytest-asyncio` with `asyncio_mode = "auto"` (no `@pytest.mark.asyncio` needed).
|
|
72
|
+
- All shared path constants live in `constants.py` — don't hardcode `.aevals/` paths.
|
|
73
|
+
- Output marker strings (`__AEVALS_RESULT__`, `__AEVALS_END__`) are defined once in `constants.py` and injected into the wrapper template via `.format()`.
|
|
74
|
+
- Avoid TOCTOU (time-of-check to time-of-use) races: use `try/except FileNotFoundError` instead of checking `if path.exists()` and then calling `path.read_text()`.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Contributing to aevals
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing. This guide covers setup, workflow, and conventions.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/satyaborg/aevals.git
|
|
9
|
+
cd aevals
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
pre-commit install
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Development workflow
|
|
15
|
+
|
|
16
|
+
1. Create a branch: `git checkout -b type/short-description`
|
|
17
|
+
- Types: `feat/`, `fix/`, `chore/`, `docs/`, `test/`
|
|
18
|
+
2. Make your changes
|
|
19
|
+
3. Run checks:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pytest # tests
|
|
23
|
+
pytest --cov=aevals --cov-fail-under=90 # with coverage
|
|
24
|
+
ruff check src/ tests/ # lint
|
|
25
|
+
ruff format src/ tests/ # format
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. Commit with a clear, imperative message under 72 chars
|
|
29
|
+
5. Open a PR against `main`
|
|
30
|
+
|
|
31
|
+
## Code conventions
|
|
32
|
+
|
|
33
|
+
- **Python 3.12+**. Line length 100.
|
|
34
|
+
- **Type hints everywhere**. No `Any` unless forced by a library boundary. Strict pyright.
|
|
35
|
+
- **Double quotes** for strings (ruff default). Trailing commas always.
|
|
36
|
+
- **Imports**: stdlib, third-party, local — separated by blank lines. Sorted by ruff/isort.
|
|
37
|
+
- **Shared constants** live in `constants.py`. Don't hardcode `.aevals/` paths.
|
|
38
|
+
- **File access**: use `try/except FileNotFoundError` instead of checking `if path.exists()` and then reading; the check-then-read pattern is a TOCTOU (time-of-check to time-of-use) race.
|
|
39
|
+
- **Async tests** use `pytest-asyncio` with `asyncio_mode = "auto"` — no `@pytest.mark.asyncio` needed.
|
|
40
|
+
|
|
41
|
+
## Testing
|
|
42
|
+
|
|
43
|
+
- Write tests for any function with branching logic or >10 lines.
|
|
44
|
+
- Prefer real objects over mocks. Mock only at system boundaries (network, disk, time).
|
|
45
|
+
- Test names: `test_<what>_<condition>_<expected>`.
|
|
46
|
+
- CI enforces 90% coverage.
|
|
47
|
+
|
|
48
|
+
## Pull requests
|
|
49
|
+
|
|
50
|
+
- Keep PRs small and focused. If a PR touches >300 lines, consider splitting it.
|
|
51
|
+
- One logical change per commit.
|
|
52
|
+
- PRs run CI automatically (lint + test).
|
|
53
|
+
|
|
54
|
+
## Lint rules
|
|
55
|
+
|
|
56
|
+
Ruff is configured with: `E`, `F`, `I`, `UP`, `B`, `SIM`, `RUF`, `PT`, `PIE`, `C4`, `RET`, `PERF`.
|
|
57
|
+
|
|
58
|
+
Pre-commit hooks run ruff and pyright automatically on commit.
|
|
59
|
+
|
|
60
|
+
## Releases
|
|
61
|
+
|
|
62
|
+
Releases are automated via GitHub Actions when a version tag is pushed. See `scripts/release.sh`.
|
aevals-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Satya Borgohain
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|