aevals 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. aevals-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +26 -0
  2. aevals-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +13 -0
  3. aevals-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +13 -0
  4. aevals-0.1.0/.github/workflows/ci.yml +50 -0
  5. aevals-0.1.0/.github/workflows/release.yml +83 -0
  6. aevals-0.1.0/.gitignore +219 -0
  7. aevals-0.1.0/.pre-commit-config.yaml +16 -0
  8. aevals-0.1.0/CHANGELOG.md +22 -0
  9. aevals-0.1.0/CLAUDE.md +74 -0
  10. aevals-0.1.0/CONTRIBUTING.md +62 -0
  11. aevals-0.1.0/LICENSE +21 -0
  12. aevals-0.1.0/PKG-INFO +190 -0
  13. aevals-0.1.0/README.md +140 -0
  14. aevals-0.1.0/cliff.toml +48 -0
  15. aevals-0.1.0/examples/booking-agent/README.md +32 -0
  16. aevals-0.1.0/examples/booking-agent/aevals.yaml +57 -0
  17. aevals-0.1.0/examples/booking-agent/agent.py +131 -0
  18. aevals-0.1.0/examples/booking-agent/requirements.txt +3 -0
  19. aevals-0.1.0/examples/booking-agent/tools.py +51 -0
  20. aevals-0.1.0/examples/sdr-agent/README.md +40 -0
  21. aevals-0.1.0/examples/sdr-agent/aevals.yaml +127 -0
  22. aevals-0.1.0/examples/sdr-agent/agent.py +258 -0
  23. aevals-0.1.0/examples/sdr-agent/requirements.txt +3 -0
  24. aevals-0.1.0/examples/sdr-agent/tools.py +396 -0
  25. aevals-0.1.0/pyproject.toml +95 -0
  26. aevals-0.1.0/scripts/release.sh +85 -0
  27. aevals-0.1.0/src/aevals/__init__.py +3 -0
  28. aevals-0.1.0/src/aevals/__main__.py +6 -0
  29. aevals-0.1.0/src/aevals/capture/__init__.py +0 -0
  30. aevals-0.1.0/src/aevals/capture/otel.py +236 -0
  31. aevals-0.1.0/src/aevals/capture/schema.py +38 -0
  32. aevals-0.1.0/src/aevals/cli/__init__.py +20 -0
  33. aevals-0.1.0/src/aevals/cli/init.py +80 -0
  34. aevals-0.1.0/src/aevals/cli/mcp.py +13 -0
  35. aevals-0.1.0/src/aevals/cli/run.py +240 -0
  36. aevals-0.1.0/src/aevals/config/__init__.py +0 -0
  37. aevals-0.1.0/src/aevals/config/loader.py +51 -0
  38. aevals-0.1.0/src/aevals/config/schema.py +77 -0
  39. aevals-0.1.0/src/aevals/constants.py +20 -0
  40. aevals-0.1.0/src/aevals/evals/__init__.py +0 -0
  41. aevals-0.1.0/src/aevals/evals/constraints.py +111 -0
  42. aevals-0.1.0/src/aevals/evals/judge.py +178 -0
  43. aevals-0.1.0/src/aevals/evals/runner.py +62 -0
  44. aevals-0.1.0/src/aevals/evals/types.py +37 -0
  45. aevals-0.1.0/src/aevals/mcp/__init__.py +0 -0
  46. aevals-0.1.0/src/aevals/mcp/server.py +204 -0
  47. aevals-0.1.0/src/aevals/output/__init__.py +0 -0
  48. aevals-0.1.0/src/aevals/output/json.py +58 -0
  49. aevals-0.1.0/src/aevals/output/terminal.py +106 -0
  50. aevals-0.1.0/src/aevals/py.typed +0 -0
  51. aevals-0.1.0/src/aevals/scanner/__init__.py +0 -0
  52. aevals-0.1.0/src/aevals/scanner/detect.py +136 -0
  53. aevals-0.1.0/src/aevals/scanner/wrapper.py +53 -0
  54. aevals-0.1.0/tests/__init__.py +0 -0
  55. aevals-0.1.0/tests/conftest.py +206 -0
  56. aevals-0.1.0/tests/test_cli_init.py +81 -0
  57. aevals-0.1.0/tests/test_cli_run.py +101 -0
  58. aevals-0.1.0/tests/test_cli_run_integration.py +207 -0
  59. aevals-0.1.0/tests/test_config_loader.py +89 -0
  60. aevals-0.1.0/tests/test_config_schema.py +126 -0
  61. aevals-0.1.0/tests/test_constants.py +42 -0
  62. aevals-0.1.0/tests/test_constraints.py +196 -0
  63. aevals-0.1.0/tests/test_extract_output.py +54 -0
  64. aevals-0.1.0/tests/test_judge.py +224 -0
  65. aevals-0.1.0/tests/test_mcp_server.py +212 -0
  66. aevals-0.1.0/tests/test_otel.py +126 -0
  67. aevals-0.1.0/tests/test_otel_exporter.py +122 -0
  68. aevals-0.1.0/tests/test_output_json.py +70 -0
  69. aevals-0.1.0/tests/test_output_terminal.py +64 -0
  70. aevals-0.1.0/tests/test_runner.py +94 -0
  71. aevals-0.1.0/tests/test_scanner.py +122 -0
  72. aevals-0.1.0/tests/test_wrapper.py +57 -0
  73. aevals-0.1.0/website/.gitignore +33 -0
  74. aevals-0.1.0/website/app/apple-icon.png +0 -0
  75. aevals-0.1.0/website/app/favicon.ico +0 -0
  76. aevals-0.1.0/website/app/globals.css +948 -0
  77. aevals-0.1.0/website/app/layout.tsx +56 -0
  78. aevals-0.1.0/website/app/page.tsx +10 -0
  79. aevals-0.1.0/website/components/AsciiCube.tsx +0 -0
  80. aevals-0.1.0/website/components/CodeEditor.tsx +206 -0
  81. aevals-0.1.0/website/components/CopyButton.tsx +90 -0
  82. aevals-0.1.0/website/components/InstallTabs.tsx +76 -0
  83. aevals-0.1.0/website/components/LandingPage.tsx +350 -0
  84. aevals-0.1.0/website/components/MobileMenu.tsx +56 -0
  85. aevals-0.1.0/website/components/ScrambleLogo.tsx +66 -0
  86. aevals-0.1.0/website/next.config.ts +7 -0
  87. aevals-0.1.0/website/package.json +22 -0
  88. aevals-0.1.0/website/pnpm-lock.yaml +599 -0
  89. aevals-0.1.0/website/public/aevals.svg +9 -0
  90. aevals-0.1.0/website/public/android-chrome-192x192.png +0 -0
  91. aevals-0.1.0/website/public/android-chrome-512x512.png +0 -0
  92. aevals-0.1.0/website/public/favicon-16x16.png +0 -0
  93. aevals-0.1.0/website/public/favicon-32x32.png +0 -0
  94. aevals-0.1.0/website/public/llms.txt +42 -0
  95. aevals-0.1.0/website/tsconfig.json +21 -0
@@ -0,0 +1,26 @@
1
+ ---
2
+ name: Bug report
3
+ labels: bug
4
+ ---
5
+
6
+ **Describe the bug**
7
+ A clear description of what's broken.
8
+
9
+ **To reproduce**
10
+ Steps to reproduce:
11
+ 1. ...
12
+ 2. ...
13
+
14
+ **Expected behavior**
15
+ What you expected to happen.
16
+
17
+ **Environment**
18
+ - OS:
19
+ - Python version:
20
+ - aevals version:
21
+ - LLM SDK + version:
22
+
23
+ **Logs / traceback**
24
+ ```
25
+ paste here
26
+ ```
@@ -0,0 +1,13 @@
1
+ ---
2
+ name: Feature request
3
+ labels: enhancement
4
+ ---
5
+
6
+ **Problem**
7
+ What are you trying to do that isn't possible or is harder than it should be?
8
+
9
+ **Proposed solution**
10
+ How you'd like it to work.
11
+
12
+ **Alternatives considered**
13
+ Other approaches you've thought about.
@@ -0,0 +1,13 @@
1
+ ## What
2
+
3
+ Brief description of the change.
4
+
5
+ ## Why
6
+
7
+ Motivation or issue reference.
8
+
9
+ ## Test plan
10
+
11
+ - [ ] Tests pass (`pytest`)
12
+ - [ ] Lint passes (`ruff check src/ tests/`)
13
+ - [ ] Covered by existing or new tests
@@ -0,0 +1,50 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+ workflow_call:
9
+
10
+ jobs:
11
+ test:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.12"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Lint
28
+ run: ruff check src/ tests/
29
+
30
+ - name: Test
31
+ run: pytest --cov=aevals --cov-report=term-missing --cov-fail-under=90
32
+
33
+ lint:
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+
38
+ - name: Set up Python
39
+ uses: actions/setup-python@v5
40
+ with:
41
+ python-version: "3.12"
42
+
43
+ - name: Install dependencies
44
+ run: pip install -e ".[dev]"
45
+
46
+ - name: Ruff check
47
+ run: ruff check src/ tests/
48
+
49
+ - name: Ruff format check
50
+ run: ruff format --check src/ tests/
@@ -0,0 +1,83 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ contents: write
10
+ id-token: write
11
+
12
+ jobs:
13
+ # Gate: ensure CI passes before releasing
14
+ ci:
15
+ uses: ./.github/workflows/ci.yml
16
+
17
+ build:
18
+ needs: ci
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+
23
+ - name: Set up Python
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: "3.12"
27
+
28
+ - name: Install build tools
29
+ run: pip install build
30
+
31
+ - name: Verify tag matches package version
32
+ run: |
33
+ TAG_VERSION="${GITHUB_REF#refs/tags/v}"
34
+ PKG_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
35
+ if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
36
+ echo "::error::Tag version ($TAG_VERSION) does not match package version ($PKG_VERSION)"
37
+ exit 1
38
+ fi
39
+
40
+ - name: Build sdist and wheel
41
+ run: python -m build
42
+
43
+ - name: Upload dist artifacts
44
+ uses: actions/upload-artifact@v4
45
+ with:
46
+ name: dist
47
+ path: dist/
48
+
49
+ publish-pypi:
50
+ needs: build
51
+ runs-on: ubuntu-latest
52
+ environment: pypi
53
+ permissions:
54
+ id-token: write
55
+ steps:
56
+ - name: Download dist artifacts
57
+ uses: actions/download-artifact@v4
58
+ with:
59
+ name: dist
60
+ path: dist/
61
+
62
+ - name: Publish to PyPI
63
+ uses: pypa/gh-action-pypi-publish@release/v1
64
+
65
+ github-release:
66
+ needs: build
67
+ runs-on: ubuntu-latest
68
+ permissions:
69
+ contents: write
70
+ steps:
71
+ - uses: actions/checkout@v4
72
+
73
+ - name: Download dist artifacts
74
+ uses: actions/download-artifact@v4
75
+ with:
76
+ name: dist
77
+ path: dist/
78
+
79
+ - name: Create GitHub Release
80
+ uses: softprops/action-gh-release@v2
81
+ with:
82
+ generate_release_notes: true
83
+ files: dist/*
@@ -0,0 +1,219 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ uv.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ # poetry.lock
106
+ # poetry.toml
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
111
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
112
+ # pdm.lock
113
+ # pdm.toml
114
+ .pdm-python
115
+ .pdm-build/
116
+
117
+ # pixi
118
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
119
+ # pixi.lock
120
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
121
+ # in the .venv directory. It is recommended not to include this directory in version control.
122
+ .pixi
123
+
124
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
125
+ __pypackages__/
126
+
127
+ # Celery stuff
128
+ celerybeat-schedule
129
+ celerybeat.pid
130
+
131
+ # Redis
132
+ *.rdb
133
+ *.aof
134
+ *.pid
135
+
136
+ # RabbitMQ
137
+ mnesia/
138
+ rabbitmq/
139
+ rabbitmq-data/
140
+
141
+ # ActiveMQ
142
+ activemq-data/
143
+
144
+ # SageMath parsed files
145
+ *.sage.py
146
+
147
+ # Environments
148
+ .env
149
+ .envrc
150
+ .venv
151
+ env/
152
+ venv/
153
+ ENV/
154
+ env.bak/
155
+ venv.bak/
156
+
157
+ # Spyder project settings
158
+ .spyderproject
159
+ .spyproject
160
+
161
+ # Rope project settings
162
+ .ropeproject
163
+
164
+ # mkdocs documentation
165
+ /site
166
+
167
+ # mypy
168
+ .mypy_cache/
169
+ .dmypy.json
170
+ dmypy.json
171
+
172
+ # Pyre type checker
173
+ .pyre/
174
+
175
+ # pytype static type analyzer
176
+ .pytype/
177
+
178
+ # Cython debug symbols
179
+ cython_debug/
180
+
181
+ # PyCharm
182
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
183
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
184
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
185
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
186
+ # .idea/
187
+
188
+ # Abstra
189
+ # Abstra is an AI-powered process automation framework.
190
+ # Ignore directories containing user credentials, local state, and settings.
191
+ # Learn more at https://abstra.io/docs
192
+ .abstra/
193
+
194
+ # Visual Studio Code
195
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
196
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
197
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
198
+ # you could uncomment the following to ignore the entire vscode folder
199
+ # .vscode/
200
+
201
+ # Ruff stuff:
202
+ .ruff_cache/
203
+
204
+ # PyPI configuration file
205
+ .pypirc
206
+
207
+ # Marimo
208
+ marimo/_static/
209
+ marimo/_lsp/
210
+ __marimo__/
211
+
212
+ # Streamlit
213
+ .streamlit/secrets.toml
214
+
215
+ # OS
216
+ .DS_Store
217
+
218
+ # aevals local data (traces, results, wrapper)
219
+ .aevals/
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.15.5
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix, --exit-non-zero-on-fix]
7
+ - id: ruff-format
8
+
9
+ - repo: local
10
+ hooks:
11
+ - id: pyright
12
+ name: pyright
13
+ entry: pyright src/
14
+ language: system
15
+ types: [python]
16
+ pass_filenames: false
@@ -0,0 +1,22 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ This changelog is automatically generated by [git-cliff](https://git-cliff.org/).
6
+
7
+ ## [0.1.0] - 2026-03-14
8
+
9
+ ### Added
10
+
11
+ - Core eval framework: subprocess isolation, OTel trace capture, two-track evaluation
12
+ - Five deterministic constraints: `max_duration_ms`, `max_steps`, `tool_sequence`, `no_repeat_calls`, `output_contains`
13
+ - LLM-as-judge rubric evaluation via litellm
14
+ - CLI commands: `aevals init`, `aevals run`, `aevals mcp-serve`
15
+ - MCP server with five tools: scan, init, run, results, trajectory
16
+ - AST-based SDK and entrypoint detection (scanner)
17
+ - Auto-instrumentation for 6 LLM SDKs via OpenLLMetry
18
+ - Rich terminal output and JSON output formats
19
+ - CI workflow with 90% coverage threshold
20
+ - Release workflow with PyPI publishing
21
+ - Example agents: booking-agent, sdr-agent
22
+ - Pre-commit hooks: ruff, pyright
aevals-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,74 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Commands
6
+
7
+ ```bash
8
+ # Install
9
+ uv pip install -e ".[dev]"
10
+
11
+ # Test
12
+ pytest # all tests
13
+ pytest tests/test_constraints.py # single file
14
+ pytest tests/test_constraints.py::TestMaxDuration # single class
15
+ pytest tests/test_constraints.py::TestMaxDuration::test_passes_under_limit # single test
16
+ pytest --cov=aevals --cov-report=term-missing --cov-fail-under=90 # with coverage (CI threshold: 90%)
17
+
18
+ # Lint & format
19
+ ruff check src/ tests/
20
+ ruff check --fix src/ tests/
21
+ ruff format src/ tests/
22
+ ruff format --check src/ tests/ # CI format check (no changes)
23
+
24
+ # Type check
25
+ pyright
26
+
27
+ # CLI
28
+ aevals init # scan project, generate aevals.yaml + .aevals/
29
+ aevals run # run all scenarios
30
+ aevals run --scenario <name> # run specific scenario
31
+ aevals run --json # machine-readable output
32
+ aevals mcp-serve # start MCP server for Claude Code
33
+ ```
34
+
35
+ ## Architecture
36
+
37
+ aevals is an eval framework for LLM-based agents. It runs agent code in subprocesses, captures OpenTelemetry traces, and evaluates results against deterministic constraints and LLM-judged rubrics.
38
+
39
+ ### Data flow
40
+
41
+ ```
42
+ aevals.yaml → scenario selection → subprocess (run_wrapper.py)
43
+ → OTel instruments LLM calls → trace JSON file
44
+ → parse into Trajectory → evaluate constraints + rubric → ScenarioResult
45
+ ```
46
+
47
+ ### Key modules
48
+
49
+ - **`config/`** — Pydantic models for `aevals.yaml`. `AevalsConfig.resolve_constraints()` merges defaults with scenario overrides.
50
+ - **`capture/otel.py`** — `activate()` sets up OTel tracing in agent subprocess. `parse_trace_file()` converts OTel span JSON into `LLMCall` objects by reading `gen_ai.*` attributes. Auto-instruments 6 LLM SDKs.
51
+ - **`scanner/`** — AST-based detection of SDK imports and entrypoint candidates. Generates `run_wrapper.py` template with marker constants from `constants.py`.
52
+ - **`evals/constraints.py`** — Five deterministic checks (max_duration_ms, max_steps, tool_sequence, no_repeat_calls, output_contains). Zero LLM cost.
53
+ - **`evals/judge.py`** — LLM-as-judge via litellm. Sends trajectory + rubric, parses JSON response with index-based matching and positional fallback.
54
+ - **`evals/runner.py`** — Shared `filter_scenarios()` and `summarize_results()` used by both CLI and MCP server.
55
+ - **`cli/run.py`** — Spawns agent subprocesses concurrently via `asyncio.create_subprocess_exec`. Output extracted between `RESULT_MARKER_START/END` sentinels.
56
+ - **`mcp/server.py`** — Primary interface for Claude Code. Five tools: scan, init, run, results, trajectory.
57
+ - **`constants.py`** — Single source of truth for directory paths and output marker strings.
58
+
59
+ ### Subprocess isolation model
60
+
61
+ Each scenario runs in its own subprocess with env vars (`AEVALS_RUN_ID`, `AEVALS_ENTRY`, `AEVALS_TRACE_DIR`). The generated `run_wrapper.py` dynamically imports the agent entry point, pipes input via stdin, and captures output between marker strings in stdout. OTel spans are written to per-run JSON files in `.aevals/traces/`.
62
+
63
+ ### Two-track evaluation
64
+
65
+ Pass/fail is `constraints_pass AND rubric_pass`. Constraints are synchronous and deterministic. Rubric evaluation is asynchronous (via litellm) and optional: a missing judge config sets `rubric_pending=True` instead of causing a failure. Results are Pydantic models throughout.
66
+
67
+ ## Conventions
68
+
69
+ - Python 3.12+. Line length 100. Ruff rules: E, F, I, UP, B, SIM, RUF, PT, PIE, C4, RET, PERF.
70
+ - Pyright strict mode. Type hints everywhere.
71
+ - Async tests use `pytest-asyncio` with `asyncio_mode = "auto"` (no `@pytest.mark.asyncio` needed).
72
+ - All shared path constants live in `constants.py` — don't hardcode `.aevals/` paths.
73
+ - Output marker strings (`__AEVALS_RESULT__`, `__AEVALS_END__`) are defined once in `constants.py` and injected into the wrapper template via `.format()`.
74
+ - Avoid TOCTOU (time-of-check to time-of-use) races: wrap `path.read_text()` in `try/except FileNotFoundError` instead of checking `if path.exists()` first.
@@ -0,0 +1,62 @@
1
+ # Contributing to aevals
2
+
3
+ Thanks for your interest in contributing. This guide covers setup, workflow, and conventions.
4
+
5
+ ## Setup
6
+
7
+ ```bash
8
+ git clone https://github.com/satyaborg/aevals.git
9
+ cd aevals
10
+ pip install -e ".[dev]"
11
+ pre-commit install
12
+ ```
13
+
14
+ ## Development workflow
15
+
16
+ 1. Create a branch: `git checkout -b type/short-description`
17
+ - Types: `feat/`, `fix/`, `chore/`, `docs/`, `test/`
18
+ 2. Make your changes
19
+ 3. Run checks:
20
+
21
+ ```bash
22
+ pytest # tests
23
+ pytest --cov=aevals --cov-fail-under=90 # with coverage
24
+ ruff check src/ tests/ # lint
25
+ ruff format src/ tests/ # format
26
+ ```
27
+
28
+ 4. Commit with a clear, imperative message under 72 chars
29
+ 5. Open a PR against `main`
30
+
31
+ ## Code conventions
32
+
33
+ - **Python 3.12+**. Line length 100.
34
+ - **Type hints everywhere**. No `Any` unless forced by a library boundary. Strict pyright.
35
+ - **Double quotes** for strings (ruff default). Trailing commas always.
36
+ - **Imports**: stdlib, third-party, local — separated by blank lines. Sorted by ruff/isort.
37
+ - **Shared constants** live in `constants.py`. Don't hardcode `.aevals/` paths.
38
+ - **File access**: use `try/except FileNotFoundError` around the read instead of checking `if path.exists()` first; the check-then-read pattern is a TOCTOU (time-of-check to time-of-use) race.
39
+ - **Async tests** use `pytest-asyncio` with `asyncio_mode = "auto"` — no `@pytest.mark.asyncio` needed.
40
+
41
+ ## Testing
42
+
43
+ - Write tests for any function with branching logic or >10 lines.
44
+ - Prefer real objects over mocks. Mock only at system boundaries (network, disk, time).
45
+ - Test names: `test_<what>_<condition>_<expected>`.
46
+ - CI enforces 90% coverage.
47
+
48
+ ## Pull requests
49
+
50
+ - Keep PRs small and focused. If a PR touches >300 lines, consider splitting it.
51
+ - One logical change per commit.
52
+ - PRs run CI automatically (lint + test).
53
+
54
+ ## Lint rules
55
+
56
+ Ruff is configured with: `E`, `F`, `I`, `UP`, `B`, `SIM`, `RUF`, `PT`, `PIE`, `C4`, `RET`, `PERF`.
57
+
58
+ Pre-commit hooks run ruff and pyright automatically on commit.
59
+
60
+ ## Releases
61
+
62
+ Releases are automated via GitHub Actions when a version tag is pushed. See `scripts/release.sh`.
aevals-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Satya Borgohain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.