evalgrid 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgrid-0.1.0/.env.example +2 -0
- evalgrid-0.1.0/.github/workflows/ci.yml +30 -0
- evalgrid-0.1.0/.github/workflows/evalgrid.yml +59 -0
- evalgrid-0.1.0/.github/workflows/publish.yml +72 -0
- evalgrid-0.1.0/.gitignore +72 -0
- evalgrid-0.1.0/CHANGELOG.md +29 -0
- evalgrid-0.1.0/CONTRIBUTING.md +82 -0
- evalgrid-0.1.0/LICENSE +21 -0
- evalgrid-0.1.0/PKG-INFO +263 -0
- evalgrid-0.1.0/README.md +222 -0
- evalgrid-0.1.0/ROADMAP.md +40 -0
- evalgrid-0.1.0/dashboard/index.html +12 -0
- evalgrid-0.1.0/dashboard/package-lock.json +3212 -0
- evalgrid-0.1.0/dashboard/package.json +29 -0
- evalgrid-0.1.0/dashboard/playwright.config.ts +17 -0
- evalgrid-0.1.0/dashboard/postcss.config.js +6 -0
- evalgrid-0.1.0/dashboard/src/App.tsx +343 -0
- evalgrid-0.1.0/dashboard/src/components/Header.tsx +225 -0
- evalgrid-0.1.0/dashboard/src/components/JudgeSettings.tsx +104 -0
- evalgrid-0.1.0/dashboard/src/components/RunDetail.tsx +355 -0
- evalgrid-0.1.0/dashboard/src/components/RunRow.tsx +399 -0
- evalgrid-0.1.0/dashboard/src/components/ScenariosView.tsx +337 -0
- evalgrid-0.1.0/dashboard/src/components/StatusBadge.tsx +46 -0
- evalgrid-0.1.0/dashboard/src/hooks/useRuns.ts +137 -0
- evalgrid-0.1.0/dashboard/src/hooks/useScenarios.ts +19 -0
- evalgrid-0.1.0/dashboard/src/index.css +7 -0
- evalgrid-0.1.0/dashboard/src/main.tsx +10 -0
- evalgrid-0.1.0/dashboard/src/types.ts +77 -0
- evalgrid-0.1.0/dashboard/tailwind.config.js +17 -0
- evalgrid-0.1.0/dashboard/tests/e2e/dashboard-tab.spec.ts +58 -0
- evalgrid-0.1.0/dashboard/tests/e2e/fixtures.ts +80 -0
- evalgrid-0.1.0/dashboard/tests/e2e/helpers.ts +42 -0
- evalgrid-0.1.0/dashboard/tests/e2e/judge-settings.spec.ts +111 -0
- evalgrid-0.1.0/dashboard/tests/e2e/modal.spec.ts +151 -0
- evalgrid-0.1.0/dashboard/tests/e2e/navigation.spec.ts +44 -0
- evalgrid-0.1.0/dashboard/tests/e2e/runs-tab.spec.ts +101 -0
- evalgrid-0.1.0/dashboard/tests/e2e/scenarios-tab.spec.ts +75 -0
- evalgrid-0.1.0/dashboard/tsconfig.json +21 -0
- evalgrid-0.1.0/dashboard/tsconfig.node.json +10 -0
- evalgrid-0.1.0/dashboard/vite.config.ts +15 -0
- evalgrid-0.1.0/docs/screenshots/dashboard-overview.png +0 -0
- evalgrid-0.1.0/docs/screenshots/run-detail.png +0 -0
- evalgrid-0.1.0/docs/screenshots/scenarios.png +0 -0
- evalgrid-0.1.0/evalgrid/__init__.py +3 -0
- evalgrid-0.1.0/evalgrid/__main__.py +4 -0
- evalgrid-0.1.0/evalgrid/api/__init__.py +0 -0
- evalgrid-0.1.0/evalgrid/api/app.py +442 -0
- evalgrid-0.1.0/evalgrid/api/database.py +78 -0
- evalgrid-0.1.0/evalgrid/cli/__init__.py +3 -0
- evalgrid-0.1.0/evalgrid/cli/commands/__init__.py +0 -0
- evalgrid-0.1.0/evalgrid/cli/commands/init.py +139 -0
- evalgrid-0.1.0/evalgrid/cli/commands/run.py +262 -0
- evalgrid-0.1.0/evalgrid/cli/commands/scenario.py +153 -0
- evalgrid-0.1.0/evalgrid/cli/commands/server.py +39 -0
- evalgrid-0.1.0/evalgrid/cli/main.py +20 -0
- evalgrid-0.1.0/evalgrid/core/__init__.py +14 -0
- evalgrid-0.1.0/evalgrid/core/loader.py +106 -0
- evalgrid-0.1.0/evalgrid/core/models.py +99 -0
- evalgrid-0.1.0/evalgrid/core/paths.py +27 -0
- evalgrid-0.1.0/evalgrid/core/runner.py +125 -0
- evalgrid-0.1.0/evalgrid/core/scorer.py +216 -0
- evalgrid-0.1.0/evalgrid/static/assets/index-9gTV8edJ.css +1 -0
- evalgrid-0.1.0/evalgrid/static/assets/index-DiafW5oN.js +205 -0
- evalgrid-0.1.0/evalgrid/static/index.html +13 -0
- evalgrid-0.1.0/examples/coding_fix_bug.yml +56 -0
- evalgrid-0.1.0/examples/sdr_qualify_lead.yml +46 -0
- evalgrid-0.1.0/pyproject.toml +73 -0
- evalgrid-0.1.0/tests/__init__.py +0 -0
- evalgrid-0.1.0/tests/conftest.py +45 -0
- evalgrid-0.1.0/tests/fixtures/invalid_scenario.yml +6 -0
- evalgrid-0.1.0/tests/fixtures/multi_step_scenario.yml +32 -0
- evalgrid-0.1.0/tests/fixtures/valid_scenario.yml +13 -0
- evalgrid-0.1.0/tests/test_api.py +140 -0
- evalgrid-0.1.0/tests/test_cli.py +48 -0
- evalgrid-0.1.0/tests/test_cli_smoke.py +10 -0
- evalgrid-0.1.0/tests/test_database.py +97 -0
- evalgrid-0.1.0/tests/test_loader.py +113 -0
- evalgrid-0.1.0/tests/test_models.py +64 -0
- evalgrid-0.1.0/tests/test_runner.py +108 -0
- evalgrid-0.1.0/tests/test_scorer.py +122 -0
- evalgrid-0.1.0/tests/test_yaml_schema.py +20 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Test
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v6
|
|
16
|
+
|
|
17
|
+
- name: Set up Python 3.11
|
|
18
|
+
uses: actions/setup-python@v6
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
cache: pip
|
|
22
|
+
|
|
23
|
+
- name: Install package and dev dependencies
|
|
24
|
+
run: pip install -e ".[dev]"
|
|
25
|
+
|
|
26
|
+
- name: Run tests
|
|
27
|
+
run: pytest
|
|
28
|
+
|
|
29
|
+
- name: Verify CLI
|
|
30
|
+
run: evalgrid --help
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: evalgrid
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
|
|
6
|
+
jobs:
|
|
7
|
+
eval:
|
|
8
|
+
name: Run Evals
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
timeout-minutes: 30
|
|
11
|
+
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Set up Python
|
|
16
|
+
uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.11"
|
|
19
|
+
cache: pip
|
|
20
|
+
|
|
21
|
+
- name: Install evalgrid
|
|
22
|
+
run: pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
- name: Initialize evalgrid
|
|
25
|
+
run: evalgrid init
|
|
26
|
+
|
|
27
|
+
- name: Copy example scenarios
|
|
28
|
+
run: |
|
|
29
|
+
if [ -d examples ]; then
|
|
30
|
+
cp examples/*.yml .evalgrid/scenarios/ 2>/dev/null || true
|
|
31
|
+
fi
|
|
32
|
+
|
|
33
|
+
- name: Run Evals (mock mode — no API key required)
|
|
34
|
+
run: |
|
|
35
|
+
evalgrid run \
|
|
36
|
+
--mock \
|
|
37
|
+
--output results.json \
|
|
38
|
+
--config .evalgrid/config.yml
|
|
39
|
+
|
|
40
|
+
- name: Run Evals with real API (optional)
|
|
41
|
+
env:
|
|
42
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
43
|
+
run: |
|
|
44
|
+
if [ -n "$ANTHROPIC_API_KEY" ]; then
|
|
45
|
+
evalgrid run \
|
|
46
|
+
--output results-real.json \
|
|
47
|
+
--config .evalgrid/config.yml
|
|
48
|
+
else
|
|
49
|
+
echo "Skipping real API run — ANTHROPIC_API_KEY not configured"
|
|
50
|
+
fi
|
|
51
|
+
|
|
52
|
+
- name: Upload Results
|
|
53
|
+
uses: actions/upload-artifact@v4
|
|
54
|
+
if: always()
|
|
55
|
+
with:
|
|
56
|
+
name: evalgrid-results-${{ github.run_id }}
|
|
57
|
+
path: results.json
|
|
58
|
+
retention-days: 30
|
|
59
|
+
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
name: Run tests
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v6
|
|
14
|
+
|
|
15
|
+
- name: Set up Python 3.11
|
|
16
|
+
uses: actions/setup-python@v6
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.11"
|
|
19
|
+
cache: pip
|
|
20
|
+
|
|
21
|
+
- name: Install package and dev dependencies
|
|
22
|
+
run: pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
- name: Run tests
|
|
25
|
+
run: pytest
|
|
26
|
+
|
|
27
|
+
build:
|
|
28
|
+
name: Build distribution
|
|
29
|
+
needs: test
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v6
|
|
34
|
+
|
|
35
|
+
- name: Set up Python 3.11
|
|
36
|
+
uses: actions/setup-python@v6
|
|
37
|
+
with:
|
|
38
|
+
python-version: "3.11"
|
|
39
|
+
cache: pip
|
|
40
|
+
|
|
41
|
+
- name: Install build tools
|
|
42
|
+
run: pip install build
|
|
43
|
+
|
|
44
|
+
- name: Build sdist and wheel
|
|
45
|
+
run: python -m build
|
|
46
|
+
|
|
47
|
+
- name: Upload dist artifacts
|
|
48
|
+
uses: actions/upload-artifact@v4
|
|
49
|
+
with:
|
|
50
|
+
name: dist
|
|
51
|
+
path: dist/
|
|
52
|
+
|
|
53
|
+
publish:
|
|
54
|
+
name: Publish to PyPI
|
|
55
|
+
needs: build
|
|
56
|
+
runs-on: ubuntu-latest
|
|
57
|
+
environment:
|
|
58
|
+
name: pypi
|
|
59
|
+
url: https://pypi.org/project/evalgrid/
|
|
60
|
+
|
|
61
|
+
permissions:
|
|
62
|
+
id-token: write
|
|
63
|
+
|
|
64
|
+
steps:
|
|
65
|
+
- name: Download dist artifacts
|
|
66
|
+
uses: actions/download-artifact@v4
|
|
67
|
+
with:
|
|
68
|
+
name: dist
|
|
69
|
+
path: dist/
|
|
70
|
+
|
|
71
|
+
- name: Publish to PyPI
|
|
72
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg-info/
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
.eggs/
|
|
11
|
+
*.egg
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# Testing
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
coverage.xml
|
|
23
|
+
|
|
24
|
+
# Databases (dev artifacts)
|
|
25
|
+
*.db
|
|
26
|
+
*.sqlite
|
|
27
|
+
|
|
28
|
+
# evalgrid runtime
|
|
29
|
+
.evalgrid/
|
|
30
|
+
.agenteval/
|
|
31
|
+
sdr-demo-result.json
|
|
32
|
+
sdr-real-result.json
|
|
33
|
+
*-results.json
|
|
34
|
+
*-result.json
|
|
35
|
+
CLAUDE.md
|
|
36
|
+
.claude/
|
|
37
|
+
.private/
|
|
38
|
+
|
|
39
|
+
# Node / dashboard
|
|
40
|
+
dashboard/node_modules/
|
|
41
|
+
dashboard/dist/
|
|
42
|
+
dashboard/test-results/
|
|
43
|
+
dashboard/Scenario
|
|
44
|
+
|
|
45
|
+
# IDE
|
|
46
|
+
.vscode/
|
|
47
|
+
.idea/
|
|
48
|
+
*.swp
|
|
49
|
+
*.swo
|
|
50
|
+
|
|
51
|
+
# Environment / secrets
|
|
52
|
+
.env
|
|
53
|
+
.env.local
|
|
54
|
+
.env.*.local
|
|
55
|
+
|
|
56
|
+
# Node (general)
|
|
57
|
+
node_modules/
|
|
58
|
+
|
|
59
|
+
# OS
|
|
60
|
+
.DS_Store
|
|
61
|
+
Thumbs.db
|
|
62
|
+
|
|
63
|
+
# Build artifacts
|
|
64
|
+
*.whl
|
|
65
|
+
*.tar.gz
|
|
66
|
+
|
|
67
|
+
# mypy cache
|
|
68
|
+
.mypy_cache/
|
|
69
|
+
|
|
70
|
+
# ruff cache
|
|
71
|
+
.ruff_cache/
|
|
72
|
+
templates/
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to evalgrid will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] - 2026-05-06
|
|
8
|
+
|
|
9
|
+
Initial release.
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- `evalgrid init` — scaffold `.evalgrid/` config and example scenario
|
|
14
|
+
- `evalgrid run` — run all scenarios and print pass/fail results
|
|
15
|
+
- `evalgrid run --mock` — dry run with no API calls (CI-friendly)
|
|
16
|
+
- `evalgrid run --tag <tag>` — filter scenarios by tag
|
|
17
|
+
- `evalgrid run --output <file>` — save results to JSON
|
|
18
|
+
- `evalgrid scenario add` — interactively create a new scenario
|
|
19
|
+
- `evalgrid scenario list` — list all scenarios in the scenarios directory
|
|
20
|
+
- `evalgrid scenario validate` — validate scenario YAML files
|
|
21
|
+
- `evalgrid server` — start the React dashboard at `http://localhost:8000`
|
|
22
|
+
- FastAPI REST API: `GET /api/runs`, `POST /api/runs`, `GET /api/runs/{id}`, `GET /api/scenarios`, `POST /api/scenarios`
|
|
23
|
+
- SQLite persistence via SQLAlchemy async (results survive server restarts)
|
|
24
|
+
- React dashboard: pass rate ring chart, avg latency sparkline, run history, per-step score breakdowns
|
|
25
|
+
- Dark/light mode, mobile responsive dashboard
|
|
26
|
+
- LLM-as-judge scoring via litellm (supports Anthropic, OpenAI, Google, Ollama)
|
|
27
|
+
- Mock scorer for CI runs that require no API key
|
|
28
|
+
- 8 example scenarios: Claude Code suite (5 scenarios), hello-world, sdr-qualify-lead, support-triage
|
|
29
|
+
- Python API: `EvalConfig`, `ScenarioLoader`, `ScenarioRunner`
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Contributing to evalgrid
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to evalgrid.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- Python 3.11 or newer
|
|
8
|
+
- Node.js 18+ (only if working on the dashboard)
|
|
9
|
+
- Git
|
|
10
|
+
|
|
11
|
+
## Dev Setup
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
git clone https://github.com/naman006-rai/evalgrid.git
|
|
15
|
+
cd evalgrid
|
|
16
|
+
pip install -e ".[dev]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
This installs evalgrid in editable mode along with the development dependencies (pytest, pytest-asyncio, pytest-cov, ruff, mypy).
|
|
20
|
+
|
|
21
|
+
## Running Tests
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pytest tests/ -v
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
All tests must pass before submitting a pull request. The CI workflow runs the full suite automatically.
|
|
28
|
+
|
|
29
|
+
To run a specific test file:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pytest tests/test_api.py -v
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Dashboard (optional)
|
|
36
|
+
|
|
37
|
+
If you are modifying the React dashboard:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
cd dashboard
|
|
41
|
+
npm install
|
|
42
|
+
npm run dev # dev server at localhost:5173
|
|
43
|
+
npm run build # production build
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
After building, copy the output to the static directory:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Windows (PowerShell)
|
|
50
|
+
Copy-Item -Recurse -Force dashboard\dist\* evalgrid\static\
|
|
51
|
+
|
|
52
|
+
# macOS / Linux
|
|
53
|
+
cp -r dashboard/dist/* evalgrid/static/
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Code Style
|
|
57
|
+
|
|
58
|
+
Python code is linted with `ruff` and type-checked with `mypy`:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
ruff check evalgrid/
|
|
62
|
+
mypy evalgrid/
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## PR Guidelines
|
|
66
|
+
|
|
67
|
+
1. Fork the repo and create a branch: `git checkout -b feat/my-feature`
|
|
68
|
+
2. Write tests for your change (keep coverage above 80%)
|
|
69
|
+
3. Run the full suite: `pytest tests/ -v`
|
|
70
|
+
4. Run linting: `ruff check evalgrid/`
|
|
71
|
+
5. Open a pull request against `main` with a clear description of what changes and why
|
|
72
|
+
6. CI will run the test suite automatically on pull requests against `main` (the eval workflow is triggered manually) — fix any failures before requesting review
|
|
73
|
+
|
|
74
|
+
## Reporting Issues
|
|
75
|
+
|
|
76
|
+
Open a GitHub issue at <https://github.com/naman006-rai/evalgrid/issues>.
|
|
77
|
+
|
|
78
|
+
Include:
|
|
79
|
+
- evalgrid version (`pip show evalgrid`)
|
|
80
|
+
- Python version (`python --version`)
|
|
81
|
+
- Steps to reproduce
|
|
82
|
+
- Expected vs actual behavior
|
evalgrid-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Naman Rai
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
evalgrid-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalgrid
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Agent evaluation framework — run, score, and track AI agent correctness at scale.
|
|
5
|
+
Project-URL: Homepage, https://github.com/naman006-rai/evalgrid
|
|
6
|
+
Project-URL: Documentation, https://github.com/naman006-rai/evalgrid#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/naman006-rai/evalgrid
|
|
8
|
+
Project-URL: Issues, https://github.com/naman006-rai/evalgrid/issues
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/naman006-rai/evalgrid/issues
|
|
10
|
+
Author-email: Naman Rai <naman.rai006@gmail.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: agents,ai,evaluation,llm,testing
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: aiosqlite>=0.20.0
|
|
22
|
+
Requires-Dist: click>=8.1.0
|
|
23
|
+
Requires-Dist: fastapi>=0.115.0
|
|
24
|
+
Requires-Dist: httpx>=0.27.0
|
|
25
|
+
Requires-Dist: litellm<2.0.0,>=1.0.0
|
|
26
|
+
Requires-Dist: pydantic>=2.0.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
30
|
+
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: mypy>=1.13.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
37
|
+
Provides-Extra: postgres
|
|
38
|
+
Requires-Dist: asyncpg>=0.30.0; extra == 'postgres'
|
|
39
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
|
|
42
|
+
<div align="center">
|
|
43
|
+
|
|
44
|
+
<svg width="52" height="52" viewBox="0 0 36 36" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
45
|
+
<rect x="2" y="2" width="14" height="14" rx="3" fill="#5b5ef4"/>
|
|
46
|
+
<rect x="20" y="2" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.4"/>
|
|
47
|
+
<rect x="2" y="20" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.4"/>
|
|
48
|
+
<rect x="20" y="20" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.15"/>
|
|
49
|
+
<path d="M8 9 L11 12 L16 6" stroke="white" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"/>
|
|
50
|
+
</svg>
|
|
51
|
+
|
|
52
|
+
# evalgrid
|
|
53
|
+
|
|
54
|
+
**Agent evaluation framework — run, score, and track AI agent correctness at scale.**
|
|
55
|
+
|
|
56
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
57
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
58
|
+
|
|
59
|
+
AI agents fail in ways unit tests can't catch: a customer support agent stops calling the refund tool after a prompt change; a research agent loops on the same Google search 23 times; a reconciliation agent silently picks the wrong account format on edge cases. evalgrid catches these regressions before they ship.
|
|
60
|
+
|
|
61
|
+
</div>
|
|
62
|
+
|
|
63
|
+
## Status
|
|
64
|
+
|
|
65
|
+
EvalGrid is in early development (v0.1). The framework runs reliably for local-first agent evaluation, but you should expect:
|
|
66
|
+
|
|
67
|
+
- Some rough edges in the dashboard UI
|
|
68
|
+
- Schema changes in scenario YAML between v0.1 and v0.2
|
|
69
|
+
- No GitHub status check / CI gate yet (planned for v0.2)
|
|
70
|
+
|
|
71
|
+
If you're building agents and want eval-driven workflows now, this works. If you need production-grade CI integration, watch for v0.2.
|
|
72
|
+
|
|
73
|
+
## Screenshots
|
|
74
|
+
|
|
75
|
+

|
|
76
|
+
*Dashboard showing run history with pass rates and per-scenario breakdowns*
|
|
77
|
+
|
|
78
|
+

|
|
79
|
+
*Per-step verdicts with judge reasoning for a failed scenario*
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Install
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install evalgrid
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Quick start
|
|
90
|
+
|
|
91
|
+
**1. Initialize**
|
|
92
|
+
```bash
|
|
93
|
+
evalgrid init
|
|
94
|
+
```
|
|
95
|
+
Scaffolds `.evalgrid/` with a config file and an example scenario.
|
|
96
|
+
|
|
97
|
+
**2. Write a scenario**
|
|
98
|
+
|
|
99
|
+
`.evalgrid/scenarios/qualify_lead.yml`:
|
|
100
|
+
```yaml
|
|
101
|
+
id: sdr-qualify-lead
|
|
102
|
+
name: SDR Lead Qualification
|
|
103
|
+
description: Verify BANT qualification on an inbound enterprise lead
|
|
104
|
+
agent_type: sdr
|
|
105
|
+
|
|
106
|
+
prompt: |
|
|
107
|
+
You are an expert SDR. Qualify leads using the BANT framework.
|
|
108
|
+
Identify budget, authority, need, and timeline. Recommend a next step.
|
|
109
|
+
|
|
110
|
+
test_input: |
|
|
111
|
+
Prospect: Sarah Chen, VP Engineering at TechCorp (500 employees)
|
|
112
|
+
Budget: $50k allocated this quarter for developer tooling
|
|
113
|
+
She filled out a demo request form 20 minutes ago.
|
|
114
|
+
|
|
115
|
+
expected_actions:
|
|
116
|
+
- action: identify_bant_criteria
|
|
117
|
+
required: true
|
|
118
|
+
description: Identify which BANT criteria are present
|
|
119
|
+
- action: ask_timeline_question
|
|
120
|
+
required: true
|
|
121
|
+
description: Ask about the buying timeline
|
|
122
|
+
- action: recommend_next_step
|
|
123
|
+
required: false
|
|
124
|
+
description: Propose a clear next step (demo, call, etc.)
|
|
125
|
+
|
|
126
|
+
tags: [sdr, bant]
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**3. Run**
|
|
130
|
+
```bash
|
|
131
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
132
|
+
evalgrid run
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**4. See results**
|
|
136
|
+
```
|
|
137
|
+
Running 1 scenario(s)...
|
|
138
|
+
|
|
139
|
+
✓ SDR Lead Qualification sdr PASS 0.94 3/3
|
|
140
|
+
|
|
141
|
+
1/1 scenarios passed (100%)
|
|
142
|
+
Results saved to .evalgrid/results/
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**5. Open the dashboard**
|
|
146
|
+
```bash
|
|
147
|
+
evalgrid server
|
|
148
|
+
```
|
|
149
|
+
Then visit `http://localhost:8000` for the full run history, pass rate trends, and per-step breakdowns.
|
|
150
|
+
|
|
151
|
+
> **Note:** The dashboard binds to `127.0.0.1` by default and is designed for local use only. Do not expose it to a public network.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Commands
|
|
156
|
+
|
|
157
|
+
| Command | Description |
|
|
158
|
+
|---------|-------------|
|
|
159
|
+
| `evalgrid init` | Scaffold `.evalgrid/` in your project |
|
|
160
|
+
| `evalgrid run` | Run all scenarios and print pass/fail |
|
|
161
|
+
| `evalgrid run --tag sdr` | Filter by tag |
|
|
162
|
+
| `evalgrid run --mock` | Dry run with no API calls (great for CI) |
|
|
163
|
+
| `evalgrid run --output results.json` | Save results to a file |
|
|
164
|
+
| `evalgrid scenario add` | Interactively create a new scenario |
|
|
165
|
+
| `evalgrid scenario list` | List all scenarios |
| `evalgrid scenario validate` | Validate scenario YAML files |
|
|
166
|
+
| `evalgrid server` | Start the React dashboard |
|
|
167
|
+
|
|
168
|
+
## Scenario format
|
|
169
|
+
|
|
170
|
+
```yaml
|
|
171
|
+
id: unique-scenario-id
|
|
172
|
+
name: Human Readable Name
|
|
173
|
+
description: What this scenario tests
|
|
174
|
+
agent_type: sdr | coding | support | research | finance | legal | generic
|
|
175
|
+
|
|
176
|
+
prompt: |
|
|
177
|
+
System prompt for your agent...
|
|
178
|
+
|
|
179
|
+
test_input: |
|
|
180
|
+
The user message / input to evaluate...
|
|
181
|
+
|
|
182
|
+
expected_actions:
|
|
183
|
+
- action: action_name
|
|
184
|
+
required: true
|
|
185
|
+
description: What this action entails
|
|
186
|
+
|
|
187
|
+
success_criteria: >
|
|
188
|
+
Plain-language description of what a passing response looks like.
|
|
189
|
+
|
|
190
|
+
tags: [tag1, tag2]
|
|
191
|
+
timeout_seconds: 60
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Architecture: agents and judges
|
|
195
|
+
|
|
196
|
+
EvalGrid is a runner-and-scorer framework. The agent (the LLM being tested) and the judge (the LLM doing the scoring) are configured independently and can use different providers via LiteLLM.
|
|
197
|
+
|
|
198
|
+
The example scenarios that ship with v0.1 use Claude for both — that's a demo choice, not an architectural constraint. Common real-world setups include:
|
|
199
|
+
|
|
200
|
+
- GPT-4o agent, Claude Sonnet judge (cross-model validation)
|
|
201
|
+
- Claude Haiku agent, Claude Sonnet judge (cost-efficient testing)
|
|
202
|
+
- Llama 3.1 agent, GPT-4o judge (open-source agent, frontier judge)
|
|
203
|
+
|
|
204
|
+
Configure in `.evalgrid/config.yml`:
|
|
205
|
+
|
|
206
|
+
```yaml
|
|
207
|
+
model: gpt-4o # the agent being tested
|
|
208
|
+
judge:
|
|
209
|
+
provider: anthropic
|
|
210
|
+
model: claude-sonnet-4-6 # the model scoring the agent
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Examples
|
|
214
|
+
|
|
215
|
+
Example scenarios are in the [`examples/`](./examples/) directory. Copy them to get started:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
cp examples/*.yml .evalgrid/scenarios/
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## GitHub Actions
|
|
222
|
+
|
|
223
|
+
Add to `.github/workflows/ci.yml`:
|
|
224
|
+
|
|
225
|
+
```yaml
|
|
226
|
+
- name: Run evalgrid
|
|
227
|
+
env:
|
|
228
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
229
|
+
run: evalgrid run --output results.json
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## Python API
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from evalgrid.core import EvalConfig, ScenarioLoader, ScenarioRunner
|
|
236
|
+
|
|
237
|
+
config = EvalConfig(model="claude-sonnet-4-6")
|
|
238
|
+
loader = ScenarioLoader()
|
|
239
|
+
scenarios = loader.load_dir(".evalgrid/scenarios")
|
|
240
|
+
|
|
241
|
+
runner = ScenarioRunner(config)
|
|
242
|
+
results = runner.run_all(scenarios, on_result=lambda r: print(r.scenario_name, r.status))
|
|
243
|
+
|
|
244
|
+
for result in results:
|
|
245
|
+
print(f"{result.scenario_name}: {result.pass_rate:.0%}")
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Security
|
|
249
|
+
|
|
250
|
+
evalgrid is a **local-only developer tool**. Keep these constraints in mind:
|
|
251
|
+
|
|
252
|
+
- The API server binds to `127.0.0.1` by default. Use `--bind` to change this, but be aware that exposing the server to a network grants unauthenticated access to your scenarios and eval runs.
|
|
253
|
+
- Scenario files are sandboxed to `~/.evalgrid/scenarios/`. Paths supplied via the API are validated and rejected if they escape this directory.
|
|
254
|
+
- Provider API keys (e.g. `ANTHROPIC_API_KEY`) are read from environment variables only — never pass them through the API.
|
|
255
|
+
- Do not run `evalgrid server` on a shared or public-facing machine.
|
|
256
|
+
|
|
257
|
+
## Contributing
|
|
258
|
+
|
|
259
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for full setup instructions and PR guidelines.
|
|
260
|
+
|
|
261
|
+
## License
|
|
262
|
+
|
|
263
|
+
MIT. See [LICENSE](LICENSE). Copyright (c) 2026 Naman Rai.
|