evalgrid 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. evalgrid-0.1.0/.env.example +2 -0
  2. evalgrid-0.1.0/.github/workflows/ci.yml +30 -0
  3. evalgrid-0.1.0/.github/workflows/evalgrid.yml +59 -0
  4. evalgrid-0.1.0/.github/workflows/publish.yml +72 -0
  5. evalgrid-0.1.0/.gitignore +72 -0
  6. evalgrid-0.1.0/CHANGELOG.md +29 -0
  7. evalgrid-0.1.0/CONTRIBUTING.md +82 -0
  8. evalgrid-0.1.0/LICENSE +21 -0
  9. evalgrid-0.1.0/PKG-INFO +263 -0
  10. evalgrid-0.1.0/README.md +222 -0
  11. evalgrid-0.1.0/ROADMAP.md +40 -0
  12. evalgrid-0.1.0/dashboard/index.html +12 -0
  13. evalgrid-0.1.0/dashboard/package-lock.json +3212 -0
  14. evalgrid-0.1.0/dashboard/package.json +29 -0
  15. evalgrid-0.1.0/dashboard/playwright.config.ts +17 -0
  16. evalgrid-0.1.0/dashboard/postcss.config.js +6 -0
  17. evalgrid-0.1.0/dashboard/src/App.tsx +343 -0
  18. evalgrid-0.1.0/dashboard/src/components/Header.tsx +225 -0
  19. evalgrid-0.1.0/dashboard/src/components/JudgeSettings.tsx +104 -0
  20. evalgrid-0.1.0/dashboard/src/components/RunDetail.tsx +355 -0
  21. evalgrid-0.1.0/dashboard/src/components/RunRow.tsx +399 -0
  22. evalgrid-0.1.0/dashboard/src/components/ScenariosView.tsx +337 -0
  23. evalgrid-0.1.0/dashboard/src/components/StatusBadge.tsx +46 -0
  24. evalgrid-0.1.0/dashboard/src/hooks/useRuns.ts +137 -0
  25. evalgrid-0.1.0/dashboard/src/hooks/useScenarios.ts +19 -0
  26. evalgrid-0.1.0/dashboard/src/index.css +7 -0
  27. evalgrid-0.1.0/dashboard/src/main.tsx +10 -0
  28. evalgrid-0.1.0/dashboard/src/types.ts +77 -0
  29. evalgrid-0.1.0/dashboard/tailwind.config.js +17 -0
  30. evalgrid-0.1.0/dashboard/tests/e2e/dashboard-tab.spec.ts +58 -0
  31. evalgrid-0.1.0/dashboard/tests/e2e/fixtures.ts +80 -0
  32. evalgrid-0.1.0/dashboard/tests/e2e/helpers.ts +42 -0
  33. evalgrid-0.1.0/dashboard/tests/e2e/judge-settings.spec.ts +111 -0
  34. evalgrid-0.1.0/dashboard/tests/e2e/modal.spec.ts +151 -0
  35. evalgrid-0.1.0/dashboard/tests/e2e/navigation.spec.ts +44 -0
  36. evalgrid-0.1.0/dashboard/tests/e2e/runs-tab.spec.ts +101 -0
  37. evalgrid-0.1.0/dashboard/tests/e2e/scenarios-tab.spec.ts +75 -0
  38. evalgrid-0.1.0/dashboard/tsconfig.json +21 -0
  39. evalgrid-0.1.0/dashboard/tsconfig.node.json +10 -0
  40. evalgrid-0.1.0/dashboard/vite.config.ts +15 -0
  41. evalgrid-0.1.0/docs/screenshots/dashboard-overview.png +0 -0
  42. evalgrid-0.1.0/docs/screenshots/run-detail.png +0 -0
  43. evalgrid-0.1.0/docs/screenshots/scenarios.png +0 -0
  44. evalgrid-0.1.0/evalgrid/__init__.py +3 -0
  45. evalgrid-0.1.0/evalgrid/__main__.py +4 -0
  46. evalgrid-0.1.0/evalgrid/api/__init__.py +0 -0
  47. evalgrid-0.1.0/evalgrid/api/app.py +442 -0
  48. evalgrid-0.1.0/evalgrid/api/database.py +78 -0
  49. evalgrid-0.1.0/evalgrid/cli/__init__.py +3 -0
  50. evalgrid-0.1.0/evalgrid/cli/commands/__init__.py +0 -0
  51. evalgrid-0.1.0/evalgrid/cli/commands/init.py +139 -0
  52. evalgrid-0.1.0/evalgrid/cli/commands/run.py +262 -0
  53. evalgrid-0.1.0/evalgrid/cli/commands/scenario.py +153 -0
  54. evalgrid-0.1.0/evalgrid/cli/commands/server.py +39 -0
  55. evalgrid-0.1.0/evalgrid/cli/main.py +20 -0
  56. evalgrid-0.1.0/evalgrid/core/__init__.py +14 -0
  57. evalgrid-0.1.0/evalgrid/core/loader.py +106 -0
  58. evalgrid-0.1.0/evalgrid/core/models.py +99 -0
  59. evalgrid-0.1.0/evalgrid/core/paths.py +27 -0
  60. evalgrid-0.1.0/evalgrid/core/runner.py +125 -0
  61. evalgrid-0.1.0/evalgrid/core/scorer.py +216 -0
  62. evalgrid-0.1.0/evalgrid/static/assets/index-9gTV8edJ.css +1 -0
  63. evalgrid-0.1.0/evalgrid/static/assets/index-DiafW5oN.js +205 -0
  64. evalgrid-0.1.0/evalgrid/static/index.html +13 -0
  65. evalgrid-0.1.0/examples/coding_fix_bug.yml +56 -0
  66. evalgrid-0.1.0/examples/sdr_qualify_lead.yml +46 -0
  67. evalgrid-0.1.0/pyproject.toml +73 -0
  68. evalgrid-0.1.0/tests/__init__.py +0 -0
  69. evalgrid-0.1.0/tests/conftest.py +45 -0
  70. evalgrid-0.1.0/tests/fixtures/invalid_scenario.yml +6 -0
  71. evalgrid-0.1.0/tests/fixtures/multi_step_scenario.yml +32 -0
  72. evalgrid-0.1.0/tests/fixtures/valid_scenario.yml +13 -0
  73. evalgrid-0.1.0/tests/test_api.py +140 -0
  74. evalgrid-0.1.0/tests/test_cli.py +48 -0
  75. evalgrid-0.1.0/tests/test_cli_smoke.py +10 -0
  76. evalgrid-0.1.0/tests/test_database.py +97 -0
  77. evalgrid-0.1.0/tests/test_loader.py +113 -0
  78. evalgrid-0.1.0/tests/test_models.py +64 -0
  79. evalgrid-0.1.0/tests/test_runner.py +108 -0
  80. evalgrid-0.1.0/tests/test_scorer.py +122 -0
  81. evalgrid-0.1.0/tests/test_yaml_schema.py +20 -0
@@ -0,0 +1,2 @@
1
+ ANTHROPIC_API_KEY=sk-ant-your-key-here
2
+ OPENAI_API_KEY=sk-your-key-here-optional
@@ -0,0 +1,30 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ name: Test
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v6
16
+
17
+ - name: Set up Python 3.11
18
+ uses: actions/setup-python@v6
19
+ with:
20
+ python-version: "3.11"
21
+ cache: pip
22
+
23
+ - name: Install package and dev dependencies
24
+ run: pip install -e ".[dev]"
25
+
26
+ - name: Run tests
27
+ run: pytest
28
+
29
+ - name: Verify CLI
30
+ run: evalgrid --help
@@ -0,0 +1,59 @@
1
+ name: evalgrid
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ eval:
8
+ name: Run Evals
9
+ runs-on: ubuntu-latest
10
+ timeout-minutes: 30
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.11"
19
+ cache: pip
20
+
21
+ - name: Install evalgrid
22
+ run: pip install -e ".[dev]"
23
+
24
+ - name: Initialize evalgrid
25
+ run: evalgrid init
26
+
27
+ - name: Copy example scenarios
28
+ run: |
29
+ if [ -d examples ]; then
30
+ cp examples/*.yml .evalgrid/scenarios/ 2>/dev/null || true
31
+ fi
32
+
33
+ - name: Run Evals (mock mode — no API key required)
34
+ run: |
35
+ evalgrid run \
36
+ --mock \
37
+ --output results.json \
38
+ --config .evalgrid/config.yml
39
+
40
+ - name: Run Evals with real API (optional)
41
+ env:
42
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
43
+ run: |
44
+ if [ -n "$ANTHROPIC_API_KEY" ]; then
45
+ evalgrid run \
46
+ --output results-real.json \
47
+ --config .evalgrid/config.yml
48
+ else
49
+ echo "Skipping real API run — ANTHROPIC_API_KEY not configured"
50
+ fi
51
+
52
+ - name: Upload Results
53
+ uses: actions/upload-artifact@v4
54
+ if: always()
55
+ with:
56
+ name: evalgrid-results-${{ github.run_id }}
57
+ path: results.json
58
+ retention-days: 30
59
+
@@ -0,0 +1,72 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ test:
9
+ name: Run tests
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v6
14
+
15
+ - name: Set up Python 3.11
16
+ uses: actions/setup-python@v6
17
+ with:
18
+ python-version: "3.11"
19
+ cache: pip
20
+
21
+ - name: Install package and dev dependencies
22
+ run: pip install -e ".[dev]"
23
+
24
+ - name: Run tests
25
+ run: pytest
26
+
27
+ build:
28
+ name: Build distribution
29
+ needs: test
30
+ runs-on: ubuntu-latest
31
+
32
+ steps:
33
+ - uses: actions/checkout@v6
34
+
35
+ - name: Set up Python 3.11
36
+ uses: actions/setup-python@v6
37
+ with:
38
+ python-version: "3.11"
39
+ cache: pip
40
+
41
+ - name: Install build tools
42
+ run: pip install build
43
+
44
+ - name: Build sdist and wheel
45
+ run: python -m build
46
+
47
+ - name: Upload dist artifacts
48
+ uses: actions/upload-artifact@v4
49
+ with:
50
+ name: dist
51
+ path: dist/
52
+
53
+ publish:
54
+ name: Publish to PyPI
55
+ needs: build
56
+ runs-on: ubuntu-latest
57
+ environment:
58
+ name: pypi
59
+ url: https://pypi.org/project/evalgrid/
60
+
61
+ permissions:
62
+ id-token: write
63
+
64
+ steps:
65
+ - name: Download dist artifacts
66
+ uses: actions/download-artifact@v4
67
+ with:
68
+ name: dist
69
+ path: dist/
70
+
71
+ - name: Publish to PyPI
72
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,72 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .eggs/
11
+ *.egg
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # Testing
19
+ .pytest_cache/
20
+ .coverage
21
+ htmlcov/
22
+ coverage.xml
23
+
24
+ # Databases (dev artifacts)
25
+ *.db
26
+ *.sqlite
27
+
28
+ # evalgrid runtime
29
+ .evalgrid/
30
+ .agenteval/
31
+ sdr-demo-result.json
32
+ sdr-real-result.json
33
+ *-results.json
34
+ *-result.json
35
+ CLAUDE.md
36
+ .claude/
37
+ .private/
38
+
39
+ # Node / dashboard
40
+ dashboard/node_modules/
41
+ dashboard/dist/
42
+ dashboard/test-results/
43
+ dashboard/Scenario
44
+
45
+ # IDE
46
+ .vscode/
47
+ .idea/
48
+ *.swp
49
+ *.swo
50
+
51
+ # Environment / secrets
52
+ .env
53
+ .env.local
54
+ .env.*.local
55
+
56
+ # Node (general)
57
+ node_modules/
58
+
59
+ # OS
60
+ .DS_Store
61
+ Thumbs.db
62
+
63
+ # Build artifacts
64
+ *.whl
65
+ *.tar.gz
66
+
67
+ # mypy cache
68
+ .mypy_cache/
69
+
70
+ # ruff cache
71
+ .ruff_cache/
72
+ templates/
@@ -0,0 +1,29 @@
1
+ # Changelog
2
+
3
+ All notable changes to evalgrid will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
6
+
7
+ ## [0.1.0] - 2026-05-06
8
+
9
+ Initial release.
10
+
11
+ ### Added
12
+
13
+ - `evalgrid init` — scaffold `.evalgrid/` config and example scenario
14
+ - `evalgrid run` — run all scenarios and print pass/fail results
15
+ - `evalgrid run --mock` — dry run with no API calls (CI-friendly)
16
+ - `evalgrid run --tag <tag>` — filter scenarios by tag
17
+ - `evalgrid run --output <file>` — save results to JSON
18
+ - `evalgrid scenario add` — interactively create a new scenario
19
+ - `evalgrid scenario list` — list all scenarios in the scenarios directory
20
+ - `evalgrid scenario validate` — validate scenario YAML files
21
+ - `evalgrid server` — start the React dashboard at `http://localhost:8000`
22
+ - FastAPI REST API: `GET /api/runs`, `POST /api/runs`, `GET /api/runs/{id}`, `GET /api/scenarios`, `POST /api/scenarios`
23
+ - SQLite persistence via SQLAlchemy async (results survive server restarts)
24
+ - React dashboard: pass rate ring chart, avg latency sparkline, run history, per-step score breakdowns
25
+ - Dark/light mode, mobile responsive dashboard
26
+ - LLM-as-judge scoring via litellm (supports Anthropic, OpenAI, Google, Ollama)
27
+ - Mock scorer for CI runs that require no API key
28
+ - 8 example scenarios: Claude Code suite (5 scenarios), hello-world, sdr-qualify-lead, support-triage
29
+ - Python API: `EvalConfig`, `ScenarioLoader`, `ScenarioRunner`
@@ -0,0 +1,82 @@
1
+ # Contributing to evalgrid
2
+
3
+ Thank you for your interest in contributing to evalgrid.
4
+
5
+ ## Prerequisites
6
+
7
+ - Python 3.11 or newer
8
+ - Node.js 18+ (only if working on the dashboard)
9
+ - Git
10
+
11
+ ## Dev Setup
12
+
13
+ ```bash
14
+ git clone https://github.com/naman006-rai/evalgrid.git
15
+ cd evalgrid
16
+ pip install -e ".[dev]"
17
+ ```
18
+
19
+ This installs evalgrid in editable mode along with the development dependencies (pytest, pytest-asyncio, pytest-cov, ruff, mypy).
20
+
21
+ ## Running Tests
22
+
23
+ ```bash
24
+ pytest tests/ -v
25
+ ```
26
+
27
+ All tests must pass before submitting a pull request. The CI workflow runs the full suite automatically.
28
+
29
+ To run a specific test file:
30
+
31
+ ```bash
32
+ pytest tests/test_api.py -v
33
+ ```
34
+
35
+ ## Dashboard (optional)
36
+
37
+ If you are modifying the React dashboard:
38
+
39
+ ```bash
40
+ cd dashboard
41
+ npm install
42
+ npm run dev # dev server at localhost:5173
43
+ npm run build # production build
44
+ ```
45
+
46
+ After building, copy the output to the static directory:
47
+
48
+ ```bash
49
+ # Windows (PowerShell)
50
+ Copy-Item -Recurse -Force dashboard\dist\* evalgrid\static\
51
+
52
+ # macOS / Linux
53
+ cp -r dashboard/dist/* evalgrid/static/
54
+ ```
55
+
56
+ ## Code Style
57
+
58
+ Python code is linted with `ruff` and type-checked with `mypy`:
59
+
60
+ ```bash
61
+ ruff check evalgrid/
62
+ mypy evalgrid/
63
+ ```
64
+
65
+ ## PR Guidelines
66
+
67
+ 1. Fork the repo and create a branch: `git checkout -b feat/my-feature`
68
+ 2. Write tests for your change (keep coverage above 80%)
69
+ 3. Run the full suite: `pytest tests/ -v`
70
+ 4. Run linting: `ruff check evalgrid/`
71
+ 5. Open a pull request against `main` with a clear description of what changes and why
72
+ 6. CI will run the test suite automatically on your pull request (the eval workflow is triggered manually) — fix any failures before requesting review
73
+
74
+ ## Reporting Issues
75
+
76
+ Open a GitHub issue at <https://github.com/naman006-rai/evalgrid/issues>.
77
+
78
+ Include:
79
+ - evalgrid version (`pip show evalgrid`)
80
+ - Python version (`python --version`)
81
+ - Steps to reproduce
82
+ - Expected vs actual behavior
evalgrid-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Naman Rai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,263 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalgrid
3
+ Version: 0.1.0
4
+ Summary: Agent evaluation framework — run, score, and track AI agent correctness at scale.
5
+ Project-URL: Homepage, https://github.com/naman006-rai/evalgrid
6
+ Project-URL: Documentation, https://github.com/naman006-rai/evalgrid#readme
7
+ Project-URL: Repository, https://github.com/naman006-rai/evalgrid
8
+ Project-URL: Issues, https://github.com/naman006-rai/evalgrid/issues
9
+ Project-URL: Bug Tracker, https://github.com/naman006-rai/evalgrid/issues
10
+ Author-email: Naman Rai <naman.rai006@gmail.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: agents,ai,evaluation,llm,testing
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Requires-Python: >=3.11
21
+ Requires-Dist: aiosqlite>=0.20.0
22
+ Requires-Dist: click>=8.1.0
23
+ Requires-Dist: fastapi>=0.115.0
24
+ Requires-Dist: httpx>=0.27.0
25
+ Requires-Dist: litellm<2.0.0,>=1.0.0
26
+ Requires-Dist: pydantic>=2.0.0
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: rich>=13.0.0
29
+ Requires-Dist: sqlalchemy>=2.0.0
30
+ Requires-Dist: uvicorn[standard]>=0.32.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: mypy>=1.13.0; extra == 'dev'
33
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
34
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
35
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.7.0; extra == 'dev'
37
+ Provides-Extra: postgres
38
+ Requires-Dist: asyncpg>=0.30.0; extra == 'postgres'
39
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
40
+ Description-Content-Type: text/markdown
41
+
42
+ <div align="center">
43
+
44
+ <svg width="52" height="52" viewBox="0 0 36 36" fill="none" xmlns="http://www.w3.org/2000/svg">
45
+ <rect x="2" y="2" width="14" height="14" rx="3" fill="#5b5ef4"/>
46
+ <rect x="20" y="2" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.4"/>
47
+ <rect x="2" y="20" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.4"/>
48
+ <rect x="20" y="20" width="14" height="14" rx="3" fill="#5b5ef4" opacity="0.15"/>
49
+ <path d="M8 9 L11 12 L16 6" stroke="white" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"/>
50
+ </svg>
51
+
52
+ # evalgrid
53
+
54
+ **Agent evaluation framework — run, score, and track AI agent correctness at scale.**
55
+
56
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
57
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
58
+
59
+ AI agents fail in ways unit tests can't catch: the customer support agent stops calling the refund tool after a prompt change; the research agent loops on the same Google search 23 times; the reconciliation agent silently picks the wrong account format on edge cases. evalgrid catches these regressions before they ship.
60
+
61
+ </div>
62
+
63
+ ## Status
64
+
65
+ evalgrid is in early development (v0.1). The framework runs reliably for local-first agent evaluation, but you should expect:
66
+
67
+ - Some rough edges in the dashboard UI
68
+ - Schema changes in scenario YAML between v0.1 and v0.2
69
+ - GitHub status check / CI gate coming in v0.2
70
+
71
+ If you're building agents and want eval-driven workflows now, this works. If you need production-grade CI integration, watch for v0.2.
72
+
73
+ ## Screenshots
74
+
75
+ ![Dashboard overview](docs/screenshots/dashboard-overview.png)
76
+ *Dashboard showing run history with pass rates and per-scenario breakdowns*
77
+
78
+ ![Run detail](docs/screenshots/run-detail.png)
79
+ *Per-step verdicts with judge reasoning for a failed scenario*
80
+
81
+ ---
82
+
83
+ ## Install
84
+
85
+ ```bash
86
+ pip install evalgrid
87
+ ```
88
+
89
+ ## Quick start
90
+
91
+ **1. Initialize**
92
+ ```bash
93
+ evalgrid init
94
+ ```
95
+ Scaffolds `.evalgrid/` with a config file and an example scenario.
96
+
97
+ **2. Write a scenario**
98
+
99
+ `.evalgrid/scenarios/qualify_lead.yml`:
100
+ ```yaml
101
+ id: sdr-qualify-lead
102
+ name: SDR Lead Qualification
103
+ description: Verify BANT qualification on an inbound enterprise lead
104
+ agent_type: sdr
105
+
106
+ prompt: |
107
+ You are an expert SDR. Qualify leads using the BANT framework.
108
+ Identify budget, authority, need, and timeline. Recommend a next step.
109
+
110
+ test_input: |
111
+ Prospect: Sarah Chen, VP Engineering at TechCorp (500 employees)
112
+ Budget: $50k allocated this quarter for developer tooling
113
+ She filled out a demo request form 20 minutes ago.
114
+
115
+ expected_actions:
116
+ - action: identify_bant_criteria
117
+ required: true
118
+ description: Identify which BANT criteria are present
119
+ - action: ask_timeline_question
120
+ required: true
121
+ description: Ask about the buying timeline
122
+ - action: recommend_next_step
123
+ required: false
124
+ description: Propose a clear next step (demo, call, etc.)
125
+
126
+ tags: [sdr, bant]
127
+ ```
128
+
129
+ **3. Run**
130
+ ```bash
131
+ export ANTHROPIC_API_KEY=sk-ant-...
132
+ evalgrid run
133
+ ```
134
+
135
+ **4. See results**
136
+ ```
137
+ Running 1 scenario(s)...
138
+
139
+ ✓ SDR Lead Qualification sdr PASS 0.94 3/3
140
+
141
+ 1/1 scenarios passed (100%)
142
+ Results saved to .evalgrid/results/
143
+ ```
144
+
145
+ **5. Open the dashboard**
146
+ ```bash
147
+ evalgrid server
148
+ ```
149
+ Then visit `http://localhost:8000` for the full run history, pass rate trends, and per-step breakdowns.
150
+
151
+ > **Note:** The dashboard binds to `127.0.0.1` by default and is designed for local use only. Do not expose it to a public network.
152
+
153
+ ---
154
+
155
+ ## Commands
156
+
157
+ | Command | Description |
158
+ |---------|-------------|
159
+ | `evalgrid init` | Scaffold `.evalgrid/` in your project |
160
+ | `evalgrid run` | Run all scenarios and print pass/fail |
161
+ | `evalgrid run --tag sdr` | Filter by tag |
162
+ | `evalgrid run --mock` | Dry run with no API calls (great for CI) |
163
+ | `evalgrid run --output results.json` | Save results to a file |
164
+ | `evalgrid scenario add` | Interactively create a new scenario |
165
+ | `evalgrid scenario list` | List all scenarios |
166
+ | `evalgrid server` | Start the React dashboard |
167
+
168
+ ## Scenario format
169
+
170
+ ```yaml
171
+ id: unique-scenario-id
172
+ name: Human Readable Name
173
+ description: What this scenario tests
174
+ agent_type: sdr | coding | support | research | finance | legal | generic
175
+
176
+ prompt: |
177
+ System prompt for your agent...
178
+
179
+ test_input: |
180
+ The user message / input to evaluate...
181
+
182
+ expected_actions:
183
+ - action: action_name
184
+ required: true
185
+ description: What this action entails
186
+
187
+ success_criteria: >
188
+ Plain-language description of what a passing response looks like.
189
+
190
+ tags: [tag1, tag2]
191
+ timeout_seconds: 60
192
+ ```
193
+
194
+ ## Architecture: agents and judges
195
+
196
+ evalgrid is a runner-and-scorer framework. The agent (the LLM being tested) and the judge (the LLM doing the scoring) are configured independently and can use different providers via LiteLLM.
197
+
198
+ The example scenarios that ship with v0.1 use Claude for both — that's a demo choice, not an architectural constraint. Common real-world setups include:
199
+
200
+ - GPT-4o agent, Claude Sonnet judge (cross-model validation)
201
+ - Claude Haiku agent, Claude Sonnet judge (cost-efficient testing)
202
+ - Llama 3.1 agent, GPT-4o judge (open-source agent, frontier judge)
203
+
204
+ Configure in `.evalgrid/config.yml`:
205
+
206
+ ```yaml
207
+ model: gpt-4o # the agent being tested
208
+ judge:
209
+ provider: anthropic
210
+ model: claude-sonnet-4-6 # the model scoring the agent
211
+ ```
212
+
213
+ ## Examples
214
+
215
+ Example scenarios are in the [`examples/`](./examples/) directory. Copy them to get started:
216
+
217
+ ```bash
218
+ cp examples/*.yml .evalgrid/scenarios/
219
+ ```
220
+
221
+ ## GitHub Actions
222
+
223
+ Add this step to a job in `.github/workflows/ci.yml`, after a step that installs evalgrid:
224
+
225
+ ```yaml
226
+ - name: Run evalgrid
227
+ env:
228
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
229
+ run: evalgrid run --output results.json
230
+ ```
231
+
232
+ ## Python API
233
+
234
+ ```python
235
+ from evalgrid.core import EvalConfig, ScenarioLoader, ScenarioRunner
236
+
237
+ config = EvalConfig(model="claude-sonnet-4-6")
238
+ loader = ScenarioLoader()
239
+ scenarios = loader.load_dir(".evalgrid/scenarios")
240
+
241
+ runner = ScenarioRunner(config)
242
+ results = runner.run_all(scenarios, on_result=lambda r: print(r.scenario_name, r.status))
243
+
244
+ for result in results:
245
+ print(f"{result.scenario_name}: {result.pass_rate:.0%}")
246
+ ```
247
+
248
+ ## Security
249
+
250
+ evalgrid is a **local-only developer tool**. Keep these constraints in mind:
251
+
252
+ - The API server binds to `127.0.0.1` by default. Use `--bind` to change this, but be aware that exposing the server to a network grants unauthenticated access to your scenarios and eval runs.
253
+ - Scenario files are sandboxed to `~/.evalgrid/scenarios/`. Paths supplied via the API are validated and rejected if they escape this directory.
254
+ - Provider API keys (e.g. `ANTHROPIC_API_KEY`) are read from environment variables only — never pass them through the API.
255
+ - Do not run `evalgrid server` on a shared or public-facing machine.
256
+
257
+ ## Contributing
258
+
259
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for full setup instructions and PR guidelines.
260
+
261
+ ## License
262
+
263
+ MIT. See [LICENSE](LICENSE). Copyright (c) 2026 Naman Rai.