evaldeck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. evaldeck-0.1.0/.claude/settings.local.json +31 -0
  2. evaldeck-0.1.0/.devcontainer/Dockerfile +30 -0
  3. evaldeck-0.1.0/.devcontainer/devcontainer.json +60 -0
  4. evaldeck-0.1.0/.github/workflows/ci.yaml +65 -0
  5. evaldeck-0.1.0/.github/workflows/docs.yaml +50 -0
  6. evaldeck-0.1.0/.github/workflows/publish.yaml +28 -0
  7. evaldeck-0.1.0/.gitignore +93 -0
  8. evaldeck-0.1.0/.pre-commit-config.yaml +27 -0
  9. evaldeck-0.1.0/CONTRIBUTING.md +227 -0
  10. evaldeck-0.1.0/LICENSE +190 -0
  11. evaldeck-0.1.0/PKG-INFO +363 -0
  12. evaldeck-0.1.0/README.md +312 -0
  13. evaldeck-0.1.0/docs/api/config.md +7 -0
  14. evaldeck-0.1.0/docs/api/evalcase.md +29 -0
  15. evaldeck-0.1.0/docs/api/evaluation-result.md +23 -0
  16. evaldeck-0.1.0/docs/api/evaluator.md +15 -0
  17. evaldeck-0.1.0/docs/api/grade-result.md +20 -0
  18. evaldeck-0.1.0/docs/api/graders/base.md +15 -0
  19. evaldeck-0.1.0/docs/api/graders/code.md +71 -0
  20. evaldeck-0.1.0/docs/api/graders/llm.md +17 -0
  21. evaldeck-0.1.0/docs/api/index.md +86 -0
  22. evaldeck-0.1.0/docs/api/metrics.md +48 -0
  23. evaldeck-0.1.0/docs/api/step.md +7 -0
  24. evaldeck-0.1.0/docs/api/trace.md +39 -0
  25. evaldeck-0.1.0/docs/concepts/architecture.md +279 -0
  26. evaldeck-0.1.0/docs/concepts/evaluation-workflow.md +376 -0
  27. evaldeck-0.1.0/docs/concepts/grading-strategies.md +318 -0
  28. evaldeck-0.1.0/docs/concepts/index.md +147 -0
  29. evaldeck-0.1.0/docs/concepts/traces.md +361 -0
  30. evaldeck-0.1.0/docs/contributing/adding-graders.md +331 -0
  31. evaldeck-0.1.0/docs/contributing/adding-integrations.md +245 -0
  32. evaldeck-0.1.0/docs/contributing/adding-metrics.md +299 -0
  33. evaldeck-0.1.0/docs/contributing/code-standards.md +287 -0
  34. evaldeck-0.1.0/docs/contributing/index.md +56 -0
  35. evaldeck-0.1.0/docs/contributing/setup.md +220 -0
  36. evaldeck-0.1.0/docs/examples/basic-usage.md +248 -0
  37. evaldeck-0.1.0/docs/examples/index.md +97 -0
  38. evaldeck-0.1.0/docs/examples/langchain-agent.md +344 -0
  39. evaldeck-0.1.0/docs/examples/llm-judge.md +322 -0
  40. evaldeck-0.1.0/docs/examples/tool-calls.md +232 -0
  41. evaldeck-0.1.0/docs/getting-started/first-evaluation.md +287 -0
  42. evaldeck-0.1.0/docs/getting-started/index.md +74 -0
  43. evaldeck-0.1.0/docs/getting-started/installation.md +177 -0
  44. evaldeck-0.1.0/docs/getting-started/quickstart.md +183 -0
  45. evaldeck-0.1.0/docs/includes/abbreviations.md +8 -0
  46. evaldeck-0.1.0/docs/index.md +146 -0
  47. evaldeck-0.1.0/docs/stylesheets/extra.css +106 -0
  48. evaldeck-0.1.0/docs/user-guide/ci-cd.md +411 -0
  49. evaldeck-0.1.0/docs/user-guide/cli.md +293 -0
  50. evaldeck-0.1.0/docs/user-guide/configuration.md +301 -0
  51. evaldeck-0.1.0/docs/user-guide/graders/code-based.md +367 -0
  52. evaldeck-0.1.0/docs/user-guide/graders/custom.md +339 -0
  53. evaldeck-0.1.0/docs/user-guide/graders/index.md +194 -0
  54. evaldeck-0.1.0/docs/user-guide/graders/llm-based.md +327 -0
  55. evaldeck-0.1.0/docs/user-guide/index.md +176 -0
  56. evaldeck-0.1.0/docs/user-guide/integrations/index.md +221 -0
  57. evaldeck-0.1.0/docs/user-guide/integrations/manual.md +322 -0
  58. evaldeck-0.1.0/docs/user-guide/integrations/opentelemetry.md +226 -0
  59. evaldeck-0.1.0/docs/user-guide/metrics.md +331 -0
  60. evaldeck-0.1.0/docs/user-guide/test-cases.md +447 -0
  61. evaldeck-0.1.0/examples/basic_usage.py +117 -0
  62. evaldeck-0.1.0/examples/langchain_react_agent.py +343 -0
  63. evaldeck-0.1.0/mkdocs.yml +192 -0
  64. evaldeck-0.1.0/pyproject.toml +102 -0
  65. evaldeck-0.1.0/src/evaldeck/__init__.py +88 -0
  66. evaldeck-0.1.0/src/evaldeck/cli.py +324 -0
  67. evaldeck-0.1.0/src/evaldeck/config.py +223 -0
  68. evaldeck-0.1.0/src/evaldeck/evaluator.py +566 -0
  69. evaldeck-0.1.0/src/evaldeck/graders/__init__.py +36 -0
  70. evaldeck-0.1.0/src/evaldeck/graders/base.py +146 -0
  71. evaldeck-0.1.0/src/evaldeck/graders/code.py +484 -0
  72. evaldeck-0.1.0/src/evaldeck/graders/llm.py +344 -0
  73. evaldeck-0.1.0/src/evaldeck/integrations/__init__.py +29 -0
  74. evaldeck-0.1.0/src/evaldeck/integrations/opentelemetry.py +416 -0
  75. evaldeck-0.1.0/src/evaldeck/metrics/__init__.py +25 -0
  76. evaldeck-0.1.0/src/evaldeck/metrics/base.py +62 -0
  77. evaldeck-0.1.0/src/evaldeck/metrics/builtin.py +195 -0
  78. evaldeck-0.1.0/src/evaldeck/results.py +211 -0
  79. evaldeck-0.1.0/src/evaldeck/test_case.py +162 -0
  80. evaldeck-0.1.0/src/evaldeck/trace.py +215 -0
  81. evaldeck-0.1.0/tests/__init__.py +1 -0
  82. evaldeck-0.1.0/tests/conftest.py +52 -0
  83. evaldeck-0.1.0/tests/test_evaluator.py +429 -0
  84. evaldeck-0.1.0/tests/test_graders.py +247 -0
  85. evaldeck-0.1.0/tests/test_trace.py +115 -0
@@ -0,0 +1,31 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python -m pytest:*)",
5
+ "Bash(grep:*)",
6
+ "Bash(python:*)",
7
+ "Bash(python3:*)",
8
+ "Bash(PYTHONPATH=src python3:*)",
9
+ "Bash(PYTHONPATH=src /opt/homebrew/bin/python3.11:*)",
10
+ "Bash(wc:*)",
11
+ "WebFetch(domain:docs.langchain.com)",
12
+ "WebSearch",
13
+ "WebFetch(domain:arize-ai.github.io)",
14
+ "WebFetch(domain:docs.arize.com)",
15
+ "WebFetch(domain:arize.com)",
16
+ "WebFetch(domain:github.com)",
17
+ "Bash(ls:*)",
18
+ "Bash(find:*)",
19
+ "Bash(pip install:*)",
20
+ "Bash(pip3 install:*)",
21
+ "Bash(mkdocs serve --help:*)",
22
+ "Bash(git init:*)",
23
+ "Bash(git add:*)",
24
+ "Bash(git commit:*)",
25
+ "Bash(git branch:*)",
26
+ "Bash(git remote add:*)",
27
+ "Bash(git push:*)",
28
+ "Bash(ruff check:*)"
29
+ ]
30
+ }
31
+ }
@@ -0,0 +1,30 @@
1
+ FROM mcr.microsoft.com/devcontainers/python:1-3.11-bullseye
2
+
3
+ # Remove yarn repo (has expired GPG key) and install system dependencies
4
+ RUN rm -f /etc/apt/sources.list.d/yarn.list && \
5
+ apt-get update && apt-get install -y --no-install-recommends \
6
+ build-essential \
7
+ curl \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Set working directory
11
+ WORKDIR /workspaces/evaldeck
12
+
13
+ # Upgrade pip
14
+ RUN pip install --upgrade pip
15
+
16
+ # Install Python development tools
17
+ RUN pip install --no-cache-dir \
18
+ ruff \
19
+ mypy \
20
+ pytest \
21
+ pytest-asyncio \
22
+ pre-commit \
23
+ build \
24
+ twine
25
+
26
+ # Create directory for evaldeck output
27
+ RUN mkdir -p /workspaces/evaldeck/.evaldeck
28
+
29
+ # Set Python path
30
+ ENV PYTHONPATH="/workspaces/evaldeck/src:${PYTHONPATH}"
@@ -0,0 +1,60 @@
1
+ {
2
+ "name": "Evaldeck Development",
3
+ "build": {
4
+ "dockerfile": "Dockerfile",
5
+ "context": ".."
6
+ },
7
+ "features": {
8
+ "ghcr.io/devcontainers/features/git:1": {},
9
+ "ghcr.io/devcontainers/features/github-cli:1": {}
10
+ },
11
+ "customizations": {
12
+ "vscode": {
13
+ "extensions": [
14
+ "ms-python.python",
15
+ "ms-python.vscode-pylance",
16
+ "charliermarsh.ruff",
17
+ "tamasfe.even-better-toml",
18
+ "redhat.vscode-yaml",
19
+ "github.copilot"
20
+ ],
21
+ "settings": {
22
+ "python.defaultInterpreterPath": "/usr/local/bin/python",
23
+ "python.testing.pytestEnabled": true,
24
+ "python.testing.pytestArgs": [
25
+ "tests"
26
+ ],
27
+ "editor.formatOnSave": true,
28
+ "editor.codeActionsOnSave": {
29
+ "source.organizeImports": "explicit",
30
+ "source.fixAll": "explicit"
31
+ },
32
+ "[python]": {
33
+ "editor.defaultFormatter": "charliermarsh.ruff"
34
+ },
35
+ "files.exclude": {
36
+ "**/__pycache__": true,
37
+ "**/*.pyc": true,
38
+ "**/.pytest_cache": true,
39
+ "**/.mypy_cache": true,
40
+ "**/.ruff_cache": true,
41
+ "**/*.egg-info": true
42
+ }
43
+ }
44
+ }
45
+ },
46
+ "postCreateCommand": "pip install -e '.[dev,all,docs]' && (git rev-parse --git-dir > /dev/null 2>&1 && pre-commit install || echo 'Skipping pre-commit install (not a git repo)')",
47
+ "runArgs": [
48
+ "--env-file",
49
+ "${localWorkspaceFolder}/.env"
50
+ ],
51
+ "forwardPorts": [],
52
+ "remoteUser": "vscode",
53
+ "mounts": [
54
+ "source=${localWorkspaceFolder}/.evaldeck,target=/workspaces/evaldeck/.evaldeck,type=bind,consistency=cached"
55
+ ],
56
+ "remoteEnv": {
57
+ "PYTHONDONTWRITEBYTECODE": "1",
58
+ "PYTHONUNBUFFERED": "1"
59
+ }
60
+ }
@@ -0,0 +1,65 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Lint with ruff
30
+ run: |
31
+ ruff check src/ tests/
32
+
33
+ - name: Type check with mypy
34
+ run: |
35
+ mypy src/
36
+
37
+ - name: Run tests
38
+ run: |
39
+ pytest tests/ -v --tb=short
40
+
41
+ build:
42
+ runs-on: ubuntu-latest
43
+ needs: test
44
+
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+
48
+ - name: Set up Python
49
+ uses: actions/setup-python@v5
50
+ with:
51
+ python-version: "3.11"
52
+
53
+ - name: Install build dependencies
54
+ run: |
55
+ python -m pip install --upgrade pip
56
+ pip install build
57
+
58
+ - name: Build package
59
+ run: |
60
+ python -m build
61
+
62
+ - name: Check package
63
+ run: |
64
+ pip install twine
65
+ twine check dist/*
@@ -0,0 +1,50 @@
1
+ name: Deploy Docs
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: read
11
+ pages: write
12
+ id-token: write
13
+
14
+ concurrency:
15
+ group: "pages"
16
+ cancel-in-progress: false
17
+
18
+ jobs:
19
+ build:
20
+ runs-on: ubuntu-latest
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+
24
+ - name: Set up Python
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.11"
28
+
29
+ - name: Install dependencies
30
+ run: |
31
+ pip install -e ".[docs]"
32
+
33
+ - name: Build docs
34
+ run: mkdocs build
35
+
36
+ - name: Upload artifact
37
+ uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ path: site/
40
+
41
+ deploy:
42
+ environment:
43
+ name: github-pages
44
+ url: ${{ steps.deployment.outputs.page_url }}
45
+ runs-on: ubuntu-latest
46
+ needs: build
47
+ steps:
48
+ - name: Deploy to GitHub Pages
49
+ id: deployment
50
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,28 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write # Required for trusted publishing
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+
21
+ - name: Install build tools
22
+ run: pip install build
23
+
24
+ - name: Build package
25
+ run: python -m build
26
+
27
+ - name: Publish to PyPI
28
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,93 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Environments
54
+ .env
55
+ .venv
56
+ env/
57
+ venv/
58
+ ENV/
59
+ env.bak/
60
+ venv.bak/
61
+
62
+ # IDE
63
+ .idea/
64
+ .vscode/
65
+ *.swp
66
+ *.swo
67
+ *~
68
+ .project
69
+ .pydevproject
70
+ .settings/
71
+
72
+ # mypy
73
+ .mypy_cache/
74
+ .dmypy.json
75
+ dmypy.json
76
+
77
+ # ruff
78
+ .ruff_cache/
79
+
80
+ # Evaldeck specific
81
+ .evaldeck/
82
+ evaldeck_results/
83
+ *.evaldeck.json
84
+
85
+ # Jupyter
86
+ .ipynb_checkpoints/
87
+
88
+ # OS
89
+ .DS_Store
90
+ Thumbs.db
91
+
92
+ # Internal
93
+ internal/
@@ -0,0 +1,27 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ - id: check-merge-conflict
10
+
11
+ - repo: https://github.com/astral-sh/ruff-pre-commit
12
+ rev: v0.1.9
13
+ hooks:
14
+ - id: ruff
15
+ args: [--fix]
16
+ - id: ruff-format
17
+
18
+ - repo: https://github.com/pre-commit/mirrors-mypy
19
+ rev: v1.8.0
20
+ hooks:
21
+ - id: mypy
22
+ additional_dependencies:
23
+ - pydantic>=2.0
24
+ - click>=8.0
25
+ - types-PyYAML
26
+ args: [--ignore-missing-imports]
27
+ files: ^src/
@@ -0,0 +1,227 @@
1
+ # Contributing to Evaldeck
2
+
3
+ Thank you for your interest in contributing to Evaldeck! This document provides guidelines and instructions for contributing.
4
+
5
+ ## Code of Conduct
6
+
7
+ By participating in this project, you agree to maintain a respectful and inclusive environment for everyone.
8
+
9
+ ## How to Contribute
10
+
11
+ ### Reporting Bugs
12
+
13
+ Before submitting a bug report:
14
+ 1. Check existing issues to avoid duplicates
15
+ 2. Use the latest version of Evaldeck
16
+ 3. Collect relevant information (Python version, OS, stack trace)
17
+
18
+ When submitting a bug report, include:
19
+ - A clear, descriptive title
20
+ - Steps to reproduce the issue
21
+ - Expected vs actual behavior
22
+ - Code samples if applicable
23
+ - Environment details
24
+
25
+ ### Suggesting Features
26
+
27
+ Feature requests are welcome! Please:
28
+ 1. Check existing issues and discussions first
29
+ 2. Describe the use case and problem you're trying to solve
30
+ 3. Explain how the feature would work
31
+ 4. Consider if it fits Evaldeck's scope (agent evaluation)
32
+
33
+ ### Pull Requests
34
+
35
+ 1. **Fork and clone** the repository
36
+ 2. **Create a branch** from `main`:
37
+ ```bash
38
+ git checkout -b feature/your-feature-name
39
+ ```
40
+ 3. **Set up development environment**:
41
+ ```bash
42
+ pip install -e ".[dev]"
43
+ pre-commit install
44
+ ```
45
+ 4. **Make your changes** following our coding standards
46
+ 5. **Add tests** for new functionality
47
+ 6. **Run the test suite, linter, and type checker**:
48
+ ```bash
49
+ pytest
50
+ ruff check .
51
+ mypy src/
52
+ ```
53
+ 7. **Commit your changes** with a clear message:
54
+ ```bash
55
+ git commit -m "feat: add support for X"
56
+ ```
57
+ 8. **Push and open a PR** against `main`
58
+
59
+ ## Development Setup
60
+
61
+ ### Prerequisites
62
+
63
+ - Python 3.10+
64
+ - Git
65
+
66
+ ### Installation
67
+
68
+ ```bash
69
+ # Clone your fork
70
+ git clone https://github.com/YOUR_USERNAME/evaldeck-py.git evaldeck
71
+ cd evaldeck
72
+
73
+ # Create virtual environment
74
+ python -m venv venv
75
+ source venv/bin/activate # or `venv\Scripts\activate` on Windows
76
+
77
+ # Install in development mode
78
+ pip install -e ".[dev]"
79
+
80
+ # Install pre-commit hooks
81
+ pre-commit install
82
+ ```
83
+
84
+ ### Running Tests
85
+
86
+ ```bash
87
+ # Run all tests
88
+ pytest
89
+
90
+ # Run with coverage
91
+ pytest --cov=evaldeck
92
+
93
+ # Run specific test file
94
+ pytest tests/test_evaluator.py
95
+
96
+ # Run specific test
97
+ pytest tests/test_evaluator.py::test_basic_evaluation
98
+ ```
99
+
100
+ ### Code Quality
101
+
102
+ We use:
103
+ - **Ruff** for linting and formatting
104
+ - **mypy** for type checking
105
+
106
+ ```bash
107
+ # Lint
108
+ ruff check .
109
+
110
+ # Format
111
+ ruff format .
112
+
113
+ # Type check
114
+ mypy src/
115
+ ```
116
+
117
+ ## Coding Standards
118
+
119
+ ### Style
120
+
121
+ - Follow PEP 8
122
+ - Use type hints for all public functions
123
+ - Maximum line length: 100 characters
124
+ - Use descriptive variable and function names
125
+
126
+ ### Documentation
127
+
128
+ - Add docstrings to public functions and classes
129
+ - Update README.md if adding user-facing features
130
+ - Add inline comments for complex logic
131
+
132
+ ### Testing
133
+
134
+ - Write tests for all new functionality
135
+ - Maintain or improve test coverage
136
+ - Use descriptive test names: `test_evaluator_returns_failure_when_tool_missing`
137
+
138
+ ### Commit Messages
139
+
140
+ Follow [Conventional Commits](https://www.conventionalcommits.org/):
141
+
142
+ ```
143
+ feat: add CrewAI integration
144
+ fix: handle empty trace gracefully
145
+ docs: update installation instructions
146
+ test: add tests for LLM grader
147
+ refactor: simplify metric calculation
148
+ chore: update dependencies
149
+ ```
150
+
151
+ ## Project Structure
152
+
153
+ ```
154
+ evaldeck/
155
+ ├── src/evaldeck/
156
+ │ ├── __init__.py # Public API exports
157
+ │ ├── cli.py # CLI commands
158
+ │ ├── config.py # Configuration loading
159
+ │ ├── evaluator.py # Main evaluation engine
160
+ │ ├── trace.py # Trace data models
161
+ │ ├── test_case.py # Test case data models
162
+ │ ├── graders/ # Grader implementations
163
+ │ │ ├── __init__.py
164
+ │ │ ├── base.py
165
+ │ │ ├── code.py
166
+ │ │ └── llm.py
167
+ │ ├── metrics/ # Metric implementations
168
+ │ │ ├── __init__.py
169
+ │ │ └── ...
170
+ │ └── integrations/ # Framework adapters
171
+ │ ├── __init__.py
172
+ │ └── opentelemetry.py
173
+ ├── tests/
174
+ │ ├── conftest.py
175
+ │ ├── test_evaluator.py
176
+ │ └── ...
177
+ ├── examples/
178
+ │ └── ...
179
+ └── docs/
180
+ └── ...
181
+ ```
182
+
183
+ ## Adding a New Integration
184
+
185
+ To add support for a new agent framework:
186
+
187
+ 1. Create `src/evaldeck/integrations/your_framework.py`
188
+ 2. Implement a tracer/adapter that captures execution into `Trace` format
189
+ 3. Add optional dependency to `pyproject.toml`
190
+ 4. Add tests in `tests/integrations/test_your_framework.py`
191
+ 5. Update README.md with usage example
192
+ 6. Add example in `examples/`
193
+
194
+ ## Adding a New Grader
195
+
196
+ To add a new grader type:
197
+
198
+ 1. Create grader class inheriting from `BaseGrader`
199
+ 2. Implement `grade(trace, test_case) -> GradeResult`
200
+ 3. Add tests
201
+ 4. Export from `evaldeck.graders`
202
+ 5. Document in README.md
203
+
204
+ ## Adding a New Metric
205
+
206
+ To add a new metric:
207
+
208
+ 1. Create metric class inheriting from `BaseMetric`
209
+ 2. Implement `calculate(trace, test_case) -> MetricResult`
210
+ 3. Add tests
211
+ 4. Export from `evaldeck.metrics`
212
+ 5. Document in README.md
213
+
214
+ ## Getting Help
215
+
216
+ - Open a [Discussion](https://github.com/tantra-run/evaldeck-py/discussions) for questions
217
+ - Join our Discord (coming soon)
218
+ - Tag maintainers on complex issues
219
+
220
+ ## Recognition
221
+
222
+ Contributors will be recognized in:
223
+ - CONTRIBUTORS.md
224
+ - Release notes
225
+ - Project documentation
226
+
227
+ Thank you for contributing to Evaldeck!