evaldeck 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaldeck-0.1.0/.claude/settings.local.json +31 -0
- evaldeck-0.1.0/.devcontainer/Dockerfile +30 -0
- evaldeck-0.1.0/.devcontainer/devcontainer.json +60 -0
- evaldeck-0.1.0/.github/workflows/ci.yaml +65 -0
- evaldeck-0.1.0/.github/workflows/docs.yaml +50 -0
- evaldeck-0.1.0/.github/workflows/publish.yaml +28 -0
- evaldeck-0.1.0/.gitignore +93 -0
- evaldeck-0.1.0/.pre-commit-config.yaml +27 -0
- evaldeck-0.1.0/CONTRIBUTING.md +227 -0
- evaldeck-0.1.0/LICENSE +190 -0
- evaldeck-0.1.0/PKG-INFO +363 -0
- evaldeck-0.1.0/README.md +312 -0
- evaldeck-0.1.0/docs/api/config.md +7 -0
- evaldeck-0.1.0/docs/api/evalcase.md +29 -0
- evaldeck-0.1.0/docs/api/evaluation-result.md +23 -0
- evaldeck-0.1.0/docs/api/evaluator.md +15 -0
- evaldeck-0.1.0/docs/api/grade-result.md +20 -0
- evaldeck-0.1.0/docs/api/graders/base.md +15 -0
- evaldeck-0.1.0/docs/api/graders/code.md +71 -0
- evaldeck-0.1.0/docs/api/graders/llm.md +17 -0
- evaldeck-0.1.0/docs/api/index.md +86 -0
- evaldeck-0.1.0/docs/api/metrics.md +48 -0
- evaldeck-0.1.0/docs/api/step.md +7 -0
- evaldeck-0.1.0/docs/api/trace.md +39 -0
- evaldeck-0.1.0/docs/concepts/architecture.md +279 -0
- evaldeck-0.1.0/docs/concepts/evaluation-workflow.md +376 -0
- evaldeck-0.1.0/docs/concepts/grading-strategies.md +318 -0
- evaldeck-0.1.0/docs/concepts/index.md +147 -0
- evaldeck-0.1.0/docs/concepts/traces.md +361 -0
- evaldeck-0.1.0/docs/contributing/adding-graders.md +331 -0
- evaldeck-0.1.0/docs/contributing/adding-integrations.md +245 -0
- evaldeck-0.1.0/docs/contributing/adding-metrics.md +299 -0
- evaldeck-0.1.0/docs/contributing/code-standards.md +287 -0
- evaldeck-0.1.0/docs/contributing/index.md +56 -0
- evaldeck-0.1.0/docs/contributing/setup.md +220 -0
- evaldeck-0.1.0/docs/examples/basic-usage.md +248 -0
- evaldeck-0.1.0/docs/examples/index.md +97 -0
- evaldeck-0.1.0/docs/examples/langchain-agent.md +344 -0
- evaldeck-0.1.0/docs/examples/llm-judge.md +322 -0
- evaldeck-0.1.0/docs/examples/tool-calls.md +232 -0
- evaldeck-0.1.0/docs/getting-started/first-evaluation.md +287 -0
- evaldeck-0.1.0/docs/getting-started/index.md +74 -0
- evaldeck-0.1.0/docs/getting-started/installation.md +177 -0
- evaldeck-0.1.0/docs/getting-started/quickstart.md +183 -0
- evaldeck-0.1.0/docs/includes/abbreviations.md +8 -0
- evaldeck-0.1.0/docs/index.md +146 -0
- evaldeck-0.1.0/docs/stylesheets/extra.css +106 -0
- evaldeck-0.1.0/docs/user-guide/ci-cd.md +411 -0
- evaldeck-0.1.0/docs/user-guide/cli.md +293 -0
- evaldeck-0.1.0/docs/user-guide/configuration.md +301 -0
- evaldeck-0.1.0/docs/user-guide/graders/code-based.md +367 -0
- evaldeck-0.1.0/docs/user-guide/graders/custom.md +339 -0
- evaldeck-0.1.0/docs/user-guide/graders/index.md +194 -0
- evaldeck-0.1.0/docs/user-guide/graders/llm-based.md +327 -0
- evaldeck-0.1.0/docs/user-guide/index.md +176 -0
- evaldeck-0.1.0/docs/user-guide/integrations/index.md +221 -0
- evaldeck-0.1.0/docs/user-guide/integrations/manual.md +322 -0
- evaldeck-0.1.0/docs/user-guide/integrations/opentelemetry.md +226 -0
- evaldeck-0.1.0/docs/user-guide/metrics.md +331 -0
- evaldeck-0.1.0/docs/user-guide/test-cases.md +447 -0
- evaldeck-0.1.0/examples/basic_usage.py +117 -0
- evaldeck-0.1.0/examples/langchain_react_agent.py +343 -0
- evaldeck-0.1.0/mkdocs.yml +192 -0
- evaldeck-0.1.0/pyproject.toml +102 -0
- evaldeck-0.1.0/src/evaldeck/__init__.py +88 -0
- evaldeck-0.1.0/src/evaldeck/cli.py +324 -0
- evaldeck-0.1.0/src/evaldeck/config.py +223 -0
- evaldeck-0.1.0/src/evaldeck/evaluator.py +566 -0
- evaldeck-0.1.0/src/evaldeck/graders/__init__.py +36 -0
- evaldeck-0.1.0/src/evaldeck/graders/base.py +146 -0
- evaldeck-0.1.0/src/evaldeck/graders/code.py +484 -0
- evaldeck-0.1.0/src/evaldeck/graders/llm.py +344 -0
- evaldeck-0.1.0/src/evaldeck/integrations/__init__.py +29 -0
- evaldeck-0.1.0/src/evaldeck/integrations/opentelemetry.py +416 -0
- evaldeck-0.1.0/src/evaldeck/metrics/__init__.py +25 -0
- evaldeck-0.1.0/src/evaldeck/metrics/base.py +62 -0
- evaldeck-0.1.0/src/evaldeck/metrics/builtin.py +195 -0
- evaldeck-0.1.0/src/evaldeck/results.py +211 -0
- evaldeck-0.1.0/src/evaldeck/test_case.py +162 -0
- evaldeck-0.1.0/src/evaldeck/trace.py +215 -0
- evaldeck-0.1.0/tests/__init__.py +1 -0
- evaldeck-0.1.0/tests/conftest.py +52 -0
- evaldeck-0.1.0/tests/test_evaluator.py +429 -0
- evaldeck-0.1.0/tests/test_graders.py +247 -0
- evaldeck-0.1.0/tests/test_trace.py +115 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python -m pytest:*)",
|
|
5
|
+
"Bash(grep:*)",
|
|
6
|
+
"Bash(python:*)",
|
|
7
|
+
"Bash(python3:*)",
|
|
8
|
+
"Bash(PYTHONPATH=src python3:*)",
|
|
9
|
+
"Bash(PYTHONPATH=src /opt/homebrew/bin/python3.11:*)",
|
|
10
|
+
"Bash(wc:*)",
|
|
11
|
+
"WebFetch(domain:docs.langchain.com)",
|
|
12
|
+
"WebSearch",
|
|
13
|
+
"WebFetch(domain:arize-ai.github.io)",
|
|
14
|
+
"WebFetch(domain:docs.arize.com)",
|
|
15
|
+
"WebFetch(domain:arize.com)",
|
|
16
|
+
"WebFetch(domain:github.com)",
|
|
17
|
+
"Bash(ls:*)",
|
|
18
|
+
"Bash(find:*)",
|
|
19
|
+
"Bash(pip install:*)",
|
|
20
|
+
"Bash(pip3 install:*)",
|
|
21
|
+
"Bash(mkdocs serve --help:*)",
|
|
22
|
+
"Bash(git init:*)",
|
|
23
|
+
"Bash(git add:*)",
|
|
24
|
+
"Bash(git commit:*)",
|
|
25
|
+
"Bash(git branch:*)",
|
|
26
|
+
"Bash(git remote add:*)",
|
|
27
|
+
"Bash(git push:*)",
|
|
28
|
+
"Bash(ruff check:*)"
|
|
29
|
+
]
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bullseye
|
|
2
|
+
|
|
3
|
+
# Remove yarn repo (has expired GPG key) and install system dependencies
|
|
4
|
+
RUN rm -f /etc/apt/sources.list.d/yarn.list && \
|
|
5
|
+
apt-get update && apt-get install -y --no-install-recommends \
|
|
6
|
+
build-essential \
|
|
7
|
+
curl \
|
|
8
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
9
|
+
|
|
10
|
+
# Set working directory
|
|
11
|
+
WORKDIR /workspaces/evaldeck
|
|
12
|
+
|
|
13
|
+
# Upgrade pip
|
|
14
|
+
# --no-cache-dir keeps pip's download cache out of this image layer (matches the tool install below)
RUN pip install --no-cache-dir --upgrade pip
|
|
15
|
+
|
|
16
|
+
# Install Python development tools
|
|
17
|
+
RUN pip install --no-cache-dir \
|
|
18
|
+
ruff \
|
|
19
|
+
mypy \
|
|
20
|
+
pytest \
|
|
21
|
+
pytest-asyncio \
|
|
22
|
+
pre-commit \
|
|
23
|
+
build \
|
|
24
|
+
twine
|
|
25
|
+
|
|
26
|
+
# Create directory for evaldeck output
|
|
27
|
+
RUN mkdir -p /workspaces/evaldeck/.evaldeck
|
|
28
|
+
|
|
29
|
+
# Set Python path
|
|
30
|
+
# Append an inherited PYTHONPATH only when one is set, so an unset value does not leave a trailing ':' (an empty path entry)
ENV PYTHONPATH="/workspaces/evaldeck/src${PYTHONPATH:+:${PYTHONPATH}}"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "Evaldeck Development",
|
|
3
|
+
"build": {
|
|
4
|
+
"dockerfile": "Dockerfile",
|
|
5
|
+
"context": ".."
|
|
6
|
+
},
|
|
7
|
+
"features": {
|
|
8
|
+
"ghcr.io/devcontainers/features/git:1": {},
|
|
9
|
+
"ghcr.io/devcontainers/features/github-cli:1": {}
|
|
10
|
+
},
|
|
11
|
+
"customizations": {
|
|
12
|
+
"vscode": {
|
|
13
|
+
"extensions": [
|
|
14
|
+
"ms-python.python",
|
|
15
|
+
"ms-python.vscode-pylance",
|
|
16
|
+
"charliermarsh.ruff",
|
|
17
|
+
"tamasfe.even-better-toml",
|
|
18
|
+
"redhat.vscode-yaml",
|
|
19
|
+
"github.copilot"
|
|
20
|
+
],
|
|
21
|
+
"settings": {
|
|
22
|
+
"python.defaultInterpreterPath": "/usr/local/bin/python",
|
|
23
|
+
"python.testing.pytestEnabled": true,
|
|
24
|
+
"python.testing.pytestArgs": [
|
|
25
|
+
"tests"
|
|
26
|
+
],
|
|
27
|
+
"editor.formatOnSave": true,
|
|
28
|
+
"editor.codeActionsOnSave": {
|
|
29
|
+
"source.organizeImports": "explicit",
|
|
30
|
+
"source.fixAll": "explicit"
|
|
31
|
+
},
|
|
32
|
+
"[python]": {
|
|
33
|
+
"editor.defaultFormatter": "charliermarsh.ruff"
|
|
34
|
+
},
|
|
35
|
+
"files.exclude": {
|
|
36
|
+
"**/__pycache__": true,
|
|
37
|
+
"**/*.pyc": true,
|
|
38
|
+
"**/.pytest_cache": true,
|
|
39
|
+
"**/.mypy_cache": true,
|
|
40
|
+
"**/.ruff_cache": true,
|
|
41
|
+
"**/*.egg-info": true
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"postCreateCommand": "pip install -e '.[dev,all,docs]' && (git rev-parse --git-dir > /dev/null 2>&1 && pre-commit install || echo 'Skipping pre-commit install (not a git repo)')",
|
|
47
|
+
"runArgs": [
|
|
48
|
+
"--env-file",
|
|
49
|
+
"${localWorkspaceFolder}/.env"
|
|
50
|
+
],
|
|
51
|
+
"forwardPorts": [],
|
|
52
|
+
"remoteUser": "vscode",
|
|
53
|
+
"mounts": [
|
|
54
|
+
"source=${localWorkspaceFolder}/.evaldeck,target=/workspaces/evaldeck/.evaldeck,type=bind,consistency=cached"
|
|
55
|
+
],
|
|
56
|
+
"remoteEnv": {
|
|
57
|
+
"PYTHONDONTWRITEBYTECODE": "1",
|
|
58
|
+
"PYTHONUNBUFFERED": "1"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
- name: Lint with ruff
|
|
30
|
+
run: |
|
|
31
|
+
ruff check src/ tests/
|
|
32
|
+
|
|
33
|
+
- name: Type check with mypy
|
|
34
|
+
run: |
|
|
35
|
+
mypy src/
|
|
36
|
+
|
|
37
|
+
- name: Run tests
|
|
38
|
+
run: |
|
|
39
|
+
pytest tests/ -v --tb=short
|
|
40
|
+
|
|
41
|
+
build:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
needs: test
|
|
44
|
+
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v4
|
|
47
|
+
|
|
48
|
+
- name: Set up Python
|
|
49
|
+
uses: actions/setup-python@v5
|
|
50
|
+
with:
|
|
51
|
+
python-version: "3.11"
|
|
52
|
+
|
|
53
|
+
- name: Install build dependencies
|
|
54
|
+
run: |
|
|
55
|
+
python -m pip install --upgrade pip
|
|
56
|
+
pip install build
|
|
57
|
+
|
|
58
|
+
- name: Build package
|
|
59
|
+
run: |
|
|
60
|
+
python -m build
|
|
61
|
+
|
|
62
|
+
- name: Check package
|
|
63
|
+
run: |
|
|
64
|
+
pip install twine
|
|
65
|
+
twine check dist/*
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: Deploy Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
pages: write
|
|
12
|
+
id-token: write
|
|
13
|
+
|
|
14
|
+
concurrency:
|
|
15
|
+
group: "pages"
|
|
16
|
+
cancel-in-progress: false
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
build:
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
steps:
|
|
22
|
+
- uses: actions/checkout@v4
|
|
23
|
+
|
|
24
|
+
- name: Set up Python
|
|
25
|
+
uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.11"
|
|
28
|
+
|
|
29
|
+
- name: Install dependencies
|
|
30
|
+
run: |
|
|
31
|
+
pip install -e ".[docs]"
|
|
32
|
+
|
|
33
|
+
- name: Build docs
|
|
34
|
+
run: mkdocs build
|
|
35
|
+
|
|
36
|
+
- name: Upload artifact
|
|
37
|
+
uses: actions/upload-pages-artifact@v3
|
|
38
|
+
with:
|
|
39
|
+
path: site/
|
|
40
|
+
|
|
41
|
+
deploy:
|
|
42
|
+
environment:
|
|
43
|
+
name: github-pages
|
|
44
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
45
|
+
runs-on: ubuntu-latest
|
|
46
|
+
needs: build
|
|
47
|
+
steps:
|
|
48
|
+
- name: Deploy to GitHub Pages
|
|
49
|
+
id: deployment
|
|
50
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: pypi
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write # Required for trusted publishing
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.11"
|
|
20
|
+
|
|
21
|
+
- name: Install build tools
|
|
22
|
+
run: pip install build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
.project
|
|
69
|
+
.pydevproject
|
|
70
|
+
.settings/
|
|
71
|
+
|
|
72
|
+
# mypy
|
|
73
|
+
.mypy_cache/
|
|
74
|
+
.dmypy.json
|
|
75
|
+
dmypy.json
|
|
76
|
+
|
|
77
|
+
# ruff
|
|
78
|
+
.ruff_cache/
|
|
79
|
+
|
|
80
|
+
# Evaldeck specific
|
|
81
|
+
.evaldeck/
|
|
82
|
+
evaldeck_results/
|
|
83
|
+
*.evaldeck.json
|
|
84
|
+
|
|
85
|
+
# Jupyter
|
|
86
|
+
.ipynb_checkpoints/
|
|
87
|
+
|
|
88
|
+
# OS
|
|
89
|
+
.DS_Store
|
|
90
|
+
Thumbs.db
|
|
91
|
+
|
|
92
|
+
# Internal
|
|
93
|
+
internal/
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v4.5.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
- id: check-merge-conflict
|
|
10
|
+
|
|
11
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
12
|
+
rev: v0.1.9
|
|
13
|
+
hooks:
|
|
14
|
+
- id: ruff
|
|
15
|
+
args: [--fix]
|
|
16
|
+
- id: ruff-format
|
|
17
|
+
|
|
18
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
19
|
+
rev: v1.8.0
|
|
20
|
+
hooks:
|
|
21
|
+
- id: mypy
|
|
22
|
+
additional_dependencies:
|
|
23
|
+
- pydantic>=2.0
|
|
24
|
+
- click>=8.0
|
|
25
|
+
- types-PyYAML
|
|
26
|
+
args: [--ignore-missing-imports]
|
|
27
|
+
files: ^src/
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# Contributing to Evaldeck
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to Evaldeck! This document provides guidelines and instructions for contributing.
|
|
4
|
+
|
|
5
|
+
## Code of Conduct
|
|
6
|
+
|
|
7
|
+
By participating in this project, you agree to maintain a respectful and inclusive environment for everyone.
|
|
8
|
+
|
|
9
|
+
## How to Contribute
|
|
10
|
+
|
|
11
|
+
### Reporting Bugs
|
|
12
|
+
|
|
13
|
+
Before submitting a bug report:
|
|
14
|
+
1. Check existing issues to avoid duplicates
|
|
15
|
+
2. Use the latest version of Evaldeck
|
|
16
|
+
3. Collect relevant information (Python version, OS, stack trace)
|
|
17
|
+
|
|
18
|
+
When submitting a bug report, include:
|
|
19
|
+
- A clear, descriptive title
|
|
20
|
+
- Steps to reproduce the issue
|
|
21
|
+
- Expected vs actual behavior
|
|
22
|
+
- Code samples if applicable
|
|
23
|
+
- Environment details
|
|
24
|
+
|
|
25
|
+
### Suggesting Features
|
|
26
|
+
|
|
27
|
+
Feature requests are welcome! Please:
|
|
28
|
+
1. Check existing issues and discussions first
|
|
29
|
+
2. Describe the use case and problem you're trying to solve
|
|
30
|
+
3. Explain how the feature would work
|
|
31
|
+
4. Consider if it fits Evaldeck's scope (agent evaluation)
|
|
32
|
+
|
|
33
|
+
### Pull Requests
|
|
34
|
+
|
|
35
|
+
1. **Fork and clone** the repository
|
|
36
|
+
2. **Create a branch** from `main`:
|
|
37
|
+
```bash
|
|
38
|
+
git checkout -b feature/your-feature-name
|
|
39
|
+
```
|
|
40
|
+
3. **Set up development environment**:
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e ".[dev]"
|
|
43
|
+
pre-commit install
|
|
44
|
+
```
|
|
45
|
+
4. **Make your changes** following our coding standards
|
|
46
|
+
5. **Add tests** for new functionality
|
|
47
|
+
6. **Run the test suite**:
|
|
48
|
+
```bash
|
|
49
|
+
pytest
|
|
50
|
+
ruff check .
|
|
51
|
+
mypy src/
|
|
52
|
+
```
|
|
53
|
+
7. **Commit your changes** with a clear message:
|
|
54
|
+
```bash
|
|
55
|
+
git commit -m "feat: add support for X"
|
|
56
|
+
```
|
|
57
|
+
8. **Push and open a PR** against `main`
|
|
58
|
+
|
|
59
|
+
## Development Setup
|
|
60
|
+
|
|
61
|
+
### Prerequisites
|
|
62
|
+
|
|
63
|
+
- Python 3.10+
|
|
64
|
+
- Git
|
|
65
|
+
|
|
66
|
+
### Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Clone your fork
|
|
70
|
+
git clone https://github.com/YOUR_USERNAME/evaldeck.git
|
|
71
|
+
cd evaldeck
|
|
72
|
+
|
|
73
|
+
# Create virtual environment
|
|
74
|
+
python -m venv venv
|
|
75
|
+
source venv/bin/activate # or `venv\Scripts\activate` on Windows
|
|
76
|
+
|
|
77
|
+
# Install in development mode
|
|
78
|
+
pip install -e ".[dev]"
|
|
79
|
+
|
|
80
|
+
# Install pre-commit hooks
|
|
81
|
+
pre-commit install
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Running Tests
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Run all tests
|
|
88
|
+
pytest
|
|
89
|
+
|
|
90
|
+
# Run with coverage
|
|
91
|
+
pytest --cov=evaldeck
|
|
92
|
+
|
|
93
|
+
# Run specific test file
|
|
94
|
+
pytest tests/test_evaluator.py
|
|
95
|
+
|
|
96
|
+
# Run specific test
|
|
97
|
+
pytest tests/test_evaluator.py::test_basic_evaluation
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Code Quality
|
|
101
|
+
|
|
102
|
+
We use:
|
|
103
|
+
- **Ruff** for linting and formatting
|
|
104
|
+
- **mypy** for type checking
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Lint
|
|
108
|
+
ruff check .
|
|
109
|
+
|
|
110
|
+
# Format
|
|
111
|
+
ruff format .
|
|
112
|
+
|
|
113
|
+
# Type check
|
|
114
|
+
mypy src/
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Coding Standards
|
|
118
|
+
|
|
119
|
+
### Style
|
|
120
|
+
|
|
121
|
+
- Follow PEP 8
|
|
122
|
+
- Use type hints for all public functions
|
|
123
|
+
- Maximum line length: 100 characters
|
|
124
|
+
- Use descriptive variable and function names
|
|
125
|
+
|
|
126
|
+
### Documentation
|
|
127
|
+
|
|
128
|
+
- Add docstrings to public functions and classes
|
|
129
|
+
- Update README.md if adding user-facing features
|
|
130
|
+
- Add inline comments for complex logic
|
|
131
|
+
|
|
132
|
+
### Testing
|
|
133
|
+
|
|
134
|
+
- Write tests for all new functionality
|
|
135
|
+
- Maintain or improve test coverage
|
|
136
|
+
- Use descriptive test names: `test_evaluator_returns_failure_when_tool_missing`
|
|
137
|
+
|
|
138
|
+
### Commit Messages
|
|
139
|
+
|
|
140
|
+
Follow [Conventional Commits](https://www.conventionalcommits.org/):
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
feat: add CrewAI integration
|
|
144
|
+
fix: handle empty trace gracefully
|
|
145
|
+
docs: update installation instructions
|
|
146
|
+
test: add tests for LLM grader
|
|
147
|
+
refactor: simplify metric calculation
|
|
148
|
+
chore: update dependencies
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Project Structure
|
|
152
|
+
|
|
153
|
+
```
|
|
154
|
+
evaldeck/
|
|
155
|
+
├── src/evaldeck/
|
|
156
|
+
│ ├── __init__.py # Public API exports
|
|
157
|
+
│ ├── cli.py # CLI commands
|
|
158
|
+
│ ├── config.py # Configuration loading
|
|
159
|
+
│ ├── evaluator.py # Main evaluation engine
|
|
160
|
+
│ ├── trace.py # Trace data models
|
|
161
|
+
│ ├── test_case.py # Test case data models
|
|
162
|
+
│ ├── graders/ # Grader implementations
|
|
163
|
+
│ │ ├── __init__.py
|
|
164
|
+
│ │ ├── base.py
|
|
165
|
+
│ │ ├── code.py
|
|
166
|
+
│ │ └── llm.py
|
|
167
|
+
│ ├── metrics/ # Metric implementations
|
|
168
|
+
│ │ ├── __init__.py
|
|
169
|
+
│ │ └── ...
|
|
170
|
+
│ └── integrations/ # Framework adapters
|
|
171
|
+
│ ├── __init__.py
|
|
172
|
+
│ └── opentelemetry.py
|
|
173
|
+
├── tests/
|
|
174
|
+
│ ├── conftest.py
|
|
175
|
+
│ ├── test_evaluator.py
|
|
176
|
+
│ └── ...
|
|
177
|
+
├── examples/
|
|
178
|
+
│ └── ...
|
|
179
|
+
└── docs/
|
|
180
|
+
└── ...
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Adding a New Integration
|
|
184
|
+
|
|
185
|
+
To add support for a new agent framework:
|
|
186
|
+
|
|
187
|
+
1. Create `src/evaldeck/integrations/your_framework.py`
|
|
188
|
+
2. Implement a tracer/adapter that captures execution into `Trace` format
|
|
189
|
+
3. Add optional dependency to `pyproject.toml`
|
|
190
|
+
4. Add tests in `tests/integrations/test_your_framework.py`
|
|
191
|
+
5. Update README.md with usage example
|
|
192
|
+
6. Add example in `examples/`
|
|
193
|
+
|
|
194
|
+
## Adding a New Grader
|
|
195
|
+
|
|
196
|
+
To add a new grader type:
|
|
197
|
+
|
|
198
|
+
1. Create grader class inheriting from `BaseGrader`
|
|
199
|
+
2. Implement `grade(trace, test_case) -> GradeResult`
|
|
200
|
+
3. Add tests
|
|
201
|
+
4. Export from `evaldeck.graders`
|
|
202
|
+
5. Document in README.md
|
|
203
|
+
|
|
204
|
+
## Adding a New Metric
|
|
205
|
+
|
|
206
|
+
To add a new metric:
|
|
207
|
+
|
|
208
|
+
1. Create metric class inheriting from `BaseMetric`
|
|
209
|
+
2. Implement `calculate(trace, test_case) -> MetricResult`
|
|
210
|
+
3. Add tests
|
|
211
|
+
4. Export from `evaldeck.metrics`
|
|
212
|
+
5. Document in README.md
|
|
213
|
+
|
|
214
|
+
## Getting Help
|
|
215
|
+
|
|
216
|
+
- Open a [Discussion](https://github.com/tantra-run/evaldeck-py/discussions) for questions
|
|
217
|
+
- Join our Discord (coming soon)
|
|
218
|
+
- Tag maintainers on complex issues
|
|
219
|
+
|
|
220
|
+
## Recognition
|
|
221
|
+
|
|
222
|
+
Contributors will be recognized in:
|
|
223
|
+
- CONTRIBUTORS.md
|
|
224
|
+
- Release notes
|
|
225
|
+
- Project documentation
|
|
226
|
+
|
|
227
|
+
Thank you for contributing to Evaldeck!
|