agenteval-ai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +32 -0
- agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +17 -0
- agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/new_evaluator.yml +22 -0
- agenteval_ai-0.1.0/.github/pull_request_template.md +14 -0
- agenteval_ai-0.1.0/.github/workflows/ci.yml +45 -0
- agenteval_ai-0.1.0/.github/workflows/release.yml +28 -0
- agenteval_ai-0.1.0/.gitignore +22 -0
- agenteval_ai-0.1.0/.python-version +1 -0
- agenteval_ai-0.1.0/CHANGELOG.md +35 -0
- agenteval_ai-0.1.0/CONTRIBUTING.md +211 -0
- agenteval_ai-0.1.0/LICENSE +21 -0
- agenteval_ai-0.1.0/Makefile +44 -0
- agenteval_ai-0.1.0/PKG-INFO +491 -0
- agenteval_ai-0.1.0/README.md +438 -0
- agenteval_ai-0.1.0/SECURITY.md +35 -0
- agenteval_ai-0.1.0/action.yml +99 -0
- agenteval_ai-0.1.0/docs/demo.gif +0 -0
- agenteval_ai-0.1.0/docs/demo.tape +15 -0
- agenteval_ai-0.1.0/docs/demo_sim.sh +92 -0
- agenteval_ai-0.1.0/docs/docs/guides/ci-cd.md +526 -0
- agenteval_ai-0.1.0/docs/docs/guides/custom-evaluators.md +568 -0
- agenteval_ai-0.1.0/docs/docs/guides/local-evals.md +323 -0
- agenteval_ai-0.1.0/docs/docs/guides/production-failures.md +442 -0
- agenteval_ai-0.1.0/docs/docs/guides/providers.md +252 -0
- agenteval_ai-0.1.0/docs/docs/index.md +59 -0
- agenteval_ai-0.1.0/docs/docs/quickstart.md +117 -0
- agenteval_ai-0.1.0/docs/docs/reference/cli.md +602 -0
- agenteval_ai-0.1.0/docs/docs/reference/evaluators.md +506 -0
- agenteval_ai-0.1.0/docs/docs/reference/interceptors.md +468 -0
- agenteval_ai-0.1.0/docs/mkdocs.yml +61 -0
- agenteval_ai-0.1.0/docs/report-preview.png +0 -0
- agenteval_ai-0.1.0/examples/bedrock_agent/agent.py +142 -0
- agenteval_ai-0.1.0/examples/bedrock_agent/conftest.py +16 -0
- agenteval_ai-0.1.0/examples/bedrock_agent/test_bedrock_agent.py +84 -0
- agenteval_ai-0.1.0/examples/langchain_agent/agent.py +50 -0
- agenteval_ai-0.1.0/examples/langchain_agent/conftest.py +17 -0
- agenteval_ai-0.1.0/examples/langchain_agent/test_langchain_agent.py +50 -0
- agenteval_ai-0.1.0/examples/ollama_local/agent.py +21 -0
- agenteval_ai-0.1.0/examples/ollama_local/conftest.py +11 -0
- agenteval_ai-0.1.0/examples/ollama_local/test_ollama_agent.py +58 -0
- agenteval_ai-0.1.0/examples/openai_agent/agent.py +132 -0
- agenteval_ai-0.1.0/examples/openai_agent/conftest.py +17 -0
- agenteval_ai-0.1.0/examples/openai_agent/test_openai_agent.py +83 -0
- agenteval_ai-0.1.0/examples/quickstart/conftest.py +14 -0
- agenteval_ai-0.1.0/examples/quickstart/test_hello.py +23 -0
- agenteval_ai-0.1.0/pyproject.toml +112 -0
- agenteval_ai-0.1.0/src/agenteval/__init__.py +24 -0
- agenteval_ai-0.1.0/src/agenteval/cli/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/cli/main.py +185 -0
- agenteval_ai-0.1.0/src/agenteval/cli/scaffold.py +224 -0
- agenteval_ai-0.1.0/src/agenteval/core/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/core/config.py +72 -0
- agenteval_ai-0.1.0/src/agenteval/core/eval_model.py +102 -0
- agenteval_ai-0.1.0/src/agenteval/core/models.py +113 -0
- agenteval_ai-0.1.0/src/agenteval/core/runner.py +161 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/__init__.py +54 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/base.py +51 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/context_utilization.py +90 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/convergence.py +46 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/cost.py +49 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/guardrail.py +89 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/hallucination.py +102 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/latency.py +52 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/llm_judge.py +31 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/loop_detector.py +122 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/output_structure.py +139 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/regression.py +57 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/security.py +80 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/similarity.py +67 -0
- agenteval_ai-0.1.0/src/agenteval/evaluators/tool_call.py +76 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/__init__.py +20 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/anthropic.py +151 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/base.py +57 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/bedrock.py +140 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/data/pricing.json +58 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/openai.py +128 -0
- agenteval_ai-0.1.0/src/agenteval/interceptors/pricing.py +41 -0
- agenteval_ai-0.1.0/src/agenteval/mcp/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/mcp/installer.py +89 -0
- agenteval_ai-0.1.0/src/agenteval/mcp/server.py +270 -0
- agenteval_ai-0.1.0/src/agenteval/providers/__init__.py +19 -0
- agenteval_ai-0.1.0/src/agenteval/providers/base.py +35 -0
- agenteval_ai-0.1.0/src/agenteval/providers/bedrock.py +117 -0
- agenteval_ai-0.1.0/src/agenteval/providers/ollama.py +41 -0
- agenteval_ai-0.1.0/src/agenteval/providers/openai.py +55 -0
- agenteval_ai-0.1.0/src/agenteval/py.typed +0 -0
- agenteval_ai-0.1.0/src/agenteval/pytest_plugin/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/pytest_plugin/_collector.py +31 -0
- agenteval_ai-0.1.0/src/agenteval/pytest_plugin/assertions.py +203 -0
- agenteval_ai-0.1.0/src/agenteval/pytest_plugin/fixtures.py +71 -0
- agenteval_ai-0.1.0/src/agenteval/pytest_plugin/plugin.py +171 -0
- agenteval_ai-0.1.0/src/agenteval/reporting/__init__.py +13 -0
- agenteval_ai-0.1.0/src/agenteval/reporting/base.py +14 -0
- agenteval_ai-0.1.0/src/agenteval/reporting/console.py +58 -0
- agenteval_ai-0.1.0/src/agenteval/reporting/html.py +487 -0
- agenteval_ai-0.1.0/src/agenteval/reporting/json.py +18 -0
- agenteval_ai-0.1.0/src/agenteval/skill/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/skill/adapters/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/skill/adapters/claude_code.py +21 -0
- agenteval_ai-0.1.0/src/agenteval/skill/adapters/copilot.py +34 -0
- agenteval_ai-0.1.0/src/agenteval/skill/adapters/cursor.py +24 -0
- agenteval_ai-0.1.0/src/agenteval/skill/adapters/windsurf.py +24 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/__init__.py +0 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/check_regression.md +15 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/cost_audit.md +15 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/eval_agent.md +27 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/explain_failure.md +15 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/generate_tests.md +15 -0
- agenteval_ai-0.1.0/src/agenteval/skill/core/security_audit.md +15 -0
- agenteval_ai-0.1.0/src/agenteval/skill/installer.py +36 -0
- agenteval_ai-0.1.0/tests/__init__.py +0 -0
- agenteval_ai-0.1.0/tests/agent_evals/conftest.py +22 -0
- agenteval_ai-0.1.0/tests/agent_evals/test_example.py +25 -0
- agenteval_ai-0.1.0/tests/fixtures/.gitkeep +0 -0
- agenteval_ai-0.1.0/tests/integration/__init__.py +0 -0
- agenteval_ai-0.1.0/tests/unit/__init__.py +0 -0
- agenteval_ai-0.1.0/tests/unit/test_anthropic_interceptor.py +25 -0
- agenteval_ai-0.1.0/tests/unit/test_assertions.py +96 -0
- agenteval_ai-0.1.0/tests/unit/test_bedrock_interceptor.py +36 -0
- agenteval_ai-0.1.0/tests/unit/test_cli.py +37 -0
- agenteval_ai-0.1.0/tests/unit/test_config.py +82 -0
- agenteval_ai-0.1.0/tests/unit/test_context_utilization_evaluator.py +57 -0
- agenteval_ai-0.1.0/tests/unit/test_convergence_evaluator.py +68 -0
- agenteval_ai-0.1.0/tests/unit/test_cost_evaluator.py +47 -0
- agenteval_ai-0.1.0/tests/unit/test_eval_model.py +62 -0
- agenteval_ai-0.1.0/tests/unit/test_eval_providers.py +127 -0
- agenteval_ai-0.1.0/tests/unit/test_evaluator_base.py +63 -0
- agenteval_ai-0.1.0/tests/unit/test_guardrail_evaluator.py +75 -0
- agenteval_ai-0.1.0/tests/unit/test_hallucination_evaluator.py +67 -0
- agenteval_ai-0.1.0/tests/unit/test_interceptor_base.py +48 -0
- agenteval_ai-0.1.0/tests/unit/test_latency_evaluator.py +47 -0
- agenteval_ai-0.1.0/tests/unit/test_llm_judge.py +54 -0
- agenteval_ai-0.1.0/tests/unit/test_loop_detector.py +91 -0
- agenteval_ai-0.1.0/tests/unit/test_mcp_installer.py +87 -0
- agenteval_ai-0.1.0/tests/unit/test_mcp_server.py +31 -0
- agenteval_ai-0.1.0/tests/unit/test_models.py +191 -0
- agenteval_ai-0.1.0/tests/unit/test_openai_interceptor.py +52 -0
- agenteval_ai-0.1.0/tests/unit/test_output_structure_evaluator.py +108 -0
- agenteval_ai-0.1.0/tests/unit/test_pricing.py +54 -0
- agenteval_ai-0.1.0/tests/unit/test_pytest_plugin.py +7 -0
- agenteval_ai-0.1.0/tests/unit/test_regression_evaluator.py +71 -0
- agenteval_ai-0.1.0/tests/unit/test_reporting.py +96 -0
- agenteval_ai-0.1.0/tests/unit/test_runner.py +45 -0
- agenteval_ai-0.1.0/tests/unit/test_scaffold.py +57 -0
- agenteval_ai-0.1.0/tests/unit/test_security_evaluator.py +97 -0
- agenteval_ai-0.1.0/tests/unit/test_similarity_evaluator.py +67 -0
- agenteval_ai-0.1.0/tests/unit/test_skill_installer.py +36 -0
- agenteval_ai-0.1.0/tests/unit/test_tool_call_evaluator.py +98 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Report a bug in agenteval
|
|
3
|
+
labels: ["bug"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: description
|
|
7
|
+
attributes:
|
|
8
|
+
label: Description
|
|
9
|
+
description: What happened?
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
- type: textarea
|
|
13
|
+
id: reproduction
|
|
14
|
+
attributes:
|
|
15
|
+
label: Steps to reproduce
|
|
16
|
+
description: Minimal code or steps to reproduce the issue
|
|
17
|
+
validations:
|
|
18
|
+
required: true
|
|
19
|
+
- type: textarea
|
|
20
|
+
id: expected
|
|
21
|
+
attributes:
|
|
22
|
+
label: Expected behavior
|
|
23
|
+
- type: input
|
|
24
|
+
id: version
|
|
25
|
+
attributes:
|
|
26
|
+
label: agenteval version
|
|
27
|
+
placeholder: "0.1.0"
|
|
28
|
+
- type: input
|
|
29
|
+
id: python
|
|
30
|
+
attributes:
|
|
31
|
+
label: Python version
|
|
32
|
+
placeholder: "3.12"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest a new feature
|
|
3
|
+
labels: ["enhancement"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: problem
|
|
7
|
+
attributes:
|
|
8
|
+
label: Problem
|
|
9
|
+
description: What problem does this solve?
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
- type: textarea
|
|
13
|
+
id: solution
|
|
14
|
+
attributes:
|
|
15
|
+
label: Proposed solution
|
|
16
|
+
validations:
|
|
17
|
+
required: true
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: New Evaluator Proposal
|
|
2
|
+
description: Propose a new built-in or community evaluator
|
|
3
|
+
labels: ["evaluator", "enhancement"]
|
|
4
|
+
body:
|
|
5
|
+
- type: input
|
|
6
|
+
id: name
|
|
7
|
+
attributes:
|
|
8
|
+
label: Evaluator name
|
|
9
|
+
placeholder: "toxicity"
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
- type: textarea
|
|
13
|
+
id: description
|
|
14
|
+
attributes:
|
|
15
|
+
label: What does it catch?
|
|
16
|
+
validations:
|
|
17
|
+
required: true
|
|
18
|
+
- type: textarea
|
|
19
|
+
id: implementation
|
|
20
|
+
attributes:
|
|
21
|
+
label: Implementation approach
|
|
22
|
+
description: How would this evaluator work?
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
run: uv python install ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Create venv and install
|
|
26
|
+
run: |
|
|
27
|
+
uv venv .venv
|
|
28
|
+
source .venv/bin/activate
|
|
29
|
+
uv pip install -e ".[all,dev]"
|
|
30
|
+
|
|
31
|
+
- name: Lint
|
|
32
|
+
run: .venv/bin/ruff check src/ tests/
|
|
33
|
+
|
|
34
|
+
- name: Type check
|
|
35
|
+
run: .venv/bin/mypy src/agenteval/
|
|
36
|
+
|
|
37
|
+
- name: Test
|
|
38
|
+
run: .venv/bin/pytest tests/unit/ -v --cov=agenteval --cov-report=xml --cov-report=term-missing
|
|
39
|
+
|
|
40
|
+
- name: Upload coverage
|
|
41
|
+
if: matrix.python-version == '3.12'
|
|
42
|
+
uses: codecov/codecov-action@v4
|
|
43
|
+
with:
|
|
44
|
+
file: ./coverage.xml
|
|
45
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: release
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
run: uv python install 3.12
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: uv build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# IDE
|
|
13
|
+
.idea/
|
|
14
|
+
.vscode/
|
|
15
|
+
.cursor/
|
|
16
|
+
|
|
17
|
+
# Generated reports
|
|
18
|
+
docs/report_*.html
|
|
19
|
+
docs/report_*.json
|
|
20
|
+
|
|
21
|
+
# Lock files
|
|
22
|
+
uv.lock
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-04-09
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Core data models: Trace, LLMCall, ToolCall, Turn, EvalResult, TestResult, SuiteResult
|
|
15
|
+
- InterceptorRegistry with auto-detection of installed SDKs
|
|
16
|
+
- 5 provider interceptors: OpenAI, AWS Bedrock, Google Vertex AI, Azure OpenAI, Anthropic
|
|
17
|
+
- PricingEngine with bundled pricing data for all providers
|
|
18
|
+
- 13 built-in evaluators:
|
|
19
|
+
- Structural: ToolCall, Cost, Latency, LoopDetector, OutputStructure
|
|
20
|
+
- Semantic: LLMJudge, Hallucination, Similarity
|
|
21
|
+
- Safety: Security, Guardrail
|
|
22
|
+
- Operational: Regression, Convergence, ContextUtilization
|
|
23
|
+
- Evaluator plugin interface with Python entry point discovery
|
|
24
|
+
- Eval model providers: OpenAI and Ollama ($0 local evals)
|
|
25
|
+
- pytest plugin with fixtures, markers, and CLI flags
|
|
26
|
+
- Trace convenience assertions (tool_called, no_loops, no_pii_leaked, etc.)
|
|
27
|
+
- CLI: `agenteval run`, `agenteval init`, `agenteval version`
|
|
28
|
+
- MCP server with 8 tools + `agenteval mcp serve/install`
|
|
29
|
+
- 6 cross-platform skills with adapters for Claude Code, Copilot, Cursor, Windsurf
|
|
30
|
+
- 3 report formats: console (rich), HTML, JSON
|
|
31
|
+
- GitHub Action with PR comment bot
|
|
32
|
+
- CI/CD: GitHub Actions for testing + PyPI publishing via Trusted Publisher
|
|
33
|
+
- 5 example projects: quickstart, OpenAI, Bedrock, LangChain, Ollama
|
|
34
|
+
- mkdocs documentation site
|
|
35
|
+
- CONTRIBUTING.md with evaluator/interceptor contribution guides
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# Contributing to agenteval
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing! agenteval is an open-source project and we welcome contributions of all kinds.
|
|
4
|
+
|
|
5
|
+
## Getting Started
|
|
6
|
+
|
|
7
|
+
### Development setup
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# Clone the repo
|
|
11
|
+
git clone https://github.com/devbrat-anand/agenteval.git
|
|
12
|
+
cd agenteval
|
|
13
|
+
|
|
14
|
+
# Install uv (if not already installed)
|
|
15
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
16
|
+
|
|
17
|
+
# Install dev dependencies
|
|
18
|
+
uv pip install -e ".[dev]" --system
|
|
19
|
+
|
|
20
|
+
# Verify setup
|
|
21
|
+
pytest tests/ -v
|
|
22
|
+
ruff check src/ tests/
|
|
23
|
+
mypy src/agenteval/
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Running tests
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# All tests
|
|
30
|
+
pytest tests/ -v
|
|
31
|
+
|
|
32
|
+
# Unit tests only
|
|
33
|
+
pytest tests/unit/ -v
|
|
34
|
+
|
|
35
|
+
# Specific test file
|
|
36
|
+
pytest tests/unit/test_models.py -v
|
|
37
|
+
|
|
38
|
+
# With coverage
|
|
39
|
+
pytest tests/ --cov=agenteval --cov-report=term -v
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Code quality
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Lint
|
|
46
|
+
ruff check src/ tests/
|
|
47
|
+
|
|
48
|
+
# Format
|
|
49
|
+
ruff format src/ tests/
|
|
50
|
+
|
|
51
|
+
# Type check
|
|
52
|
+
mypy src/agenteval/
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Contributing an Evaluator
|
|
56
|
+
|
|
57
|
+
Adding an evaluator is the most impactful way to contribute. Here's how:
|
|
58
|
+
|
|
59
|
+
### 1. Create the evaluator
|
|
60
|
+
|
|
61
|
+
Create `src/agenteval/evaluators/your_evaluator.py`:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
"""Your evaluator description."""
|
|
65
|
+
|
|
66
|
+
from __future__ import annotations
|
|
67
|
+
|
|
68
|
+
from agenteval.core.models import EvalResult, Trace
|
|
69
|
+
from agenteval.evaluators.base import Evaluator
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class YourEvaluator(Evaluator):
|
|
73
|
+
"""One-line description of what this evaluator checks."""
|
|
74
|
+
|
|
75
|
+
name = "your_evaluator"
|
|
76
|
+
|
|
77
|
+
def evaluate(self, trace: Trace, criteria: dict) -> EvalResult:
|
|
78
|
+
# Your evaluation logic
|
|
79
|
+
score = 1.0
|
|
80
|
+
passed = True
|
|
81
|
+
reason = "Check passed"
|
|
82
|
+
|
|
83
|
+
# Return an EvalResult
|
|
84
|
+
return EvalResult(
|
|
85
|
+
evaluator=self.name,
|
|
86
|
+
score=score,
|
|
87
|
+
passed=passed,
|
|
88
|
+
reason=reason,
|
|
89
|
+
details={},
|
|
90
|
+
)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### 2. Register the entry point
|
|
94
|
+
|
|
95
|
+
Add to `pyproject.toml`:
|
|
96
|
+
|
|
97
|
+
```toml
|
|
98
|
+
[project.entry-points."agenteval.evaluators"]
|
|
99
|
+
your_evaluator = "agenteval.evaluators.your_evaluator:YourEvaluator"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 3. Write tests
|
|
103
|
+
|
|
104
|
+
Create `tests/unit/test_your_evaluator.py`:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from agenteval.core.models import Trace, Turn
|
|
108
|
+
from agenteval.evaluators.your_evaluator import YourEvaluator
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _make_trace(**overrides) -> Trace:
|
|
112
|
+
defaults = {
|
|
113
|
+
"agent_name": "test", "input": "query", "output": "answer",
|
|
114
|
+
"turns": [], "total_cost_usd": 0.01, "total_latency_ms": 500,
|
|
115
|
+
"total_input_tokens": 100, "total_output_tokens": 50, "metadata": {},
|
|
116
|
+
}
|
|
117
|
+
defaults.update(overrides)
|
|
118
|
+
return Trace(**defaults)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_your_evaluator_passes():
|
|
122
|
+
trace = _make_trace(output="good response")
|
|
123
|
+
evaluator = YourEvaluator()
|
|
124
|
+
result = evaluator.evaluate(trace, {})
|
|
125
|
+
assert result.passed
|
|
126
|
+
assert result.score > 0.0
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_your_evaluator_fails():
|
|
130
|
+
trace = _make_trace(output="bad response")
|
|
131
|
+
evaluator = YourEvaluator()
|
|
132
|
+
result = evaluator.evaluate(trace, {})
|
|
133
|
+
assert not result.passed
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### 4. Submit a PR
|
|
137
|
+
|
|
138
|
+
- Run `pytest tests/ -v` — all tests pass
|
|
139
|
+
- Run `ruff check src/ tests/` — no lint errors
|
|
140
|
+
- Run `mypy src/agenteval/` — no type errors
|
|
141
|
+
- Create a PR with a clear description
|
|
142
|
+
|
|
143
|
+
## Contributing an Interceptor
|
|
144
|
+
|
|
145
|
+
Provider interceptors capture LLM calls at the SDK or transport level.
|
|
146
|
+
|
|
147
|
+
### 1. Create the interceptor
|
|
148
|
+
|
|
149
|
+
Create `src/agenteval/interceptors/your_provider.py`:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
"""Your provider interceptor."""
|
|
153
|
+
|
|
154
|
+
from __future__ import annotations
|
|
155
|
+
|
|
156
|
+
from agenteval.core.models import LLMCall
|
|
157
|
+
from agenteval.interceptors.base import Interceptor
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class YourProviderInterceptor(Interceptor):
|
|
161
|
+
"""Intercept calls to YourProvider SDK."""
|
|
162
|
+
|
|
163
|
+
name = "your_provider"
|
|
164
|
+
|
|
165
|
+
def install(self) -> None:
|
|
166
|
+
"""Monkey-patch or hook into the provider SDK."""
|
|
167
|
+
# Hook into the SDK here
|
|
168
|
+
pass
|
|
169
|
+
|
|
170
|
+
def uninstall(self) -> None:
|
|
171
|
+
"""Restore original SDK behavior."""
|
|
172
|
+
pass
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### 2. Register in InterceptorRegistry
|
|
176
|
+
|
|
177
|
+
Add the auto-detect logic in `src/agenteval/interceptors/base.py`.
|
|
178
|
+
|
|
179
|
+
### 3. Add pricing data
|
|
180
|
+
|
|
181
|
+
Add model pricing to `src/agenteval/interceptors/data/pricing.json`.
|
|
182
|
+
|
|
183
|
+
## Issue Labels
|
|
184
|
+
|
|
185
|
+
| Label | Use for |
|
|
186
|
+
|---|---|
|
|
187
|
+
| `bug` | Something is broken |
|
|
188
|
+
| `enhancement` | New feature or improvement |
|
|
189
|
+
| `evaluator` | New evaluator proposal |
|
|
190
|
+
| `good-first-issue` | Good for newcomers |
|
|
191
|
+
| `documentation` | Documentation improvements |
|
|
192
|
+
|
|
193
|
+
## Code Style
|
|
194
|
+
|
|
195
|
+
- Python 3.10+, type hints everywhere
|
|
196
|
+
- `ruff` for linting and formatting
|
|
197
|
+
- `mypy` strict mode on public API
|
|
198
|
+
- Docstrings on all public functions
|
|
199
|
+
- Tests for all new code (TDD preferred)
|
|
200
|
+
|
|
201
|
+
## Pull Request Checklist
|
|
202
|
+
|
|
203
|
+
- [ ] Tests added/updated
|
|
204
|
+
- [ ] `ruff check` passes
|
|
205
|
+
- [ ] `mypy` passes
|
|
206
|
+
- [ ] Documentation updated (if applicable)
|
|
207
|
+
- [ ] Commit messages follow conventional commits
|
|
208
|
+
|
|
209
|
+
## License
|
|
210
|
+
|
|
211
|
+
By contributing, you agree that your contributions will be licensed under the MIT License.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Devbrat Anand
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
.PHONY: install dev test test-unit lint typecheck format clean build publish docs docs-build help
|
|
2
|
+
|
|
3
|
+
install: ## Install agenteval with all extras
|
|
4
|
+
uv pip install -e ".[all]"
|
|
5
|
+
|
|
6
|
+
dev: ## Install with dev dependencies
|
|
7
|
+
uv pip install -e ".[all,dev]"
|
|
8
|
+
|
|
9
|
+
test: ## Run tests with coverage
|
|
10
|
+
pytest tests/ -v --cov=agenteval --cov-report=term-missing
|
|
11
|
+
|
|
12
|
+
test-unit: ## Run unit tests only
|
|
13
|
+
pytest tests/unit/ -v
|
|
14
|
+
|
|
15
|
+
lint: ## Run linter
|
|
16
|
+
ruff check src/ tests/
|
|
17
|
+
|
|
18
|
+
typecheck: ## Run type checker
|
|
19
|
+
mypy src/agenteval/
|
|
20
|
+
|
|
21
|
+
format: ## Format code
|
|
22
|
+
ruff format src/ tests/
|
|
23
|
+
ruff check --fix src/ tests/
|
|
24
|
+
|
|
25
|
+
clean: ## Remove build artifacts
|
|
26
|
+
rm -rf build/ dist/ *.egg-info .pytest_cache .mypy_cache .ruff_cache
|
|
27
|
+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|
|
28
|
+
|
|
29
|
+
build: ## Build package
|
|
30
|
+
uv build
|
|
31
|
+
|
|
32
|
+
publish: build ## Publish to PyPI (use trusted publisher in CI)
|
|
33
|
+
uv publish
|
|
34
|
+
|
|
35
|
+
docs: ## Serve docs locally
|
|
36
|
+
mkdocs serve
|
|
37
|
+
|
|
38
|
+
docs-build: ## Build docs
|
|
39
|
+
mkdocs build
|
|
40
|
+
|
|
41
|
+
help: ## Show this help
|
|
42
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'
|
|
43
|
+
|
|
44
|
+
.DEFAULT_GOAL := help
|