agenteval-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +32 -0
  2. agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +17 -0
  3. agenteval_ai-0.1.0/.github/ISSUE_TEMPLATE/new_evaluator.yml +22 -0
  4. agenteval_ai-0.1.0/.github/pull_request_template.md +14 -0
  5. agenteval_ai-0.1.0/.github/workflows/ci.yml +45 -0
  6. agenteval_ai-0.1.0/.github/workflows/release.yml +28 -0
  7. agenteval_ai-0.1.0/.gitignore +22 -0
  8. agenteval_ai-0.1.0/.python-version +1 -0
  9. agenteval_ai-0.1.0/CHANGELOG.md +35 -0
  10. agenteval_ai-0.1.0/CONTRIBUTING.md +211 -0
  11. agenteval_ai-0.1.0/LICENSE +21 -0
  12. agenteval_ai-0.1.0/Makefile +44 -0
  13. agenteval_ai-0.1.0/PKG-INFO +491 -0
  14. agenteval_ai-0.1.0/README.md +438 -0
  15. agenteval_ai-0.1.0/SECURITY.md +35 -0
  16. agenteval_ai-0.1.0/action.yml +99 -0
  17. agenteval_ai-0.1.0/docs/demo.gif +0 -0
  18. agenteval_ai-0.1.0/docs/demo.tape +15 -0
  19. agenteval_ai-0.1.0/docs/demo_sim.sh +92 -0
  20. agenteval_ai-0.1.0/docs/docs/guides/ci-cd.md +526 -0
  21. agenteval_ai-0.1.0/docs/docs/guides/custom-evaluators.md +568 -0
  22. agenteval_ai-0.1.0/docs/docs/guides/local-evals.md +323 -0
  23. agenteval_ai-0.1.0/docs/docs/guides/production-failures.md +442 -0
  24. agenteval_ai-0.1.0/docs/docs/guides/providers.md +252 -0
  25. agenteval_ai-0.1.0/docs/docs/index.md +59 -0
  26. agenteval_ai-0.1.0/docs/docs/quickstart.md +117 -0
  27. agenteval_ai-0.1.0/docs/docs/reference/cli.md +602 -0
  28. agenteval_ai-0.1.0/docs/docs/reference/evaluators.md +506 -0
  29. agenteval_ai-0.1.0/docs/docs/reference/interceptors.md +468 -0
  30. agenteval_ai-0.1.0/docs/mkdocs.yml +61 -0
  31. agenteval_ai-0.1.0/docs/report-preview.png +0 -0
  32. agenteval_ai-0.1.0/examples/bedrock_agent/agent.py +142 -0
  33. agenteval_ai-0.1.0/examples/bedrock_agent/conftest.py +16 -0
  34. agenteval_ai-0.1.0/examples/bedrock_agent/test_bedrock_agent.py +84 -0
  35. agenteval_ai-0.1.0/examples/langchain_agent/agent.py +50 -0
  36. agenteval_ai-0.1.0/examples/langchain_agent/conftest.py +17 -0
  37. agenteval_ai-0.1.0/examples/langchain_agent/test_langchain_agent.py +50 -0
  38. agenteval_ai-0.1.0/examples/ollama_local/agent.py +21 -0
  39. agenteval_ai-0.1.0/examples/ollama_local/conftest.py +11 -0
  40. agenteval_ai-0.1.0/examples/ollama_local/test_ollama_agent.py +58 -0
  41. agenteval_ai-0.1.0/examples/openai_agent/agent.py +132 -0
  42. agenteval_ai-0.1.0/examples/openai_agent/conftest.py +17 -0
  43. agenteval_ai-0.1.0/examples/openai_agent/test_openai_agent.py +83 -0
  44. agenteval_ai-0.1.0/examples/quickstart/conftest.py +14 -0
  45. agenteval_ai-0.1.0/examples/quickstart/test_hello.py +23 -0
  46. agenteval_ai-0.1.0/pyproject.toml +112 -0
  47. agenteval_ai-0.1.0/src/agenteval/__init__.py +24 -0
  48. agenteval_ai-0.1.0/src/agenteval/cli/__init__.py +0 -0
  49. agenteval_ai-0.1.0/src/agenteval/cli/main.py +185 -0
  50. agenteval_ai-0.1.0/src/agenteval/cli/scaffold.py +224 -0
  51. agenteval_ai-0.1.0/src/agenteval/core/__init__.py +0 -0
  52. agenteval_ai-0.1.0/src/agenteval/core/config.py +72 -0
  53. agenteval_ai-0.1.0/src/agenteval/core/eval_model.py +102 -0
  54. agenteval_ai-0.1.0/src/agenteval/core/models.py +113 -0
  55. agenteval_ai-0.1.0/src/agenteval/core/runner.py +161 -0
  56. agenteval_ai-0.1.0/src/agenteval/evaluators/__init__.py +54 -0
  57. agenteval_ai-0.1.0/src/agenteval/evaluators/base.py +51 -0
  58. agenteval_ai-0.1.0/src/agenteval/evaluators/context_utilization.py +90 -0
  59. agenteval_ai-0.1.0/src/agenteval/evaluators/convergence.py +46 -0
  60. agenteval_ai-0.1.0/src/agenteval/evaluators/cost.py +49 -0
  61. agenteval_ai-0.1.0/src/agenteval/evaluators/guardrail.py +89 -0
  62. agenteval_ai-0.1.0/src/agenteval/evaluators/hallucination.py +102 -0
  63. agenteval_ai-0.1.0/src/agenteval/evaluators/latency.py +52 -0
  64. agenteval_ai-0.1.0/src/agenteval/evaluators/llm_judge.py +31 -0
  65. agenteval_ai-0.1.0/src/agenteval/evaluators/loop_detector.py +122 -0
  66. agenteval_ai-0.1.0/src/agenteval/evaluators/output_structure.py +139 -0
  67. agenteval_ai-0.1.0/src/agenteval/evaluators/regression.py +57 -0
  68. agenteval_ai-0.1.0/src/agenteval/evaluators/security.py +80 -0
  69. agenteval_ai-0.1.0/src/agenteval/evaluators/similarity.py +67 -0
  70. agenteval_ai-0.1.0/src/agenteval/evaluators/tool_call.py +76 -0
  71. agenteval_ai-0.1.0/src/agenteval/interceptors/__init__.py +20 -0
  72. agenteval_ai-0.1.0/src/agenteval/interceptors/anthropic.py +151 -0
  73. agenteval_ai-0.1.0/src/agenteval/interceptors/base.py +57 -0
  74. agenteval_ai-0.1.0/src/agenteval/interceptors/bedrock.py +140 -0
  75. agenteval_ai-0.1.0/src/agenteval/interceptors/data/pricing.json +58 -0
  76. agenteval_ai-0.1.0/src/agenteval/interceptors/openai.py +128 -0
  77. agenteval_ai-0.1.0/src/agenteval/interceptors/pricing.py +41 -0
  78. agenteval_ai-0.1.0/src/agenteval/mcp/__init__.py +0 -0
  79. agenteval_ai-0.1.0/src/agenteval/mcp/installer.py +89 -0
  80. agenteval_ai-0.1.0/src/agenteval/mcp/server.py +270 -0
  81. agenteval_ai-0.1.0/src/agenteval/providers/__init__.py +19 -0
  82. agenteval_ai-0.1.0/src/agenteval/providers/base.py +35 -0
  83. agenteval_ai-0.1.0/src/agenteval/providers/bedrock.py +117 -0
  84. agenteval_ai-0.1.0/src/agenteval/providers/ollama.py +41 -0
  85. agenteval_ai-0.1.0/src/agenteval/providers/openai.py +55 -0
  86. agenteval_ai-0.1.0/src/agenteval/py.typed +0 -0
  87. agenteval_ai-0.1.0/src/agenteval/pytest_plugin/__init__.py +0 -0
  88. agenteval_ai-0.1.0/src/agenteval/pytest_plugin/_collector.py +31 -0
  89. agenteval_ai-0.1.0/src/agenteval/pytest_plugin/assertions.py +203 -0
  90. agenteval_ai-0.1.0/src/agenteval/pytest_plugin/fixtures.py +71 -0
  91. agenteval_ai-0.1.0/src/agenteval/pytest_plugin/plugin.py +171 -0
  92. agenteval_ai-0.1.0/src/agenteval/reporting/__init__.py +13 -0
  93. agenteval_ai-0.1.0/src/agenteval/reporting/base.py +14 -0
  94. agenteval_ai-0.1.0/src/agenteval/reporting/console.py +58 -0
  95. agenteval_ai-0.1.0/src/agenteval/reporting/html.py +487 -0
  96. agenteval_ai-0.1.0/src/agenteval/reporting/json.py +18 -0
  97. agenteval_ai-0.1.0/src/agenteval/skill/__init__.py +0 -0
  98. agenteval_ai-0.1.0/src/agenteval/skill/adapters/__init__.py +0 -0
  99. agenteval_ai-0.1.0/src/agenteval/skill/adapters/claude_code.py +21 -0
  100. agenteval_ai-0.1.0/src/agenteval/skill/adapters/copilot.py +34 -0
  101. agenteval_ai-0.1.0/src/agenteval/skill/adapters/cursor.py +24 -0
  102. agenteval_ai-0.1.0/src/agenteval/skill/adapters/windsurf.py +24 -0
  103. agenteval_ai-0.1.0/src/agenteval/skill/core/__init__.py +0 -0
  104. agenteval_ai-0.1.0/src/agenteval/skill/core/check_regression.md +15 -0
  105. agenteval_ai-0.1.0/src/agenteval/skill/core/cost_audit.md +15 -0
  106. agenteval_ai-0.1.0/src/agenteval/skill/core/eval_agent.md +27 -0
  107. agenteval_ai-0.1.0/src/agenteval/skill/core/explain_failure.md +15 -0
  108. agenteval_ai-0.1.0/src/agenteval/skill/core/generate_tests.md +15 -0
  109. agenteval_ai-0.1.0/src/agenteval/skill/core/security_audit.md +15 -0
  110. agenteval_ai-0.1.0/src/agenteval/skill/installer.py +36 -0
  111. agenteval_ai-0.1.0/tests/__init__.py +0 -0
  112. agenteval_ai-0.1.0/tests/agent_evals/conftest.py +22 -0
  113. agenteval_ai-0.1.0/tests/agent_evals/test_example.py +25 -0
  114. agenteval_ai-0.1.0/tests/fixtures/.gitkeep +0 -0
  115. agenteval_ai-0.1.0/tests/integration/__init__.py +0 -0
  116. agenteval_ai-0.1.0/tests/unit/__init__.py +0 -0
  117. agenteval_ai-0.1.0/tests/unit/test_anthropic_interceptor.py +25 -0
  118. agenteval_ai-0.1.0/tests/unit/test_assertions.py +96 -0
  119. agenteval_ai-0.1.0/tests/unit/test_bedrock_interceptor.py +36 -0
  120. agenteval_ai-0.1.0/tests/unit/test_cli.py +37 -0
  121. agenteval_ai-0.1.0/tests/unit/test_config.py +82 -0
  122. agenteval_ai-0.1.0/tests/unit/test_context_utilization_evaluator.py +57 -0
  123. agenteval_ai-0.1.0/tests/unit/test_convergence_evaluator.py +68 -0
  124. agenteval_ai-0.1.0/tests/unit/test_cost_evaluator.py +47 -0
  125. agenteval_ai-0.1.0/tests/unit/test_eval_model.py +62 -0
  126. agenteval_ai-0.1.0/tests/unit/test_eval_providers.py +127 -0
  127. agenteval_ai-0.1.0/tests/unit/test_evaluator_base.py +63 -0
  128. agenteval_ai-0.1.0/tests/unit/test_guardrail_evaluator.py +75 -0
  129. agenteval_ai-0.1.0/tests/unit/test_hallucination_evaluator.py +67 -0
  130. agenteval_ai-0.1.0/tests/unit/test_interceptor_base.py +48 -0
  131. agenteval_ai-0.1.0/tests/unit/test_latency_evaluator.py +47 -0
  132. agenteval_ai-0.1.0/tests/unit/test_llm_judge.py +54 -0
  133. agenteval_ai-0.1.0/tests/unit/test_loop_detector.py +91 -0
  134. agenteval_ai-0.1.0/tests/unit/test_mcp_installer.py +87 -0
  135. agenteval_ai-0.1.0/tests/unit/test_mcp_server.py +31 -0
  136. agenteval_ai-0.1.0/tests/unit/test_models.py +191 -0
  137. agenteval_ai-0.1.0/tests/unit/test_openai_interceptor.py +52 -0
  138. agenteval_ai-0.1.0/tests/unit/test_output_structure_evaluator.py +108 -0
  139. agenteval_ai-0.1.0/tests/unit/test_pricing.py +54 -0
  140. agenteval_ai-0.1.0/tests/unit/test_pytest_plugin.py +7 -0
  141. agenteval_ai-0.1.0/tests/unit/test_regression_evaluator.py +71 -0
  142. agenteval_ai-0.1.0/tests/unit/test_reporting.py +96 -0
  143. agenteval_ai-0.1.0/tests/unit/test_runner.py +45 -0
  144. agenteval_ai-0.1.0/tests/unit/test_scaffold.py +57 -0
  145. agenteval_ai-0.1.0/tests/unit/test_security_evaluator.py +97 -0
  146. agenteval_ai-0.1.0/tests/unit/test_similarity_evaluator.py +67 -0
  147. agenteval_ai-0.1.0/tests/unit/test_skill_installer.py +36 -0
  148. agenteval_ai-0.1.0/tests/unit/test_tool_call_evaluator.py +98 -0
@@ -0,0 +1,32 @@
1
+ name: Bug Report
2
+ description: Report a bug in agenteval
3
+ labels: ["bug"]
4
+ body:
5
+ - type: textarea
6
+ id: description
7
+ attributes:
8
+ label: Description
9
+ description: What happened?
10
+ validations:
11
+ required: true
12
+ - type: textarea
13
+ id: reproduction
14
+ attributes:
15
+ label: Steps to reproduce
16
+ description: Minimal code or steps to reproduce the issue
17
+ validations:
18
+ required: true
19
+ - type: textarea
20
+ id: expected
21
+ attributes:
22
+ label: Expected behavior
23
+ - type: input
24
+ id: version
25
+ attributes:
26
+ label: agenteval version
27
+ placeholder: "0.1.0"
28
+ - type: input
29
+ id: python
30
+ attributes:
31
+ label: Python version
32
+ placeholder: "3.12"
@@ -0,0 +1,17 @@
1
+ name: Feature Request
2
+ description: Suggest a new feature
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: Problem
9
+ description: What problem does this solve?
10
+ validations:
11
+ required: true
12
+ - type: textarea
13
+ id: solution
14
+ attributes:
15
+ label: Proposed solution
16
+ validations:
17
+ required: true
@@ -0,0 +1,22 @@
1
+ name: New Evaluator Proposal
2
+ description: Propose a new built-in or community evaluator
3
+ labels: ["evaluator", "enhancement"]
4
+ body:
5
+ - type: input
6
+ id: name
7
+ attributes:
8
+ label: Evaluator name
9
+ placeholder: "toxicity"
10
+ validations:
11
+ required: true
12
+ - type: textarea
13
+ id: description
14
+ attributes:
15
+ label: What does it catch?
16
+ validations:
17
+ required: true
18
+ - type: textarea
19
+ id: implementation
20
+ attributes:
21
+ label: Implementation approach
22
+ description: How would this evaluator work?
@@ -0,0 +1,14 @@
1
+ ## What
2
+
3
+ <!-- Brief description of changes -->
4
+
5
+ ## Why
6
+
7
+ <!-- What problem does this solve? -->
8
+
9
+ ## Checklist
10
+
11
+ - [ ] Tests added/updated
12
+ - [ ] `ruff check` passes
13
+ - [ ] `mypy` passes
14
+ - [ ] Documentation updated (if applicable)
@@ -0,0 +1,45 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ run: uv python install ${{ matrix.python-version }}
24
+
25
+ - name: Create venv and install
26
+ run: |
27
+ uv venv .venv
28
+ source .venv/bin/activate
29
+ uv pip install -e ".[all,dev]"
30
+
31
+ - name: Lint
32
+ run: .venv/bin/ruff check src/ tests/
33
+
34
+ - name: Type check
35
+ run: .venv/bin/mypy src/agenteval/
36
+
37
+ - name: Test
38
+ run: .venv/bin/pytest tests/unit/ -v --cov=agenteval --cov-report=xml --cov-report=term-missing
39
+
40
+ - name: Upload coverage
41
+ if: matrix.python-version == '3.12'
42
+ uses: codecov/codecov-action@v4
43
+ with:
44
+ file: ./coverage.xml
45
+ fail_ci_if_error: false
@@ -0,0 +1,28 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ environment: release
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v4
20
+
21
+ - name: Set up Python
22
+ run: uv python install 3.12
23
+
24
+ - name: Build package
25
+ run: uv build
26
+
27
+ - name: Publish to PyPI
28
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,22 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # IDE
13
+ .idea/
14
+ .vscode/
15
+ .cursor/
16
+
17
+ # Generated reports
18
+ docs/report_*.html
19
+ docs/report_*.json
20
+
21
+ # Lock files
22
+ uv.lock
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,35 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-04-09
11
+
12
+ ### Added
13
+
14
+ - Core data models: Trace, LLMCall, ToolCall, Turn, EvalResult, TestResult, SuiteResult
15
+ - InterceptorRegistry with auto-detection of installed SDKs
16
+ - 5 provider interceptors: OpenAI, AWS Bedrock, Google Vertex AI, Azure OpenAI, Anthropic
17
+ - PricingEngine with bundled pricing data for all providers
18
+ - 13 built-in evaluators:
19
+ - Structural: ToolCall, Cost, Latency, LoopDetector, OutputStructure
20
+ - Semantic: LLMJudge, Hallucination, Similarity
21
+ - Safety: Security, Guardrail
22
+ - Operational: Regression, Convergence, ContextUtilization
23
+ - Evaluator plugin interface with Python entry point discovery
24
+ - Eval model providers: OpenAI and Ollama ($0 local evals)
25
+ - pytest plugin with fixtures, markers, and CLI flags
26
+ - Trace convenience assertions (tool_called, no_loops, no_pii_leaked, etc.)
27
+ - CLI: `agenteval run`, `agenteval init`, `agenteval version`
28
+ - MCP server with 8 tools + `agenteval mcp serve/install`
29
+ - 6 cross-platform skills with adapters for Claude Code, Copilot, Cursor, Windsurf
30
+ - 3 report formats: console (rich), HTML, JSON
31
+ - GitHub Action with PR comment bot
32
+ - CI/CD: GitHub Actions for testing + PyPI publishing via Trusted Publisher
33
+ - 5 example projects: quickstart, OpenAI, Bedrock, LangChain, Ollama
34
+ - mkdocs documentation site
35
+ - CONTRIBUTING.md with evaluator/interceptor contribution guides
@@ -0,0 +1,211 @@
1
+ # Contributing to agenteval
2
+
3
+ Thanks for your interest in contributing! agenteval is an open-source project and we welcome contributions of all kinds.
4
+
5
+ ## Getting Started
6
+
7
+ ### Development setup
8
+
9
+ ```bash
10
+ # Clone the repo
11
+ git clone https://github.com/devbrat-anand/agenteval.git
12
+ cd agenteval
13
+
14
+ # Install uv (if not already installed)
15
+ curl -LsSf https://astral.sh/uv/install.sh | sh
16
+
17
+ # Install dev dependencies
18
+ uv pip install -e ".[dev]" --system
19
+
20
+ # Verify setup
21
+ pytest tests/ -v
22
+ ruff check src/ tests/
23
+ mypy src/agenteval/
24
+ ```
25
+
26
+ ### Running tests
27
+
28
+ ```bash
29
+ # All tests
30
+ pytest tests/ -v
31
+
32
+ # Unit tests only
33
+ pytest tests/unit/ -v
34
+
35
+ # Specific test file
36
+ pytest tests/unit/test_models.py -v
37
+
38
+ # With coverage
39
+ pytest tests/ --cov=agenteval --cov-report=term -v
40
+ ```
41
+
42
+ ### Code quality
43
+
44
+ ```bash
45
+ # Lint
46
+ ruff check src/ tests/
47
+
48
+ # Format
49
+ ruff format src/ tests/
50
+
51
+ # Type check
52
+ mypy src/agenteval/
53
+ ```
54
+
55
+ ## Contributing an Evaluator
56
+
57
+ Adding a new evaluator is the most impactful way to contribute. Here's how:
58
+
59
+ ### 1. Create the evaluator
60
+
61
+ Create `src/agenteval/evaluators/your_evaluator.py`:
62
+
63
+ ```python
64
+ """Your evaluator description."""
65
+
66
+ from __future__ import annotations
67
+
68
+ from agenteval.core.models import EvalResult, Trace
69
+ from agenteval.evaluators.base import Evaluator
70
+
71
+
72
+ class YourEvaluator(Evaluator):
73
+ """One-line description of what this evaluator checks."""
74
+
75
+ name = "your_evaluator"
76
+
77
+ def evaluate(self, trace: Trace, criteria: dict) -> EvalResult:
78
+ # Your evaluation logic
79
+ score = 1.0
80
+ passed = True
81
+ reason = "Check passed"
82
+
83
+ # Return an EvalResult
84
+ return EvalResult(
85
+ evaluator=self.name,
86
+ score=score,
87
+ passed=passed,
88
+ reason=reason,
89
+ details={},
90
+ )
91
+ ```
92
+
93
+ ### 2. Register the entry point
94
+
95
+ Add to `pyproject.toml`:
96
+
97
+ ```toml
98
+ [project.entry-points."agenteval.evaluators"]
99
+ your_evaluator = "agenteval.evaluators.your_evaluator:YourEvaluator"
100
+ ```
101
+
102
+ ### 3. Write tests
103
+
104
+ Create `tests/unit/test_your_evaluator.py`:
105
+
106
+ ```python
107
+ from agenteval.core.models import Trace, Turn
108
+ from agenteval.evaluators.your_evaluator import YourEvaluator
109
+
110
+
111
+ def _make_trace(**overrides) -> Trace:
112
+ defaults = {
113
+ "agent_name": "test", "input": "query", "output": "answer",
114
+ "turns": [], "total_cost_usd": 0.01, "total_latency_ms": 500,
115
+ "total_input_tokens": 100, "total_output_tokens": 50, "metadata": {},
116
+ }
117
+ defaults.update(overrides)
118
+ return Trace(**defaults)
119
+
120
+
121
+ def test_your_evaluator_passes():
122
+ trace = _make_trace(output="good response")
123
+ evaluator = YourEvaluator()
124
+ result = evaluator.evaluate(trace, {})
125
+ assert result.passed
126
+ assert result.score > 0.0
127
+
128
+
129
+ def test_your_evaluator_fails():
130
+ trace = _make_trace(output="bad response")
131
+ evaluator = YourEvaluator()
132
+ result = evaluator.evaluate(trace, {})
133
+ assert not result.passed
134
+ ```
135
+
136
+ ### 4. Submit a PR
137
+
138
+ - Run `pytest tests/ -v` — all tests pass
139
+ - Run `ruff check src/ tests/` — no lint errors
140
+ - Run `mypy src/agenteval/` — no type errors
141
+ - Create a PR with a clear description
142
+
143
+ ## Contributing an Interceptor
144
+
145
+ Provider interceptors capture LLM calls at the SDK or transport level.
146
+
147
+ ### 1. Create the interceptor
148
+
149
+ Create `src/agenteval/interceptors/your_provider.py`:
150
+
151
+ ```python
152
+ """Your provider interceptor."""
153
+
154
+ from __future__ import annotations
155
+
156
+ from agenteval.core.models import LLMCall
157
+ from agenteval.interceptors.base import Interceptor
158
+
159
+
160
+ class YourProviderInterceptor(Interceptor):
161
+ """Intercept calls to YourProvider SDK."""
162
+
163
+ name = "your_provider"
164
+
165
+ def install(self) -> None:
166
+ """Monkey-patch or hook into the provider SDK."""
167
+ # Hook into the SDK here
168
+ pass
169
+
170
+ def uninstall(self) -> None:
171
+ """Restore original SDK behavior."""
172
+ pass
173
+ ```
174
+
175
+ ### 2. Register in InterceptorRegistry
176
+
177
+ Add the auto-detect logic in `src/agenteval/interceptors/base.py`.
178
+
179
+ ### 3. Add pricing data
180
+
181
+ Add model pricing to `src/agenteval/interceptors/data/pricing.json`.
182
+
183
+ ## Issue Labels
184
+
185
+ | Label | Use for |
186
+ |---|---|
187
+ | `bug` | Something is broken |
188
+ | `enhancement` | New feature or improvement |
189
+ | `evaluator` | New evaluator proposal |
190
+ | `good-first-issue` | Good for newcomers |
191
+ | `documentation` | Documentation improvements |
192
+
193
+ ## Code Style
194
+
195
+ - Python 3.10+, type hints everywhere
196
+ - `ruff` for linting and formatting
197
+ - `mypy` strict mode on public API
198
+ - Docstrings on all public functions
199
+ - Tests for all new code (TDD preferred)
200
+
201
+ ## Pull Request Checklist
202
+
203
+ - [ ] Tests added/updated
204
+ - [ ] `ruff check` passes
205
+ - [ ] `mypy` passes
206
+ - [ ] Documentation updated (if applicable)
207
+ - [ ] Commit messages follow conventional commits
208
+
209
+ ## License
210
+
211
+ By contributing, you agree that your contributions will be licensed under the MIT License.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Devbrat Anand
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,44 @@
1
+ .PHONY: install dev test lint typecheck format clean build publish docs
2
+
3
+ install: ## Install agenteval with all extras
4
+ uv pip install -e ".[all]"
5
+
6
+ dev: ## Install with dev dependencies
7
+ uv pip install -e ".[all,dev]"
8
+
9
+ test: ## Run tests with coverage
10
+ pytest tests/ -v --cov=agenteval --cov-report=term-missing
11
+
12
+ test-unit: ## Run unit tests only
13
+ pytest tests/unit/ -v
14
+
15
+ lint: ## Run linter
16
+ ruff check src/ tests/
17
+
18
+ typecheck: ## Run type checker
19
+ mypy src/agenteval/
20
+
21
+ format: ## Format code
22
+ ruff format src/ tests/
23
+ ruff check --fix src/ tests/
24
+
25
+ clean: ## Remove build artifacts
26
+ rm -rf build/ dist/ *.egg-info .pytest_cache .mypy_cache .ruff_cache
27
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
28
+
29
+ build: ## Build package
30
+ uv build
31
+
32
+ publish: build ## Publish to PyPI (use trusted publisher in CI)
33
+ uv publish
34
+
35
+ docs: ## Serve docs locally
36
+ mkdocs serve
37
+
38
+ docs-build: ## Build docs
39
+ mkdocs build
40
+
41
+ help: ## Show this help
42
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'
43
+
44
+ .DEFAULT_GOAL := help