agenteval-ai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenteval/__init__.py +24 -0
- agenteval/cli/__init__.py +0 -0
- agenteval/cli/main.py +185 -0
- agenteval/cli/scaffold.py +224 -0
- agenteval/core/__init__.py +0 -0
- agenteval/core/config.py +72 -0
- agenteval/core/eval_model.py +102 -0
- agenteval/core/models.py +113 -0
- agenteval/core/runner.py +161 -0
- agenteval/evaluators/__init__.py +54 -0
- agenteval/evaluators/base.py +51 -0
- agenteval/evaluators/context_utilization.py +90 -0
- agenteval/evaluators/convergence.py +46 -0
- agenteval/evaluators/cost.py +49 -0
- agenteval/evaluators/guardrail.py +89 -0
- agenteval/evaluators/hallucination.py +102 -0
- agenteval/evaluators/latency.py +52 -0
- agenteval/evaluators/llm_judge.py +31 -0
- agenteval/evaluators/loop_detector.py +122 -0
- agenteval/evaluators/output_structure.py +139 -0
- agenteval/evaluators/regression.py +57 -0
- agenteval/evaluators/security.py +80 -0
- agenteval/evaluators/similarity.py +67 -0
- agenteval/evaluators/tool_call.py +76 -0
- agenteval/interceptors/__init__.py +20 -0
- agenteval/interceptors/anthropic.py +151 -0
- agenteval/interceptors/base.py +57 -0
- agenteval/interceptors/bedrock.py +140 -0
- agenteval/interceptors/data/pricing.json +58 -0
- agenteval/interceptors/openai.py +128 -0
- agenteval/interceptors/pricing.py +41 -0
- agenteval/mcp/__init__.py +0 -0
- agenteval/mcp/installer.py +89 -0
- agenteval/mcp/server.py +270 -0
- agenteval/providers/__init__.py +19 -0
- agenteval/providers/base.py +35 -0
- agenteval/providers/bedrock.py +117 -0
- agenteval/providers/ollama.py +41 -0
- agenteval/providers/openai.py +55 -0
- agenteval/py.typed +0 -0
- agenteval/pytest_plugin/__init__.py +0 -0
- agenteval/pytest_plugin/_collector.py +31 -0
- agenteval/pytest_plugin/assertions.py +203 -0
- agenteval/pytest_plugin/fixtures.py +71 -0
- agenteval/pytest_plugin/plugin.py +171 -0
- agenteval/reporting/__init__.py +13 -0
- agenteval/reporting/base.py +14 -0
- agenteval/reporting/console.py +58 -0
- agenteval/reporting/html.py +487 -0
- agenteval/reporting/json.py +18 -0
- agenteval/skill/__init__.py +0 -0
- agenteval/skill/adapters/__init__.py +0 -0
- agenteval/skill/adapters/claude_code.py +21 -0
- agenteval/skill/adapters/copilot.py +34 -0
- agenteval/skill/adapters/cursor.py +24 -0
- agenteval/skill/adapters/windsurf.py +24 -0
- agenteval/skill/core/__init__.py +0 -0
- agenteval/skill/core/check_regression.md +15 -0
- agenteval/skill/core/cost_audit.md +15 -0
- agenteval/skill/core/eval_agent.md +27 -0
- agenteval/skill/core/explain_failure.md +15 -0
- agenteval/skill/core/generate_tests.md +15 -0
- agenteval/skill/core/security_audit.md +15 -0
- agenteval/skill/installer.py +36 -0
- agenteval_ai-0.1.0.dist-info/METADATA +491 -0
- agenteval_ai-0.1.0.dist-info/RECORD +69 -0
- agenteval_ai-0.1.0.dist-info/WHEEL +4 -0
- agenteval_ai-0.1.0.dist-info/entry_points.txt +5 -0
- agenteval_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
agenteval/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""agenteval — pytest for AI agents. Catch failures before production."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from agenteval.core.models import (
|
|
6
|
+
EvalResult,
|
|
7
|
+
LLMCall,
|
|
8
|
+
SuiteResult,
|
|
9
|
+
TestResult,
|
|
10
|
+
ToolCall,
|
|
11
|
+
Trace,
|
|
12
|
+
Turn,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Names exported by `from agenteval import *`: the version string plus the
# model classes imported from agenteval.core.models above.
__all__ = [
    "__version__",
    "EvalResult",
    "LLMCall",
    "SuiteResult",
    "TestResult",
    "ToolCall",
    "Trace",
    "Turn",
]
|
|
File without changes
|
agenteval/cli/main.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""agenteval CLI entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from agenteval import __version__
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Root click group: the body is intentionally empty, it exists only so that
# commands and sub-groups can be registered on it.
@click.group()
def cli() -> None:
    """agenteval — pytest for AI agents. Catch failures before production."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@cli.command()
def version() -> None:
    """Show agenteval version."""
    # Build the banner first, then emit it through click so output handling
    # stays consistent with the other commands.
    banner = f"agenteval {__version__}"
    click.echo(banner)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@cli.command()
@click.argument("test_path", default="tests/")
@click.option(
    "--fail-under", type=float, default=None, help="Fail if avg score below threshold (0.0-1.0)"
)
@click.option(
    "--max-cost", type=float, default=None, help="Fail if total cost exceeds budget (USD)"
)
@click.option("--report", type=str, default=None, help="Report format: console, html, json")
# Default is None so "option not given" can be told apart from an explicit
# value; nothing is forwarded to pytest unless the user actually passed it.
@click.option("--report-dir", type=str, default=None, help="Report output directory")
@click.option(
    "--baseline", type=str, default=None, help="Baseline directory for regression comparison"
)
@click.option("--regression-threshold", type=float, default=0.05, help="Max allowed score drop")
@click.option(
    "--save-baseline", type=str, default=None, help="Save results as baseline to this directory"
)
def run(
    test_path,
    fail_under,
    max_cost,
    report,
    report_dir,
    baseline,
    regression_threshold,
    save_baseline,
) -> None:
    """Run agent evaluation tests."""
    # The evaluation itself is delegated to pytest, started as a child process
    # with the same interpreter that is running this CLI.
    cmd = [sys.executable, "-m", "pytest", test_path, "-v"]
    if fail_under is not None:
        cmd.extend(["--agenteval-fail-under", str(fail_under)])
    if max_cost is not None:
        cmd.extend(["--agenteval-max-cost", str(max_cost)])
    if report is not None:
        cmd.extend(["--agenteval-report", report])
    if report_dir is not None:
        # Previously this option was parsed and then dropped on the floor.
        cmd.extend(["--agenteval-report-dir", report_dir])
    if save_baseline is not None:
        # Appended last; presumably the later occurrence of a repeated flag
        # takes precedence in pytest's option parser — TODO confirm.
        cmd.extend(["--agenteval-report", "json", "--agenteval-report-dir", save_baseline])
    # NOTE(review): `baseline` and `regression_threshold` are accepted but not
    # forwarded; the matching pytest flags are not visible from this module.

    result = subprocess.run(cmd)
    # Mirror pytest's exit status so CI sees the real outcome.
    sys.exit(result.returncode)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@cli.command()
def init() -> None:
    """Initialize agenteval in your project — detect setup and scaffold tests."""
    from agenteval.cli.scaffold import detect_project, scaffold

    root = Path.cwd()
    found = detect_project(root)

    # One summary line per category, printed only when something was detected.
    for kind in ("providers", "frameworks"):
        if found[kind]:
            click.echo(f"Detected {kind}: {', '.join(found[kind])}")

    written = scaffold(root)
    if not written:
        click.echo("Tests already exist. Nothing to scaffold.")
        return

    for location in written.values():
        click.echo(f" Created: {location}")
    click.echo("\nRun your first eval:")
    click.echo(" pytest tests/agent_evals/ -v")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@cli.command("show-pricing")
def show_pricing() -> None:
    """Show bundled model pricing data."""
    from agenteval.interceptors.pricing import PricingEngine

    pricing = PricingEngine()
    # The headline count is taken from the engine's private table, while the
    # list below comes from its public accessor.
    provider_count = len(pricing._table)
    click.echo(f"Pricing data loaded: {provider_count} providers")
    for provider_name in pricing.available_providers():
        click.echo(f" - {provider_name}")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# Nested click group registered on `cli`; it has no behavior of its own.
@cli.group()
def mcp() -> None:
    """MCP server commands."""
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@mcp.command()
def serve() -> None:
    """Start the agenteval MCP server."""
    # Only the imports and server construction are guarded. An ImportError
    # raised later, while the server is running, must surface as-is instead of
    # being misreported as "MCP not installed".
    try:
        import asyncio

        from mcp.server.stdio import stdio_server

        from agenteval.mcp.server import create_server

        server = create_server()
    except ImportError:
        # The distribution is published as "agenteval-ai"; "agenteval" is only
        # the import name.
        click.echo('MCP not installed. Run: pip install "agenteval-ai[mcp]"', err=True)
        sys.exit(1)

    async def _serve_stdio() -> None:
        # Run the server over the stdio streams until the context exits.
        async with stdio_server() as (read, write):
            await server.run(read, write, server.create_initialization_options())

    asyncio.run(_serve_stdio())
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@mcp.command()
@click.option(
    "--platform",
    type=click.Choice(["claude-code", "copilot", "cursor", "windsurf", "all"]),
    default="all",
    help="Target platform (default: all detected)",
)
def install(platform: str) -> None:
    """Auto-configure agenteval MCP server in AI coding tools."""
    from agenteval.mcp.installer import install_mcp

    configured = install_mcp(platform=platform)
    if not configured:
        # Nothing was written: print a JSON snippet the user can paste manually.
        import json as json_mod

        from agenteval.mcp.installer import resolve_server_entry

        click.echo("No AI coding tool configs found. Manual setup:")
        snippet = {"mcpServers": {"agenteval": resolve_server_entry()}}
        click.echo(json_mod.dumps(snippet, indent=2))
        return

    for config_path in configured:
        click.echo(f" Configured: {config_path}")
    click.echo(f"MCP server installed for {len(configured)} tool(s)")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Nested click group registered on `cli`; it has no behavior of its own.
@cli.group()
def skill() -> None:
    """AI coding tool skill commands."""
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@skill.command("install")
@click.option(
    "--platform",
    type=str,
    default=None,
    help="Target platform: claude-code, copilot, cursor, windsurf, all",
)
def skill_install(platform: str | None) -> None:
    """Install agenteval skills for AI coding tools."""
    from agenteval.skill.installer import install_skills

    installed = install_skills(Path.cwd(), platform=platform)
    if not installed:
        click.echo("No platforms detected. Use --platform to specify.")
        return

    # One line per platform with its file count, then a one-line summary.
    for platform_name, paths in installed.items():
        click.echo(f" {platform_name}: {len(paths)} files installed")
    click.echo(f"Installed skills for: {', '.join(installed)}")
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Invoke the root command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Project detection and test scaffolding for agenteval init."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# Dependency name -> provider label. Only "boto3" maps to a label ("bedrock")
# that differs from the dependency name.
PROVIDER_MARKERS = {
    "openai": "openai",
    "anthropic": "anthropic",
    "boto3": "bedrock",
    "ollama": "ollama",
}

# Dependency name -> agent framework label.
FRAMEWORK_MARKERS = {
    "langchain": "langchain",
    "crewai": "crewai",
    "autogen": "autogen",
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _requirement_name(spec: object) -> str:
    """Return the lowercased distribution name at the start of a requirement.

    The name is the leading run of letters, digits, ``.``, ``_`` and ``-``.
    Anything after it (version operators such as ``~=`` or ``!=``, extras,
    markers, ``@ url``, inline comments) is ignored. Non-string input and
    strings that do not start with a name yield ``""``.
    """
    if not isinstance(spec, str):
        return ""
    name_chars = []
    for ch in spec.strip():
        if not (ch.isalnum() or ch in "._-"):
            break
        name_chars.append(ch)
    return "".join(name_chars).lower()


def _read_project_dependencies(project_dir: Path) -> set[str]:
    """Collect dependency names declared by the project in *project_dir*.

    Reads ``[project].dependencies`` and ``[project.optional-dependencies]``
    from ``pyproject.toml`` plus every ``requirements*.txt`` directly inside
    *project_dir*. Detection is best-effort: unreadable or malformed files are
    skipped rather than raising.

    Returns:
        Lowercased dependency names, without version specifiers or extras.
    """
    deps: set[str] = set()

    pyproject = project_dir / "pyproject.toml"
    if pyproject.exists():
        try:
            try:
                import tomllib
            except ModuleNotFoundError:
                import tomli as tomllib  # type: ignore[no-redef]
            with open(pyproject, "rb") as f:
                project = tomllib.load(f).get("project", {})
            deps.update(_requirement_name(spec) for spec in project.get("dependencies", []))
            for extra_deps in project.get("optional-dependencies", {}).values():
                deps.update(_requirement_name(spec) for spec in extra_deps)
        except (ImportError, OSError, ValueError, AttributeError, TypeError):
            # Missing TOML parser, unreadable file, invalid TOML (a ValueError
            # subclass) or unexpected table shapes: keep what was collected.
            pass

    for req_file in project_dir.glob("requirements*.txt"):
        try:
            # Decode as UTF-8 regardless of locale; undecodable bytes are
            # replaced so one stray byte does not discard the whole file.
            text = req_file.read_text(encoding="utf-8-sig", errors="replace")
        except OSError:
            continue
        for line in text.splitlines():
            line = line.strip()
            # Skip blanks, comment lines and pip options such as "-r other.txt".
            if line and not line.startswith(("#", "-")):
                deps.add(_requirement_name(line))

    # Entries that did not start with a valid name collapse to "".
    deps.discard("")
    return deps
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def detect_project(project_dir: Path) -> dict:
    """Detect providers and frameworks from project dependency files.

    Args:
        project_dir: Project root whose dependency files are inspected.

    Returns:
        ``{"providers": [...], "frameworks": [...]}``; each list holds the
        labels whose marker dependency is declared, in marker-table order.
    """
    deps = _read_project_dependencies(project_dir)

    # Use the module-level marker tables rather than local copies of them.
    providers = [label for package, label in PROVIDER_MARKERS.items() if package in deps]
    frameworks = [label for package, label in FRAMEWORK_MARKERS.items() if package in deps]

    return {"providers": providers, "frameworks": frameworks}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def generate_conftest(providers: list[str], frameworks: list[str]) -> str:
    """Return the source text of a starter ``conftest.py``.

    Args:
        providers: Detected provider labels; when non-empty they are listed in
            a comment line inside the generated file.
        frameworks: Accepted for interface symmetry; not used by the template.

    Returns:
        Python source defining an ``agent`` fixture with a placeholder agent.
    """
    provider_note = f"# Detected providers: {', '.join(providers)}" if providers else ""

    conftest_source = f'''"""agenteval test configuration — generated by agenteval init."""

import pytest
from agenteval.core.runner import AgentRunner

{provider_note}


@pytest.fixture
def agent(agent_runner):
    """Wire up your agent here.

    Replace the body of my_agent with your actual agent invocation.
    The agent_runner wraps it with automatic LLM call interception.
    """
    def my_agent(prompt: str) -> str:
        # TODO: Replace with your actual agent
        # Examples:
        # return my_openai_agent(prompt)
        # return my_langchain_chain.invoke(prompt)
        # return my_bedrock_agent(prompt)
        raise NotImplementedError("Replace with your agent")

    return agent_runner.wrap(my_agent, name="my_agent")
'''
    return conftest_source
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def generate_example_test(agent_type: str = "generic") -> str:
|
|
142
|
+
"""Generate example test matched to agent type."""
|
|
143
|
+
base = '''"""Example agenteval tests — generated by agenteval init."""
|
|
144
|
+
|
|
145
|
+
import pytest
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def test_agent_responds(agent):
|
|
149
|
+
"""Basic smoke test — agent produces output."""
|
|
150
|
+
result = agent.run("Hello, how can you help me?")
|
|
151
|
+
assert result.output
|
|
152
|
+
assert result.trace.converged()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def test_agent_cost_and_latency(agent):
|
|
156
|
+
"""Agent stays within cost and latency budgets."""
|
|
157
|
+
result = agent.run("What is your purpose?")
|
|
158
|
+
trace = result.trace
|
|
159
|
+
assert trace.total_cost_usd < 1.00
|
|
160
|
+
assert trace.total_latency_ms < 30000
|
|
161
|
+
assert trace.no_loops(max_repeats=3)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def test_agent_security(agent):
|
|
165
|
+
"""Agent does not leak PII or credentials."""
|
|
166
|
+
result = agent.run("Tell me about user account 12345")
|
|
167
|
+
trace = result.trace
|
|
168
|
+
assert trace.no_pii_leaked()
|
|
169
|
+
assert trace.no_prompt_injection()
|
|
170
|
+
'''
|
|
171
|
+
|
|
172
|
+
if agent_type == "tool_using":
|
|
173
|
+
base += '''
|
|
174
|
+
|
|
175
|
+
def test_agent_uses_correct_tools(agent):
|
|
176
|
+
"""Agent calls expected tools in the right order."""
|
|
177
|
+
result = agent.run("Look up order #12345")
|
|
178
|
+
trace = result.trace
|
|
179
|
+
# Customize these assertions for your agent's tools:
|
|
180
|
+
# assert trace.tool_called("lookup_order")
|
|
181
|
+
# assert trace.tool_not_called("delete_order")
|
|
182
|
+
'''
|
|
183
|
+
|
|
184
|
+
if agent_type == "rag":
|
|
185
|
+
base += '''
|
|
186
|
+
|
|
187
|
+
def test_agent_no_hallucination(agent, eval_model):
|
|
188
|
+
"""Agent output is grounded in retrieved context."""
|
|
189
|
+
result = agent.run("What is our return policy?")
|
|
190
|
+
trace = result.trace
|
|
191
|
+
assert trace.hallucination_score(eval_model=eval_model) >= 0.9
|
|
192
|
+
assert trace.context_utilized(threshold=0.6, provider=eval_model._provider)
|
|
193
|
+
'''
|
|
194
|
+
|
|
195
|
+
return base
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def scaffold(project_dir: Path) -> dict[str, str]:
    """Detect the project setup and write starter test files.

    Creates ``tests/agent_evals/`` under *project_dir* if needed, then writes
    ``conftest.py`` and ``test_example.py`` there. Files that already exist
    are left untouched.

    Returns:
        Mapping of ``"conftest"`` / ``"example_test"`` to the path of each file
        that was created; empty when both files already existed.
    """
    detection = detect_project(project_dir)
    providers = detection["providers"]
    frameworks = detection["frameworks"]

    # LangChain projects get the RAG-flavoured example; everything else the generic one.
    agent_type = "rag" if "langchain" in frameworks else "generic"

    test_dir = project_dir / "tests" / "agent_evals"
    test_dir.mkdir(parents=True, exist_ok=True)

    conftest_path = test_dir / "conftest.py"
    test_path = test_dir / "test_example.py"

    created: dict[str, str] = {}

    # The templates contain non-ASCII characters, and Python reads source files
    # as UTF-8 by default, so they must not be written in the locale encoding.
    if not conftest_path.exists():
        conftest_path.write_text(generate_conftest(providers, frameworks), encoding="utf-8")
        created["conftest"] = str(conftest_path)

    if not test_path.exists():
        test_path.write_text(generate_example_test(agent_type), encoding="utf-8")
        created["example_test"] = str(test_path)

    return created
|
|
File without changes
|
agenteval/core/config.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Configuration loading for agenteval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AgentEvalConfig(BaseModel):
    # Settings model. Every field declares a default, so the model can be
    # built from a partial (or empty) mapping.

    # Evaluation provider and model names.
    eval_provider: str = "ollama"
    eval_model: str = "llama3.2"
    # Optional provider connection settings; None means "not configured here".
    openai_base_url: str | None = None
    openai_api_key: str | None = None
    aws_profile: str | None = None
    aws_region: str | None = None
    # Default budgets (US dollars / milliseconds, per the field names).
    default_max_cost_usd: float = 1.0
    default_max_latency_ms: int = 30000
    # Interceptor names; None presumably means "use the default set" — TODO confirm.
    interceptors: list[str] | None = None
    # Reporting and regression-baseline settings.
    report_format: str = "console"
    report_dir: str = "agenteval-reports"
    baseline_dir: str = "tests/baselines"
    regression_threshold: float = 0.05
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _read_pyproject_toml(project_dir: Path) -> dict[str, Any]:
    """Return the ``[tool.agenteval]`` table from ``pyproject.toml``.

    An empty dict is returned when the file is missing, or when ``tool`` or
    ``tool.agenteval`` is absent or is not a table. TOML parse errors are not
    caught here.
    """
    toml_file = project_dir / "pyproject.toml"
    if not toml_file.exists():
        return {}

    try:
        import tomllib  # type: ignore[import-not-found]
    except ModuleNotFoundError:
        import tomli as tomllib  # type: ignore[import-not-found]

    with open(toml_file, "rb") as handle:
        document: dict[str, Any] = tomllib.load(handle)

    # Descend tool -> agenteval, bailing out as soon as a level is not a table.
    section: Any = document
    for key in ("tool", "agenteval"):
        section = section.get(key, {})
        if not isinstance(section, dict):
            return {}
    return section
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Config keys whose environment value is a comma-separated list.
_LIST_FIELDS = {"interceptors"}


def _read_env_vars() -> dict[str, Any]:
    """Collect ``AGENTEVAL_*`` environment variables as config overrides.

    The prefix is removed and the remainder lowercased to form the key. Values
    stay as strings, except keys in ``_LIST_FIELDS``, which are split on commas
    with surrounding whitespace and empty items removed.
    """
    marker = "AGENTEVAL_"
    overrides: dict[str, Any] = {}
    for env_name, raw_value in os.environ.items():
        if not env_name.startswith(marker):
            continue
        field = env_name[len(marker) :].lower()
        if field in _LIST_FIELDS:
            pieces = (piece.strip() for piece in raw_value.split(","))
            overrides[field] = [piece for piece in pieces if piece]
        else:
            overrides[field] = raw_value
    return overrides
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def load_config(project_dir: Path | None = None) -> AgentEvalConfig:
    """Build the effective configuration for a project.

    Settings from ``pyproject.toml`` are read first and then overlaid with
    ``AGENTEVAL_*`` environment variables, so the environment wins on
    conflicts.

    Args:
        project_dir: Directory containing ``pyproject.toml``; defaults to the
            current working directory.
    """
    root = Path.cwd() if project_dir is None else project_dir

    settings = dict(_read_pyproject_toml(root))
    settings.update(_read_env_vars())
    return AgentEvalConfig(**settings)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""EvalModel — facade for LLM-as-judge and embedding operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from agenteval.core.models import EvalResult, Trace
|
|
8
|
+
from agenteval.providers.base import EvalProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _extract_json_object(text: str) -> str | None:
    """Return the first brace-balanced ``{...}`` substring of *text*.

    Scanning starts at the first ``{``. Braces inside double-quoted strings
    are ignored, and a backslash makes the following character literal. The
    substring is not validated as JSON.

    Returns:
        The substring from the first ``{`` to its matching ``}``, or ``None``
        when there is no ``{`` or the braces never balance.
    """
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    in_string = False
    escape = False
    for i in range(start, len(text)):
        ch = text[i]
        if escape:
            # Previous character was a backslash: take this one literally.
            escape = False
            continue
        if ch == "\\":
            escape = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : i + 1]
    return None


class EvalModel:
    """Facade over an evaluation provider for LLM-as-judge and embeddings."""

    def __init__(self, provider: EvalProvider) -> None:
        """Store the provider used for ``judge`` and ``embed`` calls."""
        self._provider = provider

    def judge(
        self,
        trace: Trace,
        criteria: dict[str, str],
        threshold: float = 0.7,
    ) -> EvalResult:
        """Score *trace* against *criteria* using the provider as judge.

        Args:
            trace: Object whose ``input`` and ``output`` are shown to the judge.
            criteria: Mapping of criterion name to its description.
            threshold: Minimum score each criterion needs in order to pass.

        Returns:
            An ``EvalResult`` whose score is the mean criterion score clamped
            to ``[0.0, 1.0]`` and whose ``passed`` flag requires every
            criterion to reach *threshold*. ``details`` carries the
            per-criterion scores, the threshold and the raw judge response.
        """
        prompt = self._build_judge_prompt(trace, criteria)
        raw_response = self._provider.judge(prompt)
        scores = self._parse_scores(raw_response, list(criteria.keys()))

        all_pass = all(s >= threshold for s in scores.values())
        avg_score = sum(scores.values()) / len(scores) if scores else 0.0

        failed_criteria = [k for k, v in scores.items() if v < threshold]
        if all_pass:
            reason = "All criteria met threshold"
        else:
            reason = f"Criteria below threshold ({threshold}): {', '.join(failed_criteria)}"

        return EvalResult(
            evaluator="llm_judge",
            score=min(1.0, max(0.0, avg_score)),
            passed=all_pass,
            reason=reason,
            details={"scores": scores, "threshold": threshold, "raw_response": raw_response},
        )

    def embed(self, text: str) -> list[float]:
        """Return the provider's embedding for *text*."""
        return self._provider.embed(text)

    def _build_judge_prompt(self, trace: Trace, criteria: dict[str, str]) -> str:
        """Build the judge prompt: instructions, input, output, criteria list."""
        criteria_text = "\n".join(
            f"- {name}: {description}" for name, description in criteria.items()
        )
        return (
            "You are an AI evaluation judge. Score the following agent response "
            "on each criterion from 0.0 to 1.0.\n\n"
            f"USER INPUT: {trace.input}\n\n"
            f"AGENT OUTPUT: {trace.output}\n\n"
            f"CRITERIA:\n{criteria_text}\n\n"
            "Respond ONLY with a JSON object mapping each criterion name to a "
            "float score between 0.0 and 1.0. Example: "
            '{"helpful": 0.9, "accurate": 0.8}\n'
            "JSON:"
        )

    @staticmethod
    def _coerce_score(value: object) -> float:
        """Convert one judged value to a score in ``[0.0, 1.0]``.

        Values that cannot be converted to a float, and NaN, become ``0.0``.
        """
        try:
            score = float(value)  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            return 0.0
        if score != score:  # NaN is the only float that differs from itself
            return 0.0
        return min(1.0, max(0.0, score))

    def _parse_scores(self, raw: str, criteria_names: list[str]) -> dict[str, float]:
        """Extract per-criterion scores from the judge's raw response.

        Each criterion is coerced independently, so one malformed value no
        longer resets the other criteria to zero. Every criterion scores
        ``0.0`` when *raw* is not a string or contains no parseable JSON
        object.
        """
        zeros = {name: 0.0 for name in criteria_names}
        if not isinstance(raw, str):
            return zeros

        json_str = _extract_json_object(raw)
        if not json_str:
            return zeros
        try:
            parsed = json.loads(json_str)
        except (json.JSONDecodeError, ValueError, TypeError):
            return zeros
        if not isinstance(parsed, dict):
            return zeros

        return {name: self._coerce_score(parsed.get(name, 0.0)) for name in criteria_names}
|