agenteval-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. agenteval/__init__.py +24 -0
  2. agenteval/cli/__init__.py +0 -0
  3. agenteval/cli/main.py +185 -0
  4. agenteval/cli/scaffold.py +224 -0
  5. agenteval/core/__init__.py +0 -0
  6. agenteval/core/config.py +72 -0
  7. agenteval/core/eval_model.py +102 -0
  8. agenteval/core/models.py +113 -0
  9. agenteval/core/runner.py +161 -0
  10. agenteval/evaluators/__init__.py +54 -0
  11. agenteval/evaluators/base.py +51 -0
  12. agenteval/evaluators/context_utilization.py +90 -0
  13. agenteval/evaluators/convergence.py +46 -0
  14. agenteval/evaluators/cost.py +49 -0
  15. agenteval/evaluators/guardrail.py +89 -0
  16. agenteval/evaluators/hallucination.py +102 -0
  17. agenteval/evaluators/latency.py +52 -0
  18. agenteval/evaluators/llm_judge.py +31 -0
  19. agenteval/evaluators/loop_detector.py +122 -0
  20. agenteval/evaluators/output_structure.py +139 -0
  21. agenteval/evaluators/regression.py +57 -0
  22. agenteval/evaluators/security.py +80 -0
  23. agenteval/evaluators/similarity.py +67 -0
  24. agenteval/evaluators/tool_call.py +76 -0
  25. agenteval/interceptors/__init__.py +20 -0
  26. agenteval/interceptors/anthropic.py +151 -0
  27. agenteval/interceptors/base.py +57 -0
  28. agenteval/interceptors/bedrock.py +140 -0
  29. agenteval/interceptors/data/pricing.json +58 -0
  30. agenteval/interceptors/openai.py +128 -0
  31. agenteval/interceptors/pricing.py +41 -0
  32. agenteval/mcp/__init__.py +0 -0
  33. agenteval/mcp/installer.py +89 -0
  34. agenteval/mcp/server.py +270 -0
  35. agenteval/providers/__init__.py +19 -0
  36. agenteval/providers/base.py +35 -0
  37. agenteval/providers/bedrock.py +117 -0
  38. agenteval/providers/ollama.py +41 -0
  39. agenteval/providers/openai.py +55 -0
  40. agenteval/py.typed +0 -0
  41. agenteval/pytest_plugin/__init__.py +0 -0
  42. agenteval/pytest_plugin/_collector.py +31 -0
  43. agenteval/pytest_plugin/assertions.py +203 -0
  44. agenteval/pytest_plugin/fixtures.py +71 -0
  45. agenteval/pytest_plugin/plugin.py +171 -0
  46. agenteval/reporting/__init__.py +13 -0
  47. agenteval/reporting/base.py +14 -0
  48. agenteval/reporting/console.py +58 -0
  49. agenteval/reporting/html.py +487 -0
  50. agenteval/reporting/json.py +18 -0
  51. agenteval/skill/__init__.py +0 -0
  52. agenteval/skill/adapters/__init__.py +0 -0
  53. agenteval/skill/adapters/claude_code.py +21 -0
  54. agenteval/skill/adapters/copilot.py +34 -0
  55. agenteval/skill/adapters/cursor.py +24 -0
  56. agenteval/skill/adapters/windsurf.py +24 -0
  57. agenteval/skill/core/__init__.py +0 -0
  58. agenteval/skill/core/check_regression.md +15 -0
  59. agenteval/skill/core/cost_audit.md +15 -0
  60. agenteval/skill/core/eval_agent.md +27 -0
  61. agenteval/skill/core/explain_failure.md +15 -0
  62. agenteval/skill/core/generate_tests.md +15 -0
  63. agenteval/skill/core/security_audit.md +15 -0
  64. agenteval/skill/installer.py +36 -0
  65. agenteval_ai-0.1.0.dist-info/METADATA +491 -0
  66. agenteval_ai-0.1.0.dist-info/RECORD +69 -0
  67. agenteval_ai-0.1.0.dist-info/WHEEL +4 -0
  68. agenteval_ai-0.1.0.dist-info/entry_points.txt +5 -0
  69. agenteval_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
agenteval/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """agenteval — pytest for AI agents. Catch failures before production."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from agenteval.core.models import (
6
+ EvalResult,
7
+ LLMCall,
8
+ SuiteResult,
9
+ TestResult,
10
+ ToolCall,
11
+ Trace,
12
+ Turn,
13
+ )
14
+
15
+ __all__ = [
16
+ "__version__",
17
+ "EvalResult",
18
+ "LLMCall",
19
+ "SuiteResult",
20
+ "TestResult",
21
+ "ToolCall",
22
+ "Trace",
23
+ "Turn",
24
+ ]
File without changes
agenteval/cli/main.py ADDED
@@ -0,0 +1,185 @@
1
+ """agenteval CLI entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import click
10
+
11
+ from agenteval import __version__
12
+
13
+
14
# Root click group: the commands and sub-groups defined in this module attach to it.
# The docstring below doubles as the CLI help text.
@click.group()
def cli() -> None:
    """agenteval — pytest for AI agents. Catch failures before production."""
17
+
18
+
19
@cli.command()
def version() -> None:
    """Show agenteval version."""
    # Print the package version imported from the top-level agenteval module.
    banner = f"agenteval {__version__}"
    click.echo(banner)
23
+
24
+
25
@cli.command()
@click.argument("test_path", default="tests/")
@click.option(
    "--fail-under", type=float, default=None, help="Fail if avg score below threshold (0.0-1.0)"
)
@click.option(
    "--max-cost", type=float, default=None, help="Fail if total cost exceeds budget (USD)"
)
@click.option("--report", type=str, default=None, help="Report format: console, html, json")
@click.option("--report-dir", type=str, default="agenteval-reports", help="Report output directory")
@click.option(
    "--baseline", type=str, default=None, help="Baseline directory for regression comparison"
)
@click.option("--regression-threshold", type=float, default=0.05, help="Max allowed score drop")
@click.option(
    "--save-baseline", type=str, default=None, help="Save results as baseline to this directory"
)
def run(
    test_path,
    fail_under,
    max_cost,
    report,
    report_dir,
    baseline,
    regression_threshold,
    save_baseline,
) -> None:
    """Run agent evaluation tests."""
    # Builds a `python -m pytest` command line, translating CLI options into
    # `--agenteval-*` pytest flags, runs it, and exits with pytest's return code.
    from click.core import ParameterSource

    cmd = [sys.executable, "-m", "pytest", test_path, "-v"]
    if fail_under is not None:
        cmd.extend(["--agenteval-fail-under", str(fail_under)])
    if max_cost is not None:
        cmd.extend(["--agenteval-max-cost", str(max_cost)])
    if report is not None:
        cmd.extend(["--agenteval-report", report])

    # --report-dir used to be accepted and then silently dropped. Forward it only
    # when the user actually supplied it, so the CLI default never overrides a
    # directory configured elsewhere.
    ctx = click.get_current_context()
    if ctx.get_parameter_source("report_dir") is not ParameterSource.DEFAULT:
        cmd.extend(["--agenteval-report-dir", report_dir])

    # Appended last on purpose: saving a baseline forces the JSON report and its
    # own output directory, exactly as before.
    if save_baseline is not None:
        cmd.extend(["--agenteval-report", "json", "--agenteval-report-dir", save_baseline])

    # NOTE(review): --baseline and --regression-threshold are still not forwarded;
    # the matching pytest flag names are not visible from this module — confirm
    # against the pytest plugin before wiring them through.
    result = subprocess.run(cmd)
    sys.exit(result.returncode)
64
+
65
+
66
@cli.command()
def init() -> None:
    """Initialize agenteval in your project — detect setup and scaffold tests."""
    from agenteval.cli.scaffold import detect_project, scaffold

    root = Path.cwd()
    found = detect_project(root)

    # Announce whatever was detected, providers first, then frameworks.
    for category in ("providers", "frameworks"):
        names = found[category]
        if names:
            click.echo(f"Detected {category}: {', '.join(names)}")

    written = scaffold(root)
    if not written:
        click.echo("Tests already exist. Nothing to scaffold.")
        return

    for location in written.values():
        click.echo(f" Created: {location}")
    click.echo("\nRun your first eval:")
    click.echo(" pytest tests/agent_evals/ -v")
87
+
88
+
89
@cli.command("show-pricing")
def show_pricing() -> None:
    """Show bundled model pricing data."""
    from agenteval.interceptors.pricing import PricingEngine

    pricing = PricingEngine()
    # The count is taken from the engine's internal table; the names come from
    # its public accessor.
    provider_count = len(pricing._table)
    click.echo(f"Pricing data loaded: {provider_count} providers")
    for provider_name in pricing.available_providers():
        click.echo(f" - {provider_name}")
98
+
99
+
100
# Sub-group registered on `cli`; the MCP subcommands in this module attach to it.
@cli.group()
def mcp() -> None:
    """MCP server commands."""
103
+
104
+
105
@mcp.command()
def serve() -> None:
    """Start the agenteval MCP server."""
    # Guard only the imports: an ImportError raised later, while the server is
    # running, is a genuine failure and must not be reported as "MCP not installed".
    try:
        import asyncio

        from mcp.server.stdio import stdio_server

        from agenteval.mcp.server import create_server
    except ImportError:
        # The distribution is published as "agenteval-ai"; "agenteval" is only
        # the import name.
        click.echo('MCP not installed. Run: pip install "agenteval-ai[mcp]"', err=True)
        sys.exit(1)

    server = create_server()

    async def _serve_stdio() -> None:
        # Run the server over the stdio transport until the streams close.
        async with stdio_server() as (read, write):
            await server.run(read, write, server.create_initialization_options())

    asyncio.run(_serve_stdio())
125
+
126
+
127
@mcp.command()
@click.option(
    "--platform",
    type=click.Choice(["claude-code", "copilot", "cursor", "windsurf", "all"]),
    default="all",
    help="Target platform (default: all detected)",
)
def install(platform: str) -> None:
    """Auto-configure agenteval MCP server in AI coding tools."""
    from agenteval.mcp.installer import install_mcp

    configured = install_mcp(platform=platform)
    if not configured:
        # Nothing was configured: print a JSON snippet for manual setup instead.
        import json as json_mod

        from agenteval.mcp.installer import resolve_server_entry

        click.echo("No AI coding tool configs found. Manual setup:")
        snippet = {"mcpServers": {"agenteval": resolve_server_entry()}}
        click.echo(json_mod.dumps(snippet, indent=2))
        return

    for config_path in configured:
        click.echo(f" Configured: {config_path}")
    click.echo(f"MCP server installed for {len(configured)} tool(s)")
157
+
158
+
159
# Sub-group registered on `cli`; the skill subcommands in this module attach to it.
@cli.group()
def skill() -> None:
    """AI coding tool skill commands."""
162
+
163
+
164
@skill.command("install")
@click.option(
    "--platform",
    type=str,
    default=None,
    help="Target platform: claude-code, copilot, cursor, windsurf, all",
)
def skill_install(platform: str | None) -> None:
    """Install agenteval skills for AI coding tools."""
    from agenteval.skill.installer import install_skills

    installed = install_skills(Path.cwd(), platform=platform)
    if not installed:
        click.echo("No platforms detected. Use --platform to specify.")
        return

    # One line per platform, then a summary of every platform touched.
    for platform_name, skill_files in installed.items():
        click.echo(f" {platform_name}: {len(skill_files)} files installed")
    click.echo(f"Installed skills for: {', '.join(installed)}")
182
+
183
+
184
+ if __name__ == "__main__":
185
+ cli()
@@ -0,0 +1,224 @@
1
+ """Project detection and test scaffolding for agenteval init."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
# Maps a dependency package name to the provider name it indicates
# (note boto3 -> bedrock; the others map to themselves).
PROVIDER_MARKERS = {
    "openai": "openai",
    "anthropic": "anthropic",
    "boto3": "bedrock",
    "ollama": "ollama",
}

# Maps a dependency package name to the agent framework name it indicates.
FRAMEWORK_MARKERS = {
    "langchain": "langchain",
    "crewai": "crewai",
    "autogen": "autogen",
}
19
+
20
+
21
+ def _read_project_dependencies(project_dir: Path) -> set[str]:
22
+ """Read dependency names from pyproject.toml and requirements*.txt."""
23
+ deps: set[str] = set()
24
+
25
+ pyproject = project_dir / "pyproject.toml"
26
+ if pyproject.exists():
27
+ try:
28
+ try:
29
+ import tomllib
30
+ except ModuleNotFoundError:
31
+ import tomli as tomllib # type: ignore[no-redef]
32
+ with open(pyproject, "rb") as f:
33
+ data = tomllib.load(f)
34
+ for dep in data.get("project", {}).get("dependencies", []):
35
+ deps.add(
36
+ dep.split(">")[0]
37
+ .split("<")[0]
38
+ .split("=")[0]
39
+ .split("[")[0]
40
+ .split(";")[0]
41
+ .strip()
42
+ .lower()
43
+ )
44
+ for extra_deps in data.get("project", {}).get("optional-dependencies", {}).values():
45
+ for dep in extra_deps:
46
+ deps.add(
47
+ dep.split(">")[0]
48
+ .split("<")[0]
49
+ .split("=")[0]
50
+ .split("[")[0]
51
+ .split(";")[0]
52
+ .strip()
53
+ .lower()
54
+ )
55
+ except Exception:
56
+ pass
57
+
58
+ for req_file in project_dir.glob("requirements*.txt"):
59
+ try:
60
+ for line in req_file.read_text().splitlines():
61
+ line = line.strip()
62
+ if line and not line.startswith("#") and not line.startswith("-"):
63
+ deps.add(
64
+ line.split(">")[0]
65
+ .split("<")[0]
66
+ .split("=")[0]
67
+ .split("[")[0]
68
+ .split(";")[0]
69
+ .strip()
70
+ .lower()
71
+ )
72
+ except Exception:
73
+ pass
74
+
75
+ return deps
76
+
77
+
78
def detect_project(project_dir: Path) -> dict:
    """Detect providers and frameworks from project dependency files.

    Args:
        project_dir: Project root whose dependency files are inspected.

    Returns:
        ``{"providers": [...], "frameworks": [...]}`` — the names whose marker
        package appears among the project's dependencies, in marker-table order.
    """
    deps = _read_project_dependencies(project_dir)

    # Use the module-level marker tables rather than keeping private copies
    # here that could drift out of sync with them.
    providers = [name for package, name in PROVIDER_MARKERS.items() if package in deps]
    frameworks = [name for package, name in FRAMEWORK_MARKERS.items() if package in deps]

    return {"providers": providers, "frameworks": frameworks}
106
+
107
+
108
def generate_conftest(providers: list[str], frameworks: list[str]) -> str:
    """Build the source text of the scaffolded ``conftest.py``.

    A "Detected providers" comment line is included when *providers* is
    non-empty; *frameworks* is accepted but not used by the template.
    """
    detected_line = f"# Detected providers: {', '.join(providers)}" if providers else ""

    preamble = '''"""agenteval test configuration — generated by agenteval init."""

import pytest
from agenteval.core.runner import AgentRunner

'''

    fixture_source = '''


@pytest.fixture
def agent(agent_runner):
    """Wire up your agent here.

    Replace the body of my_agent with your actual agent invocation.
    The agent_runner wraps it with automatic LLM call interception.
    """
    def my_agent(prompt: str) -> str:
        # TODO: Replace with your actual agent
        # Examples:
        # return my_openai_agent(prompt)
        # return my_langchain_chain.invoke(prompt)
        # return my_bedrock_agent(prompt)
        raise NotImplementedError("Replace with your agent")

    return agent_runner.wrap(my_agent, name="my_agent")
'''

    return preamble + detected_line + fixture_source
139
+
140
+
141
+ def generate_example_test(agent_type: str = "generic") -> str:
142
+ """Generate example test matched to agent type."""
143
+ base = '''"""Example agenteval tests — generated by agenteval init."""
144
+
145
+ import pytest
146
+
147
+
148
+ def test_agent_responds(agent):
149
+ """Basic smoke test — agent produces output."""
150
+ result = agent.run("Hello, how can you help me?")
151
+ assert result.output
152
+ assert result.trace.converged()
153
+
154
+
155
+ def test_agent_cost_and_latency(agent):
156
+ """Agent stays within cost and latency budgets."""
157
+ result = agent.run("What is your purpose?")
158
+ trace = result.trace
159
+ assert trace.total_cost_usd < 1.00
160
+ assert trace.total_latency_ms < 30000
161
+ assert trace.no_loops(max_repeats=3)
162
+
163
+
164
+ def test_agent_security(agent):
165
+ """Agent does not leak PII or credentials."""
166
+ result = agent.run("Tell me about user account 12345")
167
+ trace = result.trace
168
+ assert trace.no_pii_leaked()
169
+ assert trace.no_prompt_injection()
170
+ '''
171
+
172
+ if agent_type == "tool_using":
173
+ base += '''
174
+
175
+ def test_agent_uses_correct_tools(agent):
176
+ """Agent calls expected tools in the right order."""
177
+ result = agent.run("Look up order #12345")
178
+ trace = result.trace
179
+ # Customize these assertions for your agent's tools:
180
+ # assert trace.tool_called("lookup_order")
181
+ # assert trace.tool_not_called("delete_order")
182
+ '''
183
+
184
+ if agent_type == "rag":
185
+ base += '''
186
+
187
+ def test_agent_no_hallucination(agent, eval_model):
188
+ """Agent output is grounded in retrieved context."""
189
+ result = agent.run("What is our return policy?")
190
+ trace = result.trace
191
+ assert trace.hallucination_score(eval_model=eval_model) >= 0.9
192
+ assert trace.context_utilized(threshold=0.6, provider=eval_model._provider)
193
+ '''
194
+
195
+ return base
196
+
197
+
198
def scaffold(project_dir: Path) -> dict[str, str]:
    """Run full scaffold: detect project, generate files.

    Creates ``tests/agent_evals/`` under *project_dir* if needed and writes
    ``conftest.py`` and ``test_example.py`` there. Existing files are never
    overwritten.

    Args:
        project_dir: Project root to inspect and scaffold into.

    Returns:
        Mapping of ``"conftest"`` / ``"example_test"`` to the path of each file
        actually written; empty when both already existed.
    """
    detection = detect_project(project_dir)
    providers = detection["providers"]
    frameworks = detection["frameworks"]

    # A langchain project gets the RAG-flavoured example; everything else the
    # generic one.
    agent_type = "rag" if "langchain" in frameworks else "generic"

    test_dir = project_dir / "tests" / "agent_evals"
    test_dir.mkdir(parents=True, exist_ok=True)

    conftest_path = test_dir / "conftest.py"
    test_path = test_dir / "test_example.py"

    created: dict[str, str] = {}

    # The templates contain non-ASCII characters, and Python source is read as
    # UTF-8 by default, so write UTF-8 explicitly rather than the locale
    # encoding (which would yield an unimportable file on e.g. cp1252 systems).
    if not conftest_path.exists():
        conftest_path.write_text(generate_conftest(providers, frameworks), encoding="utf-8")
        created["conftest"] = str(conftest_path)

    if not test_path.exists():
        test_path.write_text(generate_example_test(agent_type), encoding="utf-8")
        created["example_test"] = str(test_path)

    return created
File without changes
@@ -0,0 +1,72 @@
1
+ """Configuration loading for agenteval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from pydantic import BaseModel
10
+
11
+
12
# Settings model for agenteval. Every field has a default, so AgentEvalConfig()
# with no arguments is valid.
class AgentEvalConfig(BaseModel):
    # Eval provider and model names.
    eval_provider: str = "ollama"
    eval_model: str = "llama3.2"
    # Optional provider connection settings; None when not configured.
    # NOTE(review): presumably consumed by the OpenAI / Bedrock providers — confirm.
    openai_base_url: str | None = None
    openai_api_key: str | None = None
    aws_profile: str | None = None
    aws_region: str | None = None
    # Default budgets (USD and milliseconds, per the field names).
    default_max_cost_usd: float = 1.0
    default_max_latency_ms: int = 30000
    # Interceptor names; None when not configured.
    interceptors: list[str] | None = None
    # Reporting and regression-baseline settings.
    report_format: str = "console"
    report_dir: str = "agenteval-reports"
    baseline_dir: str = "tests/baselines"
    regression_threshold: float = 0.05
26
+
27
+
28
+ def _read_pyproject_toml(project_dir: Path) -> dict[str, Any]:
29
+ pyproject_path = project_dir / "pyproject.toml"
30
+ if not pyproject_path.exists():
31
+ return {}
32
+ try:
33
+ import tomllib # type: ignore[import-not-found]
34
+ except ModuleNotFoundError:
35
+ import tomli as tomllib # type: ignore[import-not-found]
36
+
37
+ with open(pyproject_path, "rb") as f:
38
+ data: dict[str, Any] = tomllib.load(f)
39
+ tool_section = data.get("tool", {})
40
+ if not isinstance(tool_section, dict):
41
+ return {}
42
+ agenteval_section = tool_section.get("agenteval", {})
43
+ if not isinstance(agenteval_section, dict):
44
+ return {}
45
+ return agenteval_section
46
+
47
+
48
+ _LIST_FIELDS = {"interceptors"}
49
+
50
+
51
+ def _read_env_vars() -> dict[str, Any]:
52
+ prefix = "AGENTEVAL_"
53
+ result: dict[str, Any] = {}
54
+ for key, value in os.environ.items():
55
+ if key.startswith(prefix):
56
+ config_key = key[len(prefix) :].lower()
57
+ if config_key in _LIST_FIELDS:
58
+ result[config_key] = [v.strip() for v in value.split(",") if v.strip()]
59
+ else:
60
+ result[config_key] = value
61
+ return result
62
+
63
+
64
def load_config(project_dir: Path | None = None) -> AgentEvalConfig:
    """Build the config from ``pyproject.toml`` and environment variables.

    *project_dir* defaults to the current working directory. Environment
    values take precedence over values from the file.
    """
    root = Path.cwd() if project_dir is None else project_dir

    settings = dict(_read_pyproject_toml(root))
    # Applied second so environment variables win on conflicting keys.
    settings.update(_read_env_vars())
    return AgentEvalConfig(**settings)
@@ -0,0 +1,102 @@
1
+ """EvalModel — facade for LLM-as-judge and embedding operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from agenteval.core.models import EvalResult, Trace
8
+ from agenteval.providers.base import EvalProvider
9
+
10
+
11
+ def _extract_json_object(text: str) -> str | None:
12
+ start = text.find("{")
13
+ if start == -1:
14
+ return None
15
+ depth = 0
16
+ in_string = False
17
+ escape = False
18
+ for i in range(start, len(text)):
19
+ ch = text[i]
20
+ if escape:
21
+ escape = False
22
+ continue
23
+ if ch == "\\":
24
+ escape = True
25
+ continue
26
+ if ch == '"':
27
+ in_string = not in_string
28
+ continue
29
+ if in_string:
30
+ continue
31
+ if ch == "{":
32
+ depth += 1
33
+ elif ch == "}":
34
+ depth -= 1
35
+ if depth == 0:
36
+ return text[start : i + 1]
37
+ return None
38
+
39
+
40
class EvalModel:
    """Facade over an eval provider for LLM-as-judge scoring and embeddings."""

    def __init__(self, provider: EvalProvider) -> None:
        """Store *provider*, which supplies ``judge(prompt)`` and ``embed(text)``."""
        self._provider = provider

    def judge(
        self,
        trace: Trace,
        criteria: dict[str, str],
        threshold: float = 0.7,
    ) -> EvalResult:
        """Score *trace* against *criteria* using the provider as judge.

        Args:
            trace: Trace whose ``input`` and ``output`` are shown to the judge.
            criteria: Mapping of criterion name to its description.
            threshold: Minimum per-criterion score required to pass.

        Returns:
            An ``EvalResult`` whose score is the mean criterion score clamped
            to [0, 1], and which passes only if every criterion meets
            *threshold*. Per-criterion scores and the raw judge response are
            included in ``details``.
        """
        prompt = self._build_judge_prompt(trace, criteria)
        raw_response = self._provider.judge(prompt)
        scores = self._parse_scores(raw_response, list(criteria.keys()))

        # NOTE(review): with empty criteria this passes vacuously with score 0.0.
        all_pass = all(s >= threshold for s in scores.values())
        avg_score = sum(scores.values()) / len(scores) if scores else 0.0

        failed_criteria = [k for k, v in scores.items() if v < threshold]
        if all_pass:
            reason = "All criteria met threshold"
        else:
            reason = f"Criteria below threshold ({threshold}): {', '.join(failed_criteria)}"

        return EvalResult(
            evaluator="llm_judge",
            score=min(1.0, max(0.0, avg_score)),
            passed=all_pass,
            reason=reason,
            details={"scores": scores, "threshold": threshold, "raw_response": raw_response},
        )

    def embed(self, text: str) -> list[float]:
        """Return the provider's embedding of *text*."""
        return self._provider.embed(text)

    def _build_judge_prompt(self, trace: Trace, criteria: dict[str, str]) -> str:
        """Render the judge prompt: input, output, criteria list, JSON-only instruction."""
        criteria_text = "\n".join(
            f"- {name}: {description}" for name, description in criteria.items()
        )
        return (
            "You are an AI evaluation judge. Score the following agent response "
            "on each criterion from 0.0 to 1.0.\n\n"
            f"USER INPUT: {trace.input}\n\n"
            f"AGENT OUTPUT: {trace.output}\n\n"
            f"CRITERIA:\n{criteria_text}\n\n"
            "Respond ONLY with a JSON object mapping each criterion name to a "
            "float score between 0.0 and 1.0. Example: "
            '{"helpful": 0.9, "accurate": 0.8}\n'
            "JSON:"
        )

    def _parse_scores(self, raw: str, criteria_names: list[str]) -> dict[str, float]:
        """Extract one score per criterion from the judge's raw response.

        Each score is clamped to [0, 1]. A criterion that is missing or whose
        value is not numeric scores 0.0 on its own, without discarding the
        other criteria. If no JSON object can be parsed, every criterion
        scores 0.0.
        """
        try:
            json_str = _extract_json_object(raw)
            parsed = json.loads(json_str) if json_str else None
        except (json.JSONDecodeError, ValueError, TypeError):
            parsed = None
        if not isinstance(parsed, dict):
            return {name: 0.0 for name in criteria_names}

        result: dict[str, float] = {}
        for name in criteria_names:
            # Convert per criterion so one bad value (null, "n/a", an integer
            # too large for float) cannot zero the valid scores around it.
            try:
                value = float(parsed.get(name, 0.0))
            except (TypeError, ValueError, OverflowError):
                value = 0.0
            result[name] = min(1.0, max(0.0, value))
        return result