evaldeck-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaldeck/__init__.py +88 -0
- evaldeck/cli.py +324 -0
- evaldeck/config.py +223 -0
- evaldeck/evaluator.py +566 -0
- evaldeck/graders/__init__.py +36 -0
- evaldeck/graders/base.py +146 -0
- evaldeck/graders/code.py +484 -0
- evaldeck/graders/llm.py +344 -0
- evaldeck/integrations/__init__.py +29 -0
- evaldeck/integrations/opentelemetry.py +416 -0
- evaldeck/metrics/__init__.py +25 -0
- evaldeck/metrics/base.py +62 -0
- evaldeck/metrics/builtin.py +195 -0
- evaldeck/results.py +211 -0
- evaldeck/test_case.py +162 -0
- evaldeck/trace.py +215 -0
- evaldeck-0.1.0.dist-info/METADATA +363 -0
- evaldeck-0.1.0.dist-info/RECORD +21 -0
- evaldeck-0.1.0.dist-info/WHEEL +4 -0
- evaldeck-0.1.0.dist-info/entry_points.txt +2 -0
- evaldeck-0.1.0.dist-info/licenses/LICENSE +190 -0
evaldeck/graders/llm.py
ADDED
@@ -0,0 +1,344 @@
+"""LLM-based graders (model-as-judge)."""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import TYPE_CHECKING, Any
+
+from evaldeck.graders.base import BaseGrader
+from evaldeck.results import GradeResult, GradeStatus
+
+if TYPE_CHECKING:
+    from evaldeck.test_case import EvalCase
+    from evaldeck.trace import Trace
+
+
+class LLMGrader(BaseGrader):
+    """Use an LLM to grade agent output.
+
+    This grader sends the trace/output to an LLM with a grading prompt
+    and parses the response to determine pass/fail.
+
+    Supports OpenAI and Anthropic APIs (user provides their own API key).
+    """
+
+    name = "llm"
+
+    # Default grading prompt template
+    DEFAULT_PROMPT = """You are evaluating an AI agent's response.
+
+User Input: {input}
+Agent Output: {output}
+
+Task: {task}
+
+Evaluate whether the agent's response meets the requirements.
+Respond with exactly one of: PASS or FAIL
+Then provide a brief explanation.
+
+Format:
+VERDICT: PASS or FAIL
+REASON: Your explanation
+"""
+
+    def __init__(
+        self,
+        prompt: str | None = None,
+        model: str = "gpt-4o-mini",
+        provider: str | None = None,
+        api_key: str | None = None,
+        threshold: float | None = None,
+        temperature: float = 0.0,
+        task: str | None = None,
+    ) -> None:
+        """Initialize LLM grader.
+
+        Args:
+            prompt: Custom grading prompt. Use {input}, {output}, {trace} placeholders.
+            model: Model to use (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
+            provider: API provider ("openai" or "anthropic"). Auto-detected from model.
+            api_key: API key. If None, uses environment variable.
+            threshold: Score threshold for pass (if using scored evaluation).
+            temperature: Model temperature.
+            task: Task description for the default prompt.
+        """
+        self.prompt_template = prompt or self.DEFAULT_PROMPT
+        self.model = model
+        self.provider = provider or self._detect_provider(model)
+        self.api_key = api_key
+        self.threshold = threshold
+        self.temperature = temperature
+        self.task = task or "Determine if the agent completed the task correctly."
+
+    def _detect_provider(self, model: str) -> str:
+        """Detect API provider from model name."""
+        if model.startswith("claude"):
+            return "anthropic"
+        return "openai"
+
+    def _get_api_key(self) -> str:
+        """Get API key from init or environment."""
+        if self.api_key:
+            return self.api_key
+
+        if self.provider == "anthropic":
+            key = os.environ.get("ANTHROPIC_API_KEY")
+            if key:
+                return key
+            raise ValueError(
+                "Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable."
+            )
+
+        # Default to OpenAI
+        key = os.environ.get("OPENAI_API_KEY")
+        if key:
+            return key
+        raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
+
+    def _format_prompt(self, trace: Trace, test_case: EvalCase) -> str:
+        """Format the grading prompt with trace data."""
+        # Build trace summary
+        trace_summary = self._build_trace_summary(trace)
+
+        return self.prompt_template.format(
+            input=trace.input,
+            output=trace.output or "(no output)",
+            trace=trace_summary,
+            task=self.task,
+            test_case_name=test_case.name,
+            expected=str(test_case.expected.model_dump(exclude_none=True)),
+        )
+
+    def _build_trace_summary(self, trace: Trace) -> str:
+        """Build a human-readable trace summary."""
+        lines = ["Execution Trace:"]
+        for i, step in enumerate(trace.steps, 1):
+            if step.type.value == "tool_call":
+                lines.append(f"  {i}. Tool: {step.tool_name}({step.tool_args})")
+                if step.tool_result:
+                    result_str = str(step.tool_result)[:200]
+                    lines.append(f"     Result: {result_str}")
+            elif step.type.value == "llm_call":
+                output_preview = (step.output or "")[:100]
+                lines.append(f"  {i}. LLM: {output_preview}...")
+            elif step.type.value == "reasoning":
+                reasoning_preview = (step.reasoning_text or "")[:100]
+                lines.append(f"  {i}. Reasoning: {reasoning_preview}...")
+        return "\n".join(lines)
+
+    def _call_openai(self, prompt: str) -> str:
+        """Call OpenAI API (sync)."""
+        try:
+            from openai import OpenAI
+        except ImportError:
+            raise ImportError(
+                "OpenAI package not installed. Run: pip install evaldeck[openai]"
+            ) from None
+
+        client = OpenAI(api_key=self._get_api_key())
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+        )
+        return response.choices[0].message.content or ""
+
+    async def _call_openai_async(self, prompt: str) -> str:
+        """Call OpenAI API (async)."""
+        try:
+            from openai import AsyncOpenAI
+        except ImportError:
+            raise ImportError(
+                "OpenAI package not installed. Run: pip install evaldeck[openai]"
+            ) from None
+
+        client = AsyncOpenAI(api_key=self._get_api_key())
+        response = await client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+        )
+        return response.choices[0].message.content or ""
+
+    def _call_anthropic(self, prompt: str) -> str:
+        """Call Anthropic API (sync)."""
+        try:
+            from anthropic import Anthropic
+        except ImportError:
+            raise ImportError(
+                "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
+            ) from None
+
+        client = Anthropic(api_key=self._get_api_key())
+        response = client.messages.create(
+            model=self.model,
+            max_tokens=1024,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return response.content[0].text
+
+    async def _call_anthropic_async(self, prompt: str) -> str:
+        """Call Anthropic API (async)."""
+        try:
+            from anthropic import AsyncAnthropic
+        except ImportError:
+            raise ImportError(
+                "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
+            ) from None
+
+        client = AsyncAnthropic(api_key=self._get_api_key())
+        response = await client.messages.create(
+            model=self.model,
+            max_tokens=1024,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return response.content[0].text
+
+    def _parse_response(self, response: str) -> tuple[GradeStatus, str, float | None]:
+        """Parse LLM response to extract verdict.
+
+        Returns:
+            Tuple of (status, reason, score).
+        """
+        response_upper = response.upper()
+
+        # Look for explicit VERDICT: PASS/FAIL
+        verdict_match = re.search(r"VERDICT:\s*(PASS|FAIL)", response_upper)
+        if verdict_match:
+            status = GradeStatus.PASS if verdict_match.group(1) == "PASS" else GradeStatus.FAIL
+        elif "PASS" in response_upper and "FAIL" not in response_upper:
+            status = GradeStatus.PASS
+        elif "FAIL" in response_upper:
+            status = GradeStatus.FAIL
+        else:
+            # Couldn't determine, default to fail
+            status = GradeStatus.FAIL
+
+        # Extract reason
+        reason_match = re.search(r"REASON:\s*(.+)", response, re.IGNORECASE | re.DOTALL)
+        reason = reason_match.group(1).strip() if reason_match else response[:200]
+
+        # Extract score if present
+        score = None
+        score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", response)
+        if score_match:
+            score = float(score_match.group(1))
+            # Normalize to 0-1 if needed
+            if score > 1:
+                score = score / 10 if score <= 10 else score / 100
+
+        return status, reason, score
+
+    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
+        """Grade the trace using an LLM (sync)."""
+        try:
+            # Format prompt
+            prompt = self._format_prompt(trace, test_case)
+
+            # Call LLM
+            if self.provider == "anthropic":
+                response = self._call_anthropic(prompt)
+            else:
+                response = self._call_openai(prompt)
+
+            return self._build_result(response)
+
+        except Exception as e:
+            return GradeResult.error_result(self.name, f"LLM grader error: {e}")
+
+    async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
+        """Grade the trace using an LLM (async).
+
+        Uses async API clients for better performance in concurrent evaluation.
+        """
+        try:
+            # Format prompt
+            prompt = self._format_prompt(trace, test_case)
+
+            # Call LLM asynchronously
+            if self.provider == "anthropic":
+                response = await self._call_anthropic_async(prompt)
+            else:
+                response = await self._call_openai_async(prompt)
+
+            return self._build_result(response)
+
+        except Exception as e:
+            return GradeResult.error_result(self.name, f"LLM grader error: {e}")
+
+    def _build_result(self, response: str) -> GradeResult:
+        """Build GradeResult from LLM response."""
+        # Parse response
+        status, reason, score = self._parse_response(response)
+
+        # Apply threshold if score-based
+        if self.threshold is not None and score is not None:
+            status = GradeStatus.PASS if score >= self.threshold else GradeStatus.FAIL
+
+        return GradeResult(
+            grader_name=self.name,
+            status=status,
+            score=score,
+            message=reason,
+            details={
+                "model": self.model,
+                "raw_response": response,
+            },
+        )
+
+
+class LLMRubricGrader(LLMGrader):
+    """LLM grader with a detailed scoring rubric."""
+
+    name = "llm_rubric"
+
+    RUBRIC_PROMPT = """You are evaluating an AI agent's response using a scoring rubric.
+
+User Input: {input}
+Agent Output: {output}
+
+Scoring Rubric:
+{rubric}
+
+For each criterion, provide a score from 1-5 where:
+1 = Poor, 2 = Below Average, 3 = Average, 4 = Good, 5 = Excellent
+
+Format your response as:
+CRITERION: criterion_name
+SCORE: X
+REASON: explanation
+
+After scoring all criteria, provide:
+TOTAL_SCORE: X/Y
+VERDICT: PASS or FAIL
+"""
+
+    def __init__(
+        self,
+        rubric: dict[str, str],
+        pass_threshold: float = 0.7,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize rubric grader.
+
+        Args:
+            rubric: Dict mapping criterion names to descriptions.
+            pass_threshold: Minimum score ratio to pass (0-1).
+            **kwargs: Passed to LLMGrader.
+        """
+        self.rubric = rubric
+        self.pass_threshold = pass_threshold
+        super().__init__(**kwargs)
+        self.prompt_template = self.RUBRIC_PROMPT
+
+    def _format_prompt(self, trace: Trace, test_case: EvalCase) -> str:
+        """Format prompt with rubric."""
+        rubric_text = "\n".join(
+            f"- {name}: {description}" for name, description in self.rubric.items()
+        )
+        return self.prompt_template.format(
+            input=trace.input,
+            output=trace.output or "(no output)",
+            rubric=rubric_text,
+        )
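As a reading aid (not part of the published diff), here is a minimal usage sketch of the two graders above. It relies only on the signatures shown in this file; the task text and rubric entries are invented, and the trace and test_case arguments are assumed to be evaldeck Trace and EvalCase objects, whose definitions live in evaldeck/trace.py and evaldeck/test_case.py (not shown in this section).

from evaldeck.graders.llm import LLMGrader, LLMRubricGrader

# Pass/fail judge. The default model "gpt-4o-mini" auto-detects the "openai"
# provider, so OPENAI_API_KEY must be set in the environment.
judge = LLMGrader(task="Answer the user's billing question politely and accurately.")

# Rubric judge. A "claude-..." model name auto-detects the "anthropic" provider
# and reads ANTHROPIC_API_KEY instead. Criterion names and descriptions are made up.
rubric_judge = LLMRubricGrader(
    rubric={
        "accuracy": "The answer is factually correct.",
        "tone": "The answer is polite and professional.",
    },
    pass_threshold=0.7,
    model="claude-3-haiku-20240307",
)

def grade_one(trace, test_case):
    # trace: evaldeck.trace.Trace, test_case: evaldeck.test_case.EvalCase,
    # e.g. captured via the OpenTelemetry integration shown in the next file.
    result = judge.grade(trace, test_case)        # returns a GradeResult
    print(result.status, result.score, result.message)
    print(result.details["raw_response"])         # the judge model's VERDICT/REASON text

Note that LLMRubricGrader inherits grade() and grade_async() unchanged; only the prompt template and its formatting differ.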
evaldeck/integrations/__init__.py
ADDED
@@ -0,0 +1,29 @@
+"""Framework integrations for Evaldeck.
+
+This module provides the OpenTelemetry/OpenInference adapter for capturing traces
+from any instrumented AI framework (LangChain, CrewAI, LiteLLM, OpenAI, Anthropic, etc.)
+
+Usage:
+    from evaldeck.integrations import EvaldeckSpanProcessor, setup_otel_tracing
+    from openinference.instrumentation.langchain import LangChainInstrumentor
+
+    processor = setup_otel_tracing()
+    LangChainInstrumentor().instrument()
+
+    # Run your agent...
+
+    trace = processor.get_latest_trace()
+    result = evaluator.evaluate(trace, test_case)
+"""
+
+from evaldeck.integrations.opentelemetry import (
+    EvaldeckSpanProcessor,
+)
+from evaldeck.integrations.opentelemetry import (
+    setup_tracing as setup_otel_tracing,
+)
+
+__all__ = [
+    "EvaldeckSpanProcessor",
+    "setup_otel_tracing",
+]