evaldeck-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,344 @@
+ """LLM-based graders (model-as-judge)."""
+
+ from __future__ import annotations
+
+ import os
+ import re
+ from typing import TYPE_CHECKING, Any
+
+ from evaldeck.graders.base import BaseGrader
+ from evaldeck.results import GradeResult, GradeStatus
+
+ if TYPE_CHECKING:
+     from evaldeck.test_case import EvalCase
+     from evaldeck.trace import Trace
+
+
+ class LLMGrader(BaseGrader):
+     """Use an LLM to grade agent output.
+
+     This grader sends the trace/output to an LLM with a grading prompt
+     and parses the response to determine pass/fail.
+
+     Supports OpenAI and Anthropic APIs (user provides their own API key).
+     """
+
+     name = "llm"
+
+     # Default grading prompt template
+     DEFAULT_PROMPT = """You are evaluating an AI agent's response.
+
+ User Input: {input}
+ Agent Output: {output}
+
+ Task: {task}
+
+ Evaluate whether the agent's response meets the requirements.
+ Respond with exactly one of: PASS or FAIL
+ Then provide a brief explanation.
+
+ Format:
+ VERDICT: PASS or FAIL
+ REASON: Your explanation
+ """
+
+     def __init__(
+         self,
+         prompt: str | None = None,
+         model: str = "gpt-4o-mini",
+         provider: str | None = None,
+         api_key: str | None = None,
+         threshold: float | None = None,
+         temperature: float = 0.0,
+         task: str | None = None,
+     ) -> None:
+         """Initialize LLM grader.
+
+         Args:
+             prompt: Custom grading prompt. Use {input}, {output}, {trace} placeholders.
+             model: Model to use (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
+             provider: API provider ("openai" or "anthropic"). Auto-detected from model.
+             api_key: API key. If None, uses environment variable.
+             threshold: Score threshold for pass (if using scored evaluation).
+             temperature: Model temperature.
+             task: Task description for the default prompt.
+         """
+         self.prompt_template = prompt or self.DEFAULT_PROMPT
+         self.model = model
+         self.provider = provider or self._detect_provider(model)
+         self.api_key = api_key
+         self.threshold = threshold
+         self.temperature = temperature
+         self.task = task or "Determine if the agent completed the task correctly."
+
+     def _detect_provider(self, model: str) -> str:
+         """Detect API provider from model name."""
+         if model.startswith("claude"):
+             return "anthropic"
+         return "openai"
+
+     def _get_api_key(self) -> str:
+         """Get API key from init or environment."""
+         if self.api_key:
+             return self.api_key
+
+         if self.provider == "anthropic":
+             key = os.environ.get("ANTHROPIC_API_KEY")
+             if key:
+                 return key
+             raise ValueError(
+                 "Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable."
+             )
+
+         # Default to OpenAI
+         key = os.environ.get("OPENAI_API_KEY")
+         if key:
+             return key
+         raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
+
+     def _format_prompt(self, trace: Trace, test_case: EvalCase) -> str:
+         """Format the grading prompt with trace data."""
+         # Build trace summary
+         trace_summary = self._build_trace_summary(trace)
+
+         return self.prompt_template.format(
+             input=trace.input,
+             output=trace.output or "(no output)",
+             trace=trace_summary,
+             task=self.task,
+             test_case_name=test_case.name,
+             expected=str(test_case.expected.model_dump(exclude_none=True)),
+         )
+
+     def _build_trace_summary(self, trace: Trace) -> str:
+         """Build a human-readable trace summary."""
+         lines = ["Execution Trace:"]
+         for i, step in enumerate(trace.steps, 1):
+             if step.type.value == "tool_call":
+                 lines.append(f" {i}. Tool: {step.tool_name}({step.tool_args})")
+                 if step.tool_result:
+                     result_str = str(step.tool_result)[:200]
+                     lines.append(f" Result: {result_str}")
+             elif step.type.value == "llm_call":
+                 output_preview = (step.output or "")[:100]
+                 lines.append(f" {i}. LLM: {output_preview}...")
+             elif step.type.value == "reasoning":
+                 reasoning_preview = (step.reasoning_text or "")[:100]
+                 lines.append(f" {i}. Reasoning: {reasoning_preview}...")
+         return "\n".join(lines)
+
+     def _call_openai(self, prompt: str) -> str:
+         """Call OpenAI API (sync)."""
+         try:
+             from openai import OpenAI
+         except ImportError:
+             raise ImportError(
+                 "OpenAI package not installed. Run: pip install evaldeck[openai]"
+             ) from None
+
+         client = OpenAI(api_key=self._get_api_key())
+         response = client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=self.temperature,
+         )
+         return response.choices[0].message.content or ""
+
+     async def _call_openai_async(self, prompt: str) -> str:
+         """Call OpenAI API (async)."""
+         try:
+             from openai import AsyncOpenAI
+         except ImportError:
+             raise ImportError(
+                 "OpenAI package not installed. Run: pip install evaldeck[openai]"
+             ) from None
+
+         client = AsyncOpenAI(api_key=self._get_api_key())
+         response = await client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=self.temperature,
+         )
+         return response.choices[0].message.content or ""
+
+     def _call_anthropic(self, prompt: str) -> str:
+         """Call Anthropic API (sync)."""
+         try:
+             from anthropic import Anthropic
+         except ImportError:
+             raise ImportError(
+                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
+             ) from None
+
+         client = Anthropic(api_key=self._get_api_key())
+         response = client.messages.create(
+             model=self.model,
+             max_tokens=1024,
+             messages=[{"role": "user", "content": prompt}],
+         )
+         return response.content[0].text
+
+     async def _call_anthropic_async(self, prompt: str) -> str:
+         """Call Anthropic API (async)."""
+         try:
+             from anthropic import AsyncAnthropic
+         except ImportError:
+             raise ImportError(
+                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
+             ) from None
+
+         client = AsyncAnthropic(api_key=self._get_api_key())
+         response = await client.messages.create(
+             model=self.model,
+             max_tokens=1024,
+             messages=[{"role": "user", "content": prompt}],
+         )
+         return response.content[0].text
+
+     def _parse_response(self, response: str) -> tuple[GradeStatus, str, float | None]:
+         """Parse LLM response to extract verdict.
+
+         Returns:
+             Tuple of (status, reason, score).
+         """
+         response_upper = response.upper()
+
+         # Look for explicit VERDICT: PASS/FAIL
+         verdict_match = re.search(r"VERDICT:\s*(PASS|FAIL)", response_upper)
+         if verdict_match:
+             status = GradeStatus.PASS if verdict_match.group(1) == "PASS" else GradeStatus.FAIL
+         elif "PASS" in response_upper and "FAIL" not in response_upper:
+             status = GradeStatus.PASS
+         elif "FAIL" in response_upper:
+             status = GradeStatus.FAIL
+         else:
+             # Couldn't determine, default to fail
+             status = GradeStatus.FAIL
+
+         # Extract reason
+         reason_match = re.search(r"REASON:\s*(.+)", response, re.IGNORECASE | re.DOTALL)
+         reason = reason_match.group(1).strip() if reason_match else response[:200]
+
+         # Extract score if present
+         score = None
+         score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", response)
+         if score_match:
+             score = float(score_match.group(1))
+             # Normalize to 0-1 if needed
+             if score > 1:
+                 score = score / 10 if score <= 10 else score / 100
+
+         return status, reason, score
+
+     def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
+         """Grade the trace using an LLM (sync)."""
+         try:
+             # Format prompt
+             prompt = self._format_prompt(trace, test_case)
+
+             # Call LLM
+             if self.provider == "anthropic":
+                 response = self._call_anthropic(prompt)
+             else:
+                 response = self._call_openai(prompt)
+
+             return self._build_result(response)
+
+         except Exception as e:
+             return GradeResult.error_result(self.name, f"LLM grader error: {e}")
+
+     async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
+         """Grade the trace using an LLM (async).
+
+         Uses async API clients for better performance in concurrent evaluation.
+         """
+         try:
+             # Format prompt
+             prompt = self._format_prompt(trace, test_case)
+
+             # Call LLM asynchronously
+             if self.provider == "anthropic":
+                 response = await self._call_anthropic_async(prompt)
+             else:
+                 response = await self._call_openai_async(prompt)
+
+             return self._build_result(response)
+
+         except Exception as e:
+             return GradeResult.error_result(self.name, f"LLM grader error: {e}")
+
+     def _build_result(self, response: str) -> GradeResult:
+         """Build GradeResult from LLM response."""
+         # Parse response
+         status, reason, score = self._parse_response(response)
+
+         # Apply threshold if score-based
+         if self.threshold is not None and score is not None:
+             status = GradeStatus.PASS if score >= self.threshold else GradeStatus.FAIL
+
+         return GradeResult(
+             grader_name=self.name,
+             status=status,
+             score=score,
+             message=reason,
+             details={
+                 "model": self.model,
+                 "raw_response": response,
+             },
+         )
+
+
+ class LLMRubricGrader(LLMGrader):
+     """LLM grader with a detailed scoring rubric."""
+
+     name = "llm_rubric"
+
+     RUBRIC_PROMPT = """You are evaluating an AI agent's response using a scoring rubric.
+
+ User Input: {input}
+ Agent Output: {output}
+
+ Scoring Rubric:
+ {rubric}
+
+ For each criterion, provide a score from 1-5 where:
+ 1 = Poor, 2 = Below Average, 3 = Average, 4 = Good, 5 = Excellent
+
+ Format your response as:
+ CRITERION: criterion_name
+ SCORE: X
+ REASON: explanation
+
+ After scoring all criteria, provide:
+ TOTAL_SCORE: X/Y
+ VERDICT: PASS or FAIL
+ """
+
+     def __init__(
+         self,
+         rubric: dict[str, str],
+         pass_threshold: float = 0.7,
+         **kwargs: Any,
+     ) -> None:
+         """Initialize rubric grader.
+
+         Args:
+             rubric: Dict mapping criterion names to descriptions.
+             pass_threshold: Minimum score ratio to pass (0-1).
+             **kwargs: Passed to LLMGrader.
+         """
+         self.rubric = rubric
+         self.pass_threshold = pass_threshold
+         super().__init__(**kwargs)
+         self.prompt_template = self.RUBRIC_PROMPT
+
+     def _format_prompt(self, trace: Trace, test_case: EvalCase) -> str:
+         """Format prompt with rubric."""
+         rubric_text = "\n".join(
+             f"- {name}: {description}" for name, description in self.rubric.items()
+         )
+         return self.prompt_template.format(
+             input=trace.input,
+             output=trace.output or "(no output)",
+             rubric=rubric_text,
+         )
@@ -0,0 +1,29 @@
+ """Framework integrations for Evaldeck.
+
+ This module provides the OpenTelemetry/OpenInference adapter for capturing traces
+ from any instrumented AI framework (LangChain, CrewAI, LiteLLM, OpenAI, Anthropic, etc.)
+
+ Usage:
+     from evaldeck.integrations import EvaldeckSpanProcessor, setup_otel_tracing
+     from openinference.instrumentation.langchain import LangChainInstrumentor
+
+     processor = setup_otel_tracing()
+     LangChainInstrumentor().instrument()
+
+     # Run your agent...
+
+     trace = processor.get_latest_trace()
+     result = evaluator.evaluate(trace, test_case)
+ """
+
+ from evaldeck.integrations.opentelemetry import (
+     EvaldeckSpanProcessor,
+ )
+ from evaldeck.integrations.opentelemetry import (
+     setup_tracing as setup_otel_tracing,
+ )
+
+ __all__ = [
+     "EvaldeckSpanProcessor",
+     "setup_otel_tracing",
+ ]