evaldeck 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaldeck/__init__.py +88 -0
- evaldeck/cli.py +324 -0
- evaldeck/config.py +223 -0
- evaldeck/evaluator.py +566 -0
- evaldeck/graders/__init__.py +36 -0
- evaldeck/graders/base.py +146 -0
- evaldeck/graders/code.py +484 -0
- evaldeck/graders/llm.py +344 -0
- evaldeck/integrations/__init__.py +29 -0
- evaldeck/integrations/opentelemetry.py +416 -0
- evaldeck/metrics/__init__.py +25 -0
- evaldeck/metrics/base.py +62 -0
- evaldeck/metrics/builtin.py +195 -0
- evaldeck/results.py +211 -0
- evaldeck/test_case.py +162 -0
- evaldeck/trace.py +215 -0
- evaldeck-0.1.0.dist-info/METADATA +363 -0
- evaldeck-0.1.0.dist-info/RECORD +21 -0
- evaldeck-0.1.0.dist-info/WHEEL +4 -0
- evaldeck-0.1.0.dist-info/entry_points.txt +2 -0
- evaldeck-0.1.0.dist-info/licenses/LICENSE +190 -0
evaldeck/integrations/opentelemetry.py
ADDED
@@ -0,0 +1,416 @@
"""OpenTelemetry/OpenInference integration for Evaldeck.

This module provides an OpenTelemetry SpanProcessor that captures traces from
any OpenInference-instrumented framework (LangChain, CrewAI, LiteLLM, OpenAI SDK,
Anthropic SDK, etc.) and converts them to Evaldeck's Trace format.

Installation:
    pip install opentelemetry-sdk openinference-semantic-conventions

    # Then install instrumentors for your framework(s):
    pip install openinference-instrumentation-langchain
    pip install openinference-instrumentation-crewai
    pip install openinference-instrumentation-litellm
    pip install openinference-instrumentation-openai

Usage:
    from evaldeck.integrations import EvaldeckSpanProcessor, setup_otel_tracing
    from openinference.instrumentation.langchain import LangChainInstrumentor

    # Setup tracing
    processor = setup_otel_tracing()
    LangChainInstrumentor().instrument()

    # Run your agent
    result = agent.invoke({"input": "Book a flight to NYC"})

    # Get the evaldeck trace and evaluate
    trace = processor.get_latest_trace()
    result = evaluator.evaluate(trace, test_case)
"""

from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from evaldeck.trace import Step, StepStatus, StepType, TokenUsage, Trace, TraceStatus

# OpenTelemetry imports
try:
    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
    from opentelemetry.trace import StatusCode

    OTEL_AVAILABLE = True
except ImportError:
    OTEL_AVAILABLE = False

if TYPE_CHECKING:
    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
    from opentelemetry.trace import StatusCode
else:
    SpanProcessor = object
    ReadableSpan = object


# OpenInference span kinds
SPAN_KIND_LLM = "LLM"
SPAN_KIND_TOOL = "TOOL"
SPAN_KIND_CHAIN = "CHAIN"
SPAN_KIND_EMBEDDING = "EMBEDDING"
SPAN_KIND_RETRIEVER = "RETRIEVER"
SPAN_KIND_RERANKER = "RERANKER"
SPAN_KIND_GUARDRAIL = "GUARDRAIL"
SPAN_KIND_AGENT = "AGENT"


class EvaldeckSpanProcessor(SpanProcessor):
    """OpenTelemetry SpanProcessor that builds Evaldeck Traces from OpenInference spans.

    This processor intercepts OpenTelemetry spans as they complete and converts them
    to Evaldeck's Trace/Step format. It supports all OpenInference span kinds:
    LLM, TOOL, CHAIN, EMBEDDING, RETRIEVER, RERANKER, GUARDRAIL, AGENT.

    Example:
        from evaldeck.integrations import EvaldeckSpanProcessor
        from opentelemetry import trace
        from opentelemetry.sdk.trace import TracerProvider

        processor = EvaldeckSpanProcessor()
        provider = TracerProvider()
        provider.add_span_processor(processor)
        trace.set_tracer_provider(provider)

        # After running instrumented code:
        evaldeck_trace = processor.get_latest_trace()
    """

    def __init__(self) -> None:
        if not OTEL_AVAILABLE:
            raise ImportError(
                "OpenTelemetry is not installed. Install with: "
                "pip install opentelemetry-sdk openinference-semantic-conventions"
            )

        self._traces: dict[str, Trace] = {}
        self._trace_order: list[str] = []  # Track order for get_latest_trace

    def on_start(self, span: ReadableSpan, parent_context: Any = None) -> None:
        """Called when a span starts. We don't need to do anything here."""
        pass

    def on_end(self, span: ReadableSpan) -> None:
        """Called when a span ends. Convert to Evaldeck format."""
        attrs = dict(span.attributes or {})
        span_kind = str(attrs.get("openinference.span.kind", "")).upper()

        # Skip spans without OpenInference kind
        if not span_kind:
            return

        trace_id = format(span.context.trace_id, "032x")

        # Ensure trace exists
        if trace_id not in self._traces:
            self._traces[trace_id] = Trace(
                id=trace_id,
                input="",
                framework="openinference",
            )
            self._trace_order.append(trace_id)

        trace = self._traces[trace_id]

        # CHAIN/AGENT spans with no parent become the root trace
        if span_kind in (SPAN_KIND_CHAIN, SPAN_KIND_AGENT) and span.parent is None:
            self._update_trace_from_root_span(trace, span, attrs)
            return

        # Convert other spans to Steps
        step = self._span_to_step(span, span_kind, attrs)
        if step:
            trace.add_step(step)

    def _update_trace_from_root_span(
        self, trace: Trace, span: ReadableSpan, attrs: dict[str, Any]
    ) -> None:
        """Update trace metadata from the root CHAIN/AGENT span."""
        trace.input = str(attrs.get("input.value", trace.input or ""))
        trace.output = attrs.get("output.value")
        trace.status = self._map_trace_status(span)
        trace.started_at = self._ns_to_datetime(span.start_time)
        trace.completed_at = self._ns_to_datetime(span.end_time)
        trace.duration_ms = (span.end_time - span.start_time) / 1_000_000

        # Extract agent/framework info
        if "llm.system" in attrs:
            trace.framework = str(attrs["llm.system"])

        trace.metadata["otel_trace_id"] = format(span.context.trace_id, "032x")
        trace.metadata["otel_root_span_id"] = format(span.context.span_id, "016x")

    def _span_to_step(
        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
    ) -> Step | None:
        """Convert an OpenTelemetry span to an Evaldeck Step."""

        if kind == SPAN_KIND_LLM:
            return self._convert_llm_span(span, attrs)

        elif kind == SPAN_KIND_TOOL:
            return self._convert_tool_span(span, attrs)

        elif kind in (SPAN_KIND_EMBEDDING, SPAN_KIND_RETRIEVER, SPAN_KIND_RERANKER):
            return self._convert_retrieval_span(span, kind, attrs)

        elif kind == SPAN_KIND_GUARDRAIL:
            return self._convert_guardrail_span(span, attrs)

        elif kind in (SPAN_KIND_CHAIN, SPAN_KIND_AGENT):
            # Nested chains/agents become reasoning steps
            return self._convert_chain_span(span, attrs)

        return None

    def _convert_llm_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
        """Convert an LLM span to a Step."""
        return Step(
            type=StepType.LLM_CALL,
            model=attrs.get("llm.model_name") or attrs.get("gen_ai.request.model"),
            input=self._extract_messages(attrs, "input"),
            output=self._extract_messages(attrs, "output"),
            tokens=TokenUsage(
                prompt_tokens=int(attrs.get("llm.token_count.prompt", 0)),
                completion_tokens=int(attrs.get("llm.token_count.completion", 0)),
                total_tokens=int(attrs.get("llm.token_count.total", 0)),
            ),
            status=self._map_step_status(span),
            duration_ms=self._calc_duration_ms(span),
            error=self._extract_error(span),
            metadata={
                "otel_span_id": format(span.context.span_id, "016x"),
                "llm_provider": attrs.get("llm.provider") or attrs.get("llm.system"),
            },
        )

    def _convert_tool_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
        """Convert a TOOL span to a Step."""
        tool_name = (
            attrs.get("tool.name")
            or attrs.get("tool_call.function.name")
            or "unknown_tool"
        )

        tool_args = self._parse_json(
            attrs.get("tool.parameters")
            or attrs.get("tool_call.function.arguments")
            or attrs.get("input.value")
        )

        return Step(
            type=StepType.TOOL_CALL,
            tool_name=str(tool_name),
            tool_args=tool_args if isinstance(tool_args, dict) else {"input": tool_args},
            tool_result=attrs.get("output.value"),
            status=self._map_step_status(span),
            duration_ms=self._calc_duration_ms(span),
            error=self._extract_error(span),
            metadata={
                "otel_span_id": format(span.context.span_id, "016x"),
                "tool_id": attrs.get("tool.id") or attrs.get("tool_call.id"),
            },
        )

    def _convert_retrieval_span(
        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
    ) -> Step:
        """Convert EMBEDDING/RETRIEVER/RERANKER spans to tool call Steps."""
        return Step(
            type=StepType.TOOL_CALL,
            tool_name=kind.lower(),  # "embedding", "retriever", "reranker"
            tool_args={"input": attrs.get("input.value")},
            tool_result=attrs.get("output.value"),
            status=self._map_step_status(span),
            duration_ms=self._calc_duration_ms(span),
            error=self._extract_error(span),
            metadata={
                "otel_span_id": format(span.context.span_id, "016x"),
                "span_kind": kind,
            },
        )

    def _convert_guardrail_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
        """Convert GUARDRAIL spans to reasoning Steps."""
        return Step(
            type=StepType.REASONING,
            reasoning_text=f"Guardrail check: {attrs.get('output.value', 'passed')}",
            status=self._map_step_status(span),
            duration_ms=self._calc_duration_ms(span),
            error=self._extract_error(span),
            metadata={
                "otel_span_id": format(span.context.span_id, "016x"),
                "guardrail_input": attrs.get("input.value"),
            },
        )

    def _convert_chain_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
        """Convert nested CHAIN/AGENT spans to reasoning Steps."""
        return Step(
            type=StepType.REASONING,
            reasoning_text=f"Chain: {span.name} - {attrs.get('output.value', '')}",
            status=self._map_step_status(span),
            duration_ms=self._calc_duration_ms(span),
            metadata={
                "otel_span_id": format(span.context.span_id, "016x"),
                "chain_input": attrs.get("input.value"),
            },
        )

    def _extract_messages(self, attrs: dict[str, Any], direction: str) -> str:
        """Extract message content from OpenInference indexed attributes.

        OpenInference uses indexed prefixes like:
            llm.input_messages.0.message.content
            llm.input_messages.1.message.content
        """
        messages = []
        i = 0
        while True:
            content_key = f"llm.{direction}_messages.{i}.message.content"
            if content_key in attrs:
                content = attrs[content_key]
                role_key = f"llm.{direction}_messages.{i}.message.role"
                role = attrs.get(role_key, "")
                if role:
                    messages.append(f"[{role}]: {content}")
                else:
                    messages.append(str(content))
                i += 1
            else:
                break

        if messages:
            return "\n".join(messages)

        # Fallback to simple input/output value
        return str(attrs.get(f"{direction}.value", ""))

    def _parse_json(self, value: Any) -> Any:
        """Parse JSON string if possible, return as-is otherwise."""
        if value is None:
            return {}
        if isinstance(value, str):
            try:
                return json.loads(value)
            except (json.JSONDecodeError, TypeError):
                return value
        return value

    def _map_trace_status(self, span: ReadableSpan) -> TraceStatus:
        """Map OTel span status to Evaldeck TraceStatus."""
        if span.status.status_code == StatusCode.ERROR:
            return TraceStatus.ERROR
        return TraceStatus.SUCCESS

    def _map_step_status(self, span: ReadableSpan) -> StepStatus:
        """Map OTel span status to Evaldeck StepStatus."""
        if span.status.status_code == StatusCode.ERROR:
            return StepStatus.FAILURE
        return StepStatus.SUCCESS

    def _extract_error(self, span: ReadableSpan) -> str | None:
        """Extract error message from span if present."""
        if span.status.status_code == StatusCode.ERROR:
            return span.status.description
        return None

    def _calc_duration_ms(self, span: ReadableSpan) -> float:
        """Calculate span duration in milliseconds."""
        return (span.end_time - span.start_time) / 1_000_000

    def _ns_to_datetime(self, ns: int) -> datetime:
        """Convert nanoseconds timestamp to datetime."""
        return datetime.fromtimestamp(ns / 1_000_000_000, tz=timezone.utc)

    # -------------------------------------------------------------------------
    # Public API
    # -------------------------------------------------------------------------

    def get_trace(self, trace_id: str) -> Trace | None:
        """Get a trace by its ID.

        Args:
            trace_id: The OpenTelemetry trace ID (32 hex chars)

        Returns:
            The Evaldeck Trace, or None if not found
        """
        return self._traces.get(trace_id)

    def get_latest_trace(self) -> Trace | None:
        """Get the most recently completed trace.

        Returns:
            The most recent Evaldeck Trace, or None if no traces captured
        """
        if self._trace_order:
            return self._traces.get(self._trace_order[-1])
        return None

    def get_all_traces(self) -> list[Trace]:
        """Get all captured traces in order.

        Returns:
            List of all Evaldeck Traces
        """
        return [self._traces[tid] for tid in self._trace_order if tid in self._traces]

    def reset(self) -> None:
        """Clear all captured traces."""
        self._traces.clear()
        self._trace_order.clear()

    def shutdown(self) -> None:
        """Shutdown the processor (required by SpanProcessor interface)."""
        pass

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush (required by SpanProcessor interface)."""
        return True


def setup_tracing(processor: EvaldeckSpanProcessor | None = None) -> EvaldeckSpanProcessor:
    """Setup OpenTelemetry tracing with the Evaldeck processor.

    This is a convenience function that sets up the tracer provider
    with the Evaldeck processor.

    Args:
        processor: Optional existing processor. If None, creates a new one.

    Returns:
        The EvaldeckSpanProcessor (for later trace retrieval)

    Example:
        from evaldeck.integrations import setup_otel_tracing
        from openinference.instrumentation.langchain import LangChainInstrumentor

        processor = setup_otel_tracing()
        LangChainInstrumentor().instrument()

        # Run agent...

        trace = processor.get_latest_trace()
    """
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider

    if processor is None:
        processor = EvaldeckSpanProcessor()

    provider = TracerProvider()
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return processor
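Taken together, the docstrings above imply the following capture flow. The sketch below is illustrative and not part of the package: it emits OpenInference-shaped spans by hand instead of going through an instrumentor, imports setup_tracing from this module directly (the docstrings refer to a setup_otel_tracing helper, presumably re-exported by evaldeck/integrations/__init__.py, which is not shown here), and assumes opentelemetry-sdk is installed.

# Illustrative sketch: hand-built OpenInference-style spans standing in for an
# instrumented framework, so the processor's mapping can be seen end to end.
from opentelemetry import trace

from evaldeck.integrations.opentelemetry import setup_tracing

processor = setup_tracing()        # installs EvaldeckSpanProcessor on a new TracerProvider
tracer = trace.get_tracer("demo")

# The parentless CHAIN span populates the Trace; the nested LLM span becomes an LLM_CALL Step.
with tracer.start_as_current_span(
    "agent",
    attributes={
        "openinference.span.kind": "CHAIN",
        "input.value": "Book a flight to NYC",
        "output.value": "Booked a flight (demo output)",
    },
):
    with tracer.start_as_current_span(
        "llm",
        attributes={
            "openinference.span.kind": "LLM",
            "llm.model_name": "gpt-4o-mini",
            "llm.token_count.prompt": 42,
            "llm.token_count.completion": 7,
            "llm.token_count.total": 49,
        },
    ):
        pass

evaldeck_trace = processor.get_latest_trace()
print(evaldeck_trace.input)        # "Book a flight to NYC"
print(len(evaldeck_trace.steps))   # 1 (the LLM step)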
evaldeck/metrics/__init__.py
ADDED
@@ -0,0 +1,25 @@
"""Metrics for measuring agent performance."""

from evaldeck.metrics.base import BaseMetric
from evaldeck.metrics.builtin import (
    DurationMetric,
    ErrorRateMetric,
    LLMCallCountMetric,
    StepCountMetric,
    StepEfficiencyMetric,
    TokenUsageMetric,
    ToolCallCountMetric,
    ToolDiversityMetric,
)

__all__ = [
    "BaseMetric",
    "StepCountMetric",
    "TokenUsageMetric",
    "ToolCallCountMetric",
    "DurationMetric",
    "ToolDiversityMetric",
    "StepEfficiencyMetric",
    "LLMCallCountMetric",
    "ErrorRateMetric",
]
evaldeck/metrics/base.py
ADDED
@@ -0,0 +1,62 @@
"""Base metric class."""

from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from evaldeck.results import MetricResult

if TYPE_CHECKING:
    from evaldeck.test_case import EvalCase
    from evaldeck.trace import Trace


class BaseMetric(ABC):
    """Base class for all metrics.

    Metrics calculate quantitative measurements from traces.
    Unlike graders, metrics don't pass/fail - they just measure.

    Supports both sync and async calculation. Override calculate_async()
    for metrics that need to make async I/O calls (e.g., fetching external
    benchmark data).
    """

    name: str = "base"
    unit: str | None = None

    @abstractmethod
    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        """Calculate the metric value (sync).

        Args:
            trace: The execution trace to measure.
            test_case: Optional test case for context.

        Returns:
            MetricResult with the calculated value.
        """
        pass

    async def calculate_async(
        self, trace: Trace, test_case: EvalCase | None = None
    ) -> MetricResult:
        """Calculate the metric value (async).

        Default implementation runs sync calculate() in a thread pool.
        Override this method for true async behavior (e.g., async API calls
        for external benchmarking services).

        Args:
            trace: The execution trace to measure.
            test_case: Optional test case for context.

        Returns:
            MetricResult with the calculated value.
        """
        return await asyncio.to_thread(self.calculate, trace, test_case)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
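For orientation, here is a sketch of how a user-defined metric would plug into this base class. The MetricResult keyword arguments mirror the built-in metrics in evaldeck/metrics/builtin.py below; the metric itself (ReasoningShareMetric) is hypothetical, not part of the package.

from __future__ import annotations

from evaldeck.metrics.base import BaseMetric
from evaldeck.results import MetricResult
from evaldeck.trace import StepType, Trace


class ReasoningShareMetric(BaseMetric):
    """Hypothetical custom metric: fraction of steps that are reasoning steps."""

    name = "reasoning_share"
    unit = "ratio"

    def calculate(self, trace: Trace, test_case=None) -> MetricResult:
        # Empty traces report 0.0 rather than dividing by zero.
        if not trace.steps:
            return MetricResult(metric_name=self.name, value=0.0, unit=self.unit)
        reasoning = sum(1 for s in trace.steps if s.type == StepType.REASONING)
        return MetricResult(
            metric_name=self.name,
            value=reasoning / len(trace.steps),
            unit=self.unit,
            details={"reasoning_steps": reasoning, "total_steps": len(trace.steps)},
        )

Only calculate() needs to be implemented; the inherited calculate_async() wraps it in asyncio.to_thread as shown above.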
evaldeck/metrics/builtin.py
ADDED
@@ -0,0 +1,195 @@
"""Built-in metrics for agent evaluation."""

from __future__ import annotations

from typing import TYPE_CHECKING

from evaldeck.metrics.base import BaseMetric
from evaldeck.results import MetricResult

if TYPE_CHECKING:
    from evaldeck.test_case import EvalCase
    from evaldeck.trace import Trace


class StepCountMetric(BaseMetric):
    """Count total number of steps in the trace."""

    name = "step_count"
    unit = "steps"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        return MetricResult(
            metric_name=self.name,
            value=float(trace.step_count),
            unit=self.unit,
        )


class TokenUsageMetric(BaseMetric):
    """Total token usage across all LLM calls."""

    name = "token_usage"
    unit = "tokens"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        return MetricResult(
            metric_name=self.name,
            value=float(trace.total_tokens),
            unit=self.unit,
            details={
                "llm_calls": len(trace.llm_calls),
            },
        )


class ToolCallCountMetric(BaseMetric):
    """Count number of tool calls."""

    name = "tool_call_count"
    unit = "calls"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        return MetricResult(
            metric_name=self.name,
            value=float(len(trace.tool_calls)),
            unit=self.unit,
            details={
                "tools": trace.tools_called,
            },
        )


class DurationMetric(BaseMetric):
    """Total execution duration."""

    name = "duration"
    unit = "ms"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        duration = trace.duration_ms or 0.0
        return MetricResult(
            metric_name=self.name,
            value=duration,
            unit=self.unit,
        )


class ToolDiversityMetric(BaseMetric):
    """Measure diversity of tools used (unique tools / total calls)."""

    name = "tool_diversity"
    unit = "ratio"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        tool_calls = trace.tool_calls
        if not tool_calls:
            return MetricResult(
                metric_name=self.name,
                value=0.0,
                unit=self.unit,
            )

        unique_tools = len(set(trace.tools_called))
        total_calls = len(tool_calls)
        diversity = unique_tools / total_calls

        return MetricResult(
            metric_name=self.name,
            value=diversity,
            unit=self.unit,
            details={
                "unique_tools": unique_tools,
                "total_calls": total_calls,
            },
        )


class StepEfficiencyMetric(BaseMetric):
    """Measure step efficiency compared to expected max steps.

    Returns 1.0 if within expected steps, <1.0 if exceeded.
    """

    name = "step_efficiency"
    unit = "ratio"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        actual_steps = trace.step_count

        # Get expected max from test case
        max_steps = None
        if test_case and test_case.expected.max_steps:
            max_steps = test_case.expected.max_steps

        if max_steps is None:
            # No baseline to compare against; treat the run as fully efficient
            return MetricResult(
                metric_name=self.name,
                value=1.0,
                unit=self.unit,
                details={
                    "actual_steps": actual_steps,
                    "max_steps": None,
                },
            )

        # Calculate efficiency (1.0 = at or under budget, <1.0 = over budget)
        if actual_steps <= max_steps:
            efficiency = 1.0
        else:
            efficiency = max_steps / actual_steps

        return MetricResult(
            metric_name=self.name,
            value=efficiency,
            unit=self.unit,
            details={
                "actual_steps": actual_steps,
                "max_steps": max_steps,
            },
        )


class LLMCallCountMetric(BaseMetric):
    """Count number of LLM calls."""

    name = "llm_call_count"
    unit = "calls"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        return MetricResult(
            metric_name=self.name,
            value=float(len(trace.llm_calls)),
            unit=self.unit,
        )


class ErrorRateMetric(BaseMetric):
    """Calculate error rate across steps."""

    name = "error_rate"
    unit = "ratio"

    def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
        from evaldeck.trace import StepStatus

        if not trace.steps:
            return MetricResult(
                metric_name=self.name,
                value=0.0,
                unit=self.unit,
            )

        error_count = sum(1 for s in trace.steps if s.status == StepStatus.FAILURE)
        error_rate = error_count / len(trace.steps)

        return MetricResult(
            metric_name=self.name,
            value=error_rate,
            unit=self.unit,
            details={
                "error_count": error_count,
                "total_steps": len(trace.steps),
            },
        )
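As a usage sketch (not shown in the diff itself), these built-ins can be run directly against a captured Trace. It assumes MetricResult exposes metric_name, value, and unit as attributes, matching the constructor arguments used above.

from evaldeck.metrics import (
    DurationMetric,
    ErrorRateMetric,
    StepCountMetric,
    TokenUsageMetric,
    ToolCallCountMetric,
)

# `trace` is an evaldeck Trace, e.g. processor.get_latest_trace() from the
# OpenTelemetry integration above.
for metric in (
    StepCountMetric(),
    TokenUsageMetric(),
    ToolCallCountMetric(),
    DurationMetric(),
    ErrorRateMetric(),
):
    result = metric.calculate(trace)
    print(f"{result.metric_name}: {result.value} {result.unit or ''}")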