evaldeck 0.1.0__py3-none-any.whl

@@ -0,0 +1,416 @@
+ """OpenTelemetry/OpenInference integration for Evaldeck.
+
+ This module provides an OpenTelemetry SpanProcessor that captures traces from
+ any OpenInference-instrumented framework (LangChain, CrewAI, LiteLLM, OpenAI SDK,
+ Anthropic SDK, etc.) and converts them to Evaldeck's Trace format.
+
+ Installation:
+     pip install opentelemetry-sdk openinference-semantic-conventions
+
+     # Then install instrumentors for your framework(s):
+     pip install openinference-instrumentation-langchain
+     pip install openinference-instrumentation-crewai
+     pip install openinference-instrumentation-litellm
+     pip install openinference-instrumentation-openai
+
+ Usage:
+     from evaldeck.integrations import EvaldeckSpanProcessor, setup_tracing
+     from openinference.instrumentation.langchain import LangChainInstrumentor
+
+     # Set up tracing
+     processor = setup_tracing()
+     LangChainInstrumentor().instrument()
+
+     # Run your agent
+     response = agent.invoke({"input": "Book a flight to NYC"})
+
+     # Get the evaldeck trace and evaluate
+     trace = processor.get_latest_trace()
+     result = evaluator.evaluate(trace, test_case)
+ """
+
+ from __future__ import annotations
+
+ import json
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any
+
+ from evaldeck.trace import Step, StepStatus, StepType, TokenUsage, Trace, TraceStatus
+
+ # OpenTelemetry imports
+ try:
+     from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+     from opentelemetry.trace import StatusCode
+
+     OTEL_AVAILABLE = True
+ except ImportError:
+     OTEL_AVAILABLE = False
+
+     # Runtime fallbacks so the class definition below still succeeds when
+     # OpenTelemetry is absent; instantiating the processor raises ImportError.
+     SpanProcessor = object
+     ReadableSpan = object
+
+ if TYPE_CHECKING:
+     from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+     from opentelemetry.trace import StatusCode
+
+
+ # OpenInference span kinds
+ SPAN_KIND_LLM = "LLM"
+ SPAN_KIND_TOOL = "TOOL"
+ SPAN_KIND_CHAIN = "CHAIN"
+ SPAN_KIND_EMBEDDING = "EMBEDDING"
+ SPAN_KIND_RETRIEVER = "RETRIEVER"
+ SPAN_KIND_RERANKER = "RERANKER"
+ SPAN_KIND_GUARDRAIL = "GUARDRAIL"
+ SPAN_KIND_AGENT = "AGENT"
+
+
+ class EvaldeckSpanProcessor(SpanProcessor):
+     """OpenTelemetry SpanProcessor that builds Evaldeck Traces from OpenInference spans.
+
+     This processor intercepts OpenTelemetry spans as they complete and converts them
+     to Evaldeck's Trace/Step format. It supports all OpenInference span kinds:
+     LLM, TOOL, CHAIN, EMBEDDING, RETRIEVER, RERANKER, GUARDRAIL, AGENT.
+
+     Example:
+         from evaldeck.integrations import EvaldeckSpanProcessor
+         from opentelemetry import trace
+         from opentelemetry.sdk.trace import TracerProvider
+
+         processor = EvaldeckSpanProcessor()
+         provider = TracerProvider()
+         provider.add_span_processor(processor)
+         trace.set_tracer_provider(provider)
+
+         # After running instrumented code:
+         evaldeck_trace = processor.get_latest_trace()
+     """
+
+     def __init__(self) -> None:
+         if not OTEL_AVAILABLE:
+             raise ImportError(
+                 "OpenTelemetry is not installed. Install with: "
+                 "pip install opentelemetry-sdk openinference-semantic-conventions"
+             )
+
+         self._traces: dict[str, Trace] = {}
+         self._trace_order: list[str] = []  # Track order for get_latest_trace
+
+     def on_start(self, span: ReadableSpan, parent_context: Any = None) -> None:
+         """Called when a span starts. We don't need to do anything here."""
+         pass
+
+     def on_end(self, span: ReadableSpan) -> None:
+         """Called when a span ends. Convert to Evaldeck format."""
+         attrs = dict(span.attributes or {})
+         span_kind = str(attrs.get("openinference.span.kind", "")).upper()
+
+         # Skip spans without OpenInference kind
+         if not span_kind:
+             return
+
+         trace_id = format(span.context.trace_id, "032x")
+
+         # Ensure trace exists
+         if trace_id not in self._traces:
+             self._traces[trace_id] = Trace(
+                 id=trace_id,
+                 input="",
+                 framework="openinference",
+             )
+             self._trace_order.append(trace_id)
+
+         trace = self._traces[trace_id]
+
+         # CHAIN/AGENT spans with no parent become the root trace
+         if span_kind in (SPAN_KIND_CHAIN, SPAN_KIND_AGENT) and span.parent is None:
+             self._update_trace_from_root_span(trace, span, attrs)
+             return
+
+         # Convert other spans to Steps
+         step = self._span_to_step(span, span_kind, attrs)
+         if step:
+             trace.add_step(step)
+
+     def _update_trace_from_root_span(
+         self, trace: Trace, span: ReadableSpan, attrs: dict[str, Any]
+     ) -> None:
+         """Update trace metadata from the root CHAIN/AGENT span."""
+         trace.input = str(attrs.get("input.value", trace.input or ""))
+         trace.output = attrs.get("output.value")
+         trace.status = self._map_trace_status(span)
+         trace.started_at = self._ns_to_datetime(span.start_time)
+         trace.completed_at = self._ns_to_datetime(span.end_time)
+         trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
+
+         # Extract agent/framework info
+         if "llm.system" in attrs:
+             trace.framework = str(attrs["llm.system"])
+
+         trace.metadata["otel_trace_id"] = format(span.context.trace_id, "032x")
+         trace.metadata["otel_root_span_id"] = format(span.context.span_id, "016x")
+
+     def _span_to_step(
+         self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
+     ) -> Step | None:
+         """Convert an OpenTelemetry span to an Evaldeck Step."""
+
+         if kind == SPAN_KIND_LLM:
+             return self._convert_llm_span(span, attrs)
+
+         elif kind == SPAN_KIND_TOOL:
+             return self._convert_tool_span(span, attrs)
+
+         elif kind in (SPAN_KIND_EMBEDDING, SPAN_KIND_RETRIEVER, SPAN_KIND_RERANKER):
+             return self._convert_retrieval_span(span, kind, attrs)
+
+         elif kind == SPAN_KIND_GUARDRAIL:
+             return self._convert_guardrail_span(span, attrs)
+
+         elif kind in (SPAN_KIND_CHAIN, SPAN_KIND_AGENT):
+             # Nested chains/agents become reasoning steps
+             return self._convert_chain_span(span, attrs)
+
+         return None
+
+     def _convert_llm_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
+         """Convert an LLM span to a Step."""
+         return Step(
+             type=StepType.LLM_CALL,
+             model=attrs.get("llm.model_name") or attrs.get("gen_ai.request.model"),
+             input=self._extract_messages(attrs, "input"),
+             output=self._extract_messages(attrs, "output"),
+             tokens=TokenUsage(
+                 prompt_tokens=int(attrs.get("llm.token_count.prompt", 0)),
+                 completion_tokens=int(attrs.get("llm.token_count.completion", 0)),
+                 total_tokens=int(attrs.get("llm.token_count.total", 0)),
+             ),
+             status=self._map_step_status(span),
+             duration_ms=self._calc_duration_ms(span),
+             error=self._extract_error(span),
+             metadata={
+                 "otel_span_id": format(span.context.span_id, "016x"),
+                 "llm_provider": attrs.get("llm.provider") or attrs.get("llm.system"),
+             },
+         )
+
+     def _convert_tool_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
+         """Convert a TOOL span to a Step."""
+         tool_name = (
+             attrs.get("tool.name")
+             or attrs.get("tool_call.function.name")
+             or "unknown_tool"
+         )
+
+         tool_args = self._parse_json(
+             attrs.get("tool.parameters")
+             or attrs.get("tool_call.function.arguments")
+             or attrs.get("input.value")
+         )
+
+         return Step(
+             type=StepType.TOOL_CALL,
+             tool_name=str(tool_name),
+             tool_args=tool_args if isinstance(tool_args, dict) else {"input": tool_args},
+             tool_result=attrs.get("output.value"),
+             status=self._map_step_status(span),
+             duration_ms=self._calc_duration_ms(span),
+             error=self._extract_error(span),
+             metadata={
+                 "otel_span_id": format(span.context.span_id, "016x"),
+                 "tool_id": attrs.get("tool.id") or attrs.get("tool_call.id"),
+             },
+         )
+
+     def _convert_retrieval_span(
+         self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
+     ) -> Step:
+         """Convert EMBEDDING/RETRIEVER/RERANKER spans to tool call Steps."""
+         return Step(
+             type=StepType.TOOL_CALL,
+             tool_name=kind.lower(),  # "embedding", "retriever", "reranker"
+             tool_args={"input": attrs.get("input.value")},
+             tool_result=attrs.get("output.value"),
+             status=self._map_step_status(span),
+             duration_ms=self._calc_duration_ms(span),
+             error=self._extract_error(span),
+             metadata={
+                 "otel_span_id": format(span.context.span_id, "016x"),
+                 "span_kind": kind,
+             },
+         )
+
+     def _convert_guardrail_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
+         """Convert GUARDRAIL spans to reasoning Steps."""
+         return Step(
+             type=StepType.REASONING,
+             reasoning_text=f"Guardrail check: {attrs.get('output.value', 'passed')}",
+             status=self._map_step_status(span),
+             duration_ms=self._calc_duration_ms(span),
+             error=self._extract_error(span),
+             metadata={
+                 "otel_span_id": format(span.context.span_id, "016x"),
+                 "guardrail_input": attrs.get("input.value"),
+             },
+         )
+
+     def _convert_chain_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
+         """Convert nested CHAIN/AGENT spans to reasoning Steps."""
+         return Step(
+             type=StepType.REASONING,
+             reasoning_text=f"Chain: {span.name} - {attrs.get('output.value', '')}",
+             status=self._map_step_status(span),
+             duration_ms=self._calc_duration_ms(span),
+             metadata={
+                 "otel_span_id": format(span.context.span_id, "016x"),
+                 "chain_input": attrs.get("input.value"),
+             },
+         )
+
+     def _extract_messages(self, attrs: dict[str, Any], direction: str) -> str:
+         """Extract message content from OpenInference indexed attributes.
+
+         OpenInference uses indexed prefixes like:
+             llm.input_messages.0.message.content
+             llm.input_messages.1.message.content
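+
+         For example, attributes like these (illustrative values):
+             llm.input_messages.0.message.role = "system"
+             llm.input_messages.0.message.content = "Be concise."
+             llm.input_messages.1.message.role = "user"
+             llm.input_messages.1.message.content = "Hi"
+         become the newline-joined string:
+             [system]: Be concise.
+             [user]: Hi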
+         """
+         messages = []
+         i = 0
+         while True:
+             content_key = f"llm.{direction}_messages.{i}.message.content"
+             if content_key in attrs:
+                 content = attrs[content_key]
+                 role_key = f"llm.{direction}_messages.{i}.message.role"
+                 role = attrs.get(role_key, "")
+                 if role:
+                     messages.append(f"[{role}]: {content}")
+                 else:
+                     messages.append(str(content))
+                 i += 1
+             else:
+                 break
+
+         if messages:
+             return "\n".join(messages)
+
+         # Fallback to simple input/output value
+         return str(attrs.get(f"{direction}.value", ""))
+
+     def _parse_json(self, value: Any) -> Any:
+         """Parse a JSON string if possible; return {} for None, the value as-is otherwise."""
+         if value is None:
+             return {}
+         if isinstance(value, str):
+             try:
+                 return json.loads(value)
+             except (json.JSONDecodeError, TypeError):
+                 return value
+         return value
+
+     def _map_trace_status(self, span: ReadableSpan) -> TraceStatus:
+         """Map OTel span status to Evaldeck TraceStatus."""
+         if span.status.status_code == StatusCode.ERROR:
+             return TraceStatus.ERROR
+         return TraceStatus.SUCCESS
+
+     def _map_step_status(self, span: ReadableSpan) -> StepStatus:
+         """Map OTel span status to Evaldeck StepStatus."""
+         if span.status.status_code == StatusCode.ERROR:
+             return StepStatus.FAILURE
+         return StepStatus.SUCCESS
+
+     def _extract_error(self, span: ReadableSpan) -> str | None:
+         """Extract error message from span if present."""
+         if span.status.status_code == StatusCode.ERROR:
+             return span.status.description
+         return None
+
+     def _calc_duration_ms(self, span: ReadableSpan) -> float:
+         """Calculate span duration in milliseconds."""
+         return (span.end_time - span.start_time) / 1_000_000
+
+     def _ns_to_datetime(self, ns: int) -> datetime:
+         """Convert a nanosecond timestamp to a datetime."""
+         return datetime.fromtimestamp(ns / 1_000_000_000, tz=timezone.utc)
+
+     # -------------------------------------------------------------------------
+     # Public API
+     # -------------------------------------------------------------------------
+
+     def get_trace(self, trace_id: str) -> Trace | None:
+         """Get a trace by its ID.
+
+         Args:
+             trace_id: The OpenTelemetry trace ID (32 hex chars)
+
+         Returns:
+             The Evaldeck Trace, or None if not found
+         """
+         return self._traces.get(trace_id)
+
+     def get_latest_trace(self) -> Trace | None:
+         """Get the most recently completed trace.
+
+         Returns:
+             The most recent Evaldeck Trace, or None if no traces captured
+         """
+         if self._trace_order:
+             return self._traces.get(self._trace_order[-1])
+         return None
+
+     def get_all_traces(self) -> list[Trace]:
+         """Get all captured traces in order.
+
+         Returns:
+             List of all Evaldeck Traces
+         """
+         return [self._traces[tid] for tid in self._trace_order if tid in self._traces]
+
+     def reset(self) -> None:
+         """Clear all captured traces."""
+         self._traces.clear()
+         self._trace_order.clear()
+
+     def shutdown(self) -> None:
+         """Shutdown the processor (required by the SpanProcessor interface)."""
+         pass
+
+     def force_flush(self, timeout_millis: int = 30000) -> bool:
+         """Force flush (required by the SpanProcessor interface)."""
+         return True
+
+
+ def setup_tracing(processor: EvaldeckSpanProcessor | None = None) -> EvaldeckSpanProcessor:
+     """Set up OpenTelemetry tracing with the Evaldeck processor.
+
+     This is a convenience function that sets up the global tracer provider
+     with the Evaldeck processor attached.
+
+     Args:
+         processor: Optional existing processor. If None, creates a new one.
+
+     Returns:
+         The EvaldeckSpanProcessor (for later trace retrieval)
+
+     Example:
+         from evaldeck.integrations import setup_tracing
+         from openinference.instrumentation.langchain import LangChainInstrumentor
+
+         processor = setup_tracing()
+         LangChainInstrumentor().instrument()
+
+         # Run agent...
+
+         trace = processor.get_latest_trace()
+     """
+     from opentelemetry import trace
+     from opentelemetry.sdk.trace import TracerProvider
+
+     if processor is None:
+         processor = EvaldeckSpanProcessor()
+
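+     # Note: OpenTelemetry honors only the first global tracer provider set in a
+     # process, so call setup_tracing() before any other tracing configuration.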
+     provider = TracerProvider()
+     provider.add_span_processor(processor)
+     trace.set_tracer_provider(provider)
+
+     return processor
@@ -0,0 +1,25 @@
+ """Metrics for measuring agent performance."""
+
+ from evaldeck.metrics.base import BaseMetric
+ from evaldeck.metrics.builtin import (
+     DurationMetric,
+     ErrorRateMetric,
+     LLMCallCountMetric,
+     StepCountMetric,
+     StepEfficiencyMetric,
+     TokenUsageMetric,
+     ToolCallCountMetric,
+     ToolDiversityMetric,
+ )
+
+ __all__ = [
+     "BaseMetric",
+     "StepCountMetric",
+     "TokenUsageMetric",
+     "ToolCallCountMetric",
+     "DurationMetric",
+     "ToolDiversityMetric",
+     "StepEfficiencyMetric",
+     "LLMCallCountMetric",
+     "ErrorRateMetric",
+ ]
@@ -0,0 +1,62 @@
+ """Base metric class."""
+
+ from __future__ import annotations
+
+ import asyncio
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING
+
+ from evaldeck.results import MetricResult
+
+ if TYPE_CHECKING:
+     from evaldeck.test_case import EvalCase
+     from evaldeck.trace import Trace
+
+
+ class BaseMetric(ABC):
+     """Base class for all metrics.
+
+     Metrics calculate quantitative measurements from traces.
+     Unlike graders, metrics don't pass/fail - they just measure.
+
+     Supports both sync and async calculation. Override calculate_async()
+     for metrics that need to make async I/O calls (e.g., fetching external
+     benchmark data).
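+
+     Example (a minimal sketch; OutputLengthMetric is illustrative and not
+     part of the package):
+
+         class OutputLengthMetric(BaseMetric):
+             name = "output_length"
+             unit = "chars"
+
+             def calculate(self, trace, test_case=None):
+                 return MetricResult(
+                     metric_name=self.name,
+                     value=float(len(str(trace.output or ""))),
+                     unit=self.unit,
+                 )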
+     """
+
+     name: str = "base"
+     unit: str | None = None
+
+     @abstractmethod
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         """Calculate the metric value (sync).
+
+         Args:
+             trace: The execution trace to measure.
+             test_case: Optional test case for context.
+
+         Returns:
+             MetricResult with the calculated value.
+         """
+         pass
+
+     async def calculate_async(
+         self, trace: Trace, test_case: EvalCase | None = None
+     ) -> MetricResult:
+         """Calculate the metric value (async).
+
+         Default implementation runs sync calculate() in a thread pool.
+         Override this method for true async behavior (e.g., async API calls
+         for external benchmarking services).
+
+         Args:
+             trace: The execution trace to measure.
+             test_case: Optional test case for context.
+
+         Returns:
+             MetricResult with the calculated value.
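+
+         Example override (a sketch; fetch_benchmark is a hypothetical async
+         helper, not part of the package):
+
+             async def calculate_async(self, trace, test_case=None):
+                 baseline = await fetch_benchmark(self.name)  # hypothetical
+                 return MetricResult(
+                     metric_name=self.name,
+                     value=float(trace.step_count) / baseline,
+                     unit="ratio",
+                 )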
+         """
+         return await asyncio.to_thread(self.calculate, trace, test_case)
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
@@ -0,0 +1,195 @@
+ """Built-in metrics for agent evaluation."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from evaldeck.metrics.base import BaseMetric
+ from evaldeck.results import MetricResult
+
+ if TYPE_CHECKING:
+     from evaldeck.test_case import EvalCase
+     from evaldeck.trace import Trace
+
+
+ class StepCountMetric(BaseMetric):
+     """Count total number of steps in the trace."""
+
+     name = "step_count"
+     unit = "steps"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         return MetricResult(
+             metric_name=self.name,
+             value=float(trace.step_count),
+             unit=self.unit,
+         )
+
+
+ class TokenUsageMetric(BaseMetric):
+     """Total token usage across all LLM calls."""
+
+     name = "token_usage"
+     unit = "tokens"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         return MetricResult(
+             metric_name=self.name,
+             value=float(trace.total_tokens),
+             unit=self.unit,
+             details={
+                 "llm_calls": len(trace.llm_calls),
+             },
+         )
+
+
+ class ToolCallCountMetric(BaseMetric):
+     """Count number of tool calls."""
+
+     name = "tool_call_count"
+     unit = "calls"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         return MetricResult(
+             metric_name=self.name,
+             value=float(len(trace.tool_calls)),
+             unit=self.unit,
+             details={
+                 "tools": trace.tools_called,
+             },
+         )
+
+
+ class DurationMetric(BaseMetric):
+     """Total execution duration."""
+
+     name = "duration"
+     unit = "ms"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         duration = trace.duration_ms or 0.0
+         return MetricResult(
+             metric_name=self.name,
+             value=duration,
+             unit=self.unit,
+         )
+
+
+ class ToolDiversityMetric(BaseMetric):
+     """Measure diversity of tools used (unique tools / total calls)."""
+
+     name = "tool_diversity"
+     unit = "ratio"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         tool_calls = trace.tool_calls
+         if not tool_calls:
+             return MetricResult(
+                 metric_name=self.name,
+                 value=0.0,
+                 unit=self.unit,
+             )
+
+         unique_tools = len(set(trace.tools_called))
+         total_calls = len(tool_calls)
+         diversity = unique_tools / total_calls
+
+         return MetricResult(
+             metric_name=self.name,
+             value=diversity,
+             unit=self.unit,
+             details={
+                 "unique_tools": unique_tools,
+                 "total_calls": total_calls,
+             },
+         )
+
+
+ class StepEfficiencyMetric(BaseMetric):
+     """Measure step efficiency compared to expected max steps.
+
+     Returns 1.0 if within expected steps, <1.0 if exceeded.
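+
+     For example, max_steps=5 with 8 actual steps scores 5/8 = 0.625; with no
+     max_steps on the test case, the metric reports 1.0.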
+     """
+
+     name = "step_efficiency"
+     unit = "ratio"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         actual_steps = trace.step_count
+
+         # Get the expected max from the test case
+         max_steps = None
+         if test_case and test_case.expected.max_steps is not None:
+             max_steps = test_case.expected.max_steps
+
+         if max_steps is None:
+             # No baseline to compare against; report full efficiency
+             return MetricResult(
+                 metric_name=self.name,
+                 value=1.0,
+                 unit=self.unit,
+                 details={
+                     "actual_steps": actual_steps,
+                     "max_steps": None,
+                 },
+             )
+
+         # Calculate efficiency (1.0 = at or under budget, <1.0 = over budget)
+         if actual_steps <= max_steps:
+             efficiency = 1.0
+         else:
+             efficiency = max_steps / actual_steps
+
+         return MetricResult(
+             metric_name=self.name,
+             value=efficiency,
+             unit=self.unit,
+             details={
+                 "actual_steps": actual_steps,
+                 "max_steps": max_steps,
+             },
+         )
+
+
+ class LLMCallCountMetric(BaseMetric):
+     """Count number of LLM calls."""
+
+     name = "llm_call_count"
+     unit = "calls"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         return MetricResult(
+             metric_name=self.name,
+             value=float(len(trace.llm_calls)),
+             unit=self.unit,
+         )
+
+
+ class ErrorRateMetric(BaseMetric):
+     """Calculate error rate across steps."""
+
+     name = "error_rate"
+     unit = "ratio"
+
+     def calculate(self, trace: Trace, test_case: EvalCase | None = None) -> MetricResult:
+         from evaldeck.trace import StepStatus
+
+         if not trace.steps:
+             return MetricResult(
+                 metric_name=self.name,
+                 value=0.0,
+                 unit=self.unit,
+             )
+
+         error_count = sum(1 for s in trace.steps if s.status == StepStatus.FAILURE)
+         error_rate = error_count / len(trace.steps)
+
+         return MetricResult(
+             metric_name=self.name,
+             value=error_rate,
+             unit=self.unit,
+             details={
+                 "error_count": error_count,
+                 "total_steps": len(trace.steps),
+             },
+         )