evaldeck 0.1.3-py3-none-any.whl → 0.1.4-py3-none-any.whl
- evaldeck/cli.py +46 -11
- evaldeck/config.py +6 -4
- evaldeck/evaluator.py +10 -9
- evaldeck/graders/base.py +1 -1
- evaldeck/graders/code.py +2 -2
- evaldeck/graders/llm.py +12 -2
- evaldeck/integrations/langchain.py +33 -11
- evaldeck/integrations/opentelemetry.py +15 -16
- evaldeck/test_case.py +2 -1
- {evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/METADATA +2 -1
- evaldeck-0.1.4.dist-info/RECORD +22 -0
- evaldeck-0.1.3.dist-info/RECORD +0 -22
- {evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/WHEEL +0 -0
- {evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/entry_points.txt +0 -0
- {evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/licenses/LICENSE +0 -0
evaldeck/cli.py
CHANGED
@@ -2,12 +2,14 @@
 
 from __future__ import annotations
 
+import logging
 import sys
 from pathlib import Path
 
 import click
 from rich import box
 from rich.console import Console
+from rich.logging import RichHandler
 from rich.panel import Panel
 from rich.table import Table
 
@@ -15,6 +17,23 @@ from evaldeck.config import EvaldeckConfig, generate_default_config, generate_ex
 from evaldeck.results import EvaluationResult, GradeStatus, RunResult
 
 console = Console()
+logger = logging.getLogger("evaldeck")
+
+
+def setup_logging(verbose: bool) -> None:
+    """Configure logging with rich handler."""
+    # Only configure evaldeck logger, not root (to avoid noise from other libraries)
+    handler = RichHandler(console=console, show_time=False, show_path=False)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Suppress noisy loggers
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("openai").setLevel(logging.WARNING)
+    logging.getLogger("anthropic").setLevel(logging.WARNING)
+    logging.getLogger("httpcore").setLevel(logging.WARNING)
 
 
 @click.group()
@@ -93,11 +112,14 @@ def run(
     workers: int | None,
 ) -> None:
     """Run evaluations."""
+    setup_logging(verbose)
+
     try:
         # Load config
        cfg = EvaldeckConfig.load(config)
+        logger.debug(f"Loaded config: test_dir={cfg.test_dir}, agent={cfg.agent.module}")
     except FileNotFoundError:
-
+        logger.error("No evaldeck.yaml found. Run 'evaldeck init' first.")
         sys.exit(1)
 
     console.print("[bold]Evaldeck[/bold] - Running evaluations...\n")
@@ -130,8 +152,8 @@ def run(
 
     # Check if agent is configured
     if not cfg.agent.module or not cfg.agent.function:
-
-
+        logger.warning("No agent configured in evaldeck.yaml")
+        logger.info("Running in dry-run mode (no agent execution)\n")
 
         # Show what would be run
         for s in suites:
@@ -140,6 +162,10 @@ def run(
             console.print(f" - {tc.name}")
         sys.exit(0)
 
+    logger.debug(f"Agent: {cfg.agent.module}.{cfg.agent.function}")
+    if cfg.agent.framework:
+        logger.debug(f"Framework: {cfg.agent.framework}")
+
     # Run evaluations
     def on_result(result: EvaluationResult) -> None:
         """Print result as it completes."""
@@ -153,9 +179,20 @@ def run(
         duration = f"({result.duration_ms:.1f}ms)" if result.duration_ms else ""
         console.print(f" {icon} {result.test_case_name} {duration}")
 
-        if verbose
-
-
+        if verbose:
+            # Show all grades in verbose mode
+            for grade in result.grades:
+                if grade.passed:
+                    grade_icon = "[green]✓[/green]"
+                else:
+                    grade_icon = "[red]✗[/red]"
+                msg = grade.message or grade.status.value
+                console.print(f" [dim]{grade_icon} {grade.grader_name}: {msg}[/dim]")
+
+                # Show extra details for LLM graders
+                if grade.details and "raw_response" in grade.details:
+                    response_preview = grade.details["raw_response"][:150].replace("\n", " ")
+                    logger.debug(f" LLM response: {response_preview}...")
 
     # Show concurrency info
     effective_workers = workers if workers is not None else cfg.execution.workers
@@ -174,14 +211,12 @@ def run(
             max_concurrent=workers,
         )
     except ValueError as e:
-
+        logger.error(f"Error: {e}")
         sys.exit(1)
     except Exception as e:
-
+        logger.error(f"Evaluation error: {e}")
         if verbose:
-
-
-            console.print(traceback.format_exc())
+            logger.exception("Full traceback:")
         sys.exit(1)
 
     # Print summary
evaldeck/config.py
CHANGED
@@ -69,10 +69,12 @@ class EvaldeckConfig(BaseModel):
     execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
 
     # Legacy execution defaults (deprecated, use execution instead)
-    defaults: dict[str, Any] = Field(
-
-
-
+    defaults: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "timeout": 30,
+            "retries": 0,
+        }
+    )
 
     # Grader configuration
     graders: GraderDefaults = Field(default_factory=GraderDefaults)
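
The defaults field now builds its dict through default_factory, the standard way to give every Pydantic model instance its own fresh mutable default. A self-contained sketch of the pattern; the DemoConfig model is hypothetical, not part of evaldeck:

from typing import Any

from pydantic import BaseModel, Field


class DemoConfig(BaseModel):  # hypothetical model, not part of evaldeck
    defaults: dict[str, Any] = Field(
        default_factory=lambda: {"timeout": 30, "retries": 0}
    )


a, b = DemoConfig(), DemoConfig()
a.defaults["timeout"] = 99
assert b.defaults["timeout"] == 30  # each instance got its own dict

Pydantic also accepts plain mutable defaults (it copies them per instance), but default_factory makes the fresh-copy intent explicit.
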
evaldeck/evaluator.py
CHANGED
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import asyncio
-from collections.abc import Awaitable, Callable
+from collections.abc import AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from datetime import datetime
 from typing import TYPE_CHECKING, Any
@@ -32,6 +32,7 @@ from evaldeck.results import (
     EvaluationResult,
     GradeResult,
     GradeStatus,
+    MetricResult,
     RunResult,
     SuiteResult,
 )
@@ -266,7 +267,7 @@ class Evaluator:
         )
 
         # Run graders concurrently
-        async def run_grader(grader):
+        async def run_grader(grader: BaseGrader) -> GradeResult:
             try:
                 return await grader.grade_async(trace, test_case)
             except Exception as e:
@@ -278,7 +279,7 @@ class Evaluator:
             result.add_grade(grade)
 
         # Calculate metrics concurrently (supports async custom metrics)
-        async def run_metric(metric):
+        async def run_metric(metric: BaseMetric) -> MetricResult | None:
             try:
                 return await metric.calculate_async(trace, test_case)
             except Exception:
@@ -348,7 +349,7 @@ class Evaluator:
         semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
 
         @asynccontextmanager
-        async def maybe_semaphore():
+        async def maybe_semaphore() -> AsyncIterator[None]:
             """Context manager that optionally acquires semaphore."""
             if semaphore:
                 async with semaphore:
@@ -371,11 +372,11 @@ class Evaluator:
         # Add results in original order
         results_by_index: dict[int, EvaluationResult] = {}
         for item in results:
-            if isinstance(item,
+            if isinstance(item, BaseException):
                 # This shouldn't happen since _evaluate_single_async catches exceptions
                 continue
-
-            results_by_index[
+            idx, res = item
+            results_by_index[idx] = res
 
         for i in range(len(suite.test_cases)):
             if i in results_by_index:
@@ -414,7 +415,7 @@ class Evaluator:
             trace = await agent_func(test_case.input)  # type: ignore
         else:
             # Run sync function in thread pool to not block event loop
-            trace = await asyncio.to_thread(agent_func, test_case.input)
+            trace = await asyncio.to_thread(agent_func, test_case.input)
 
         # Use async evaluate to run graders concurrently
         return await self.evaluate_async(trace, test_case)
@@ -584,4 +585,4 @@ class EvaluationRunner:
         else:
             raise ValueError(f"Unknown framework: {agent_config.framework}")
 
-        return func
+        return func  # type: ignore[no-any-return]
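
The new AsyncIterator[None] annotation is the correct return type for an @asynccontextmanager-decorated async generator, and the surrounding pattern, a semaphore that exists only when concurrency is actually capped, is reusable on its own. A runnable sketch under that assumption; the demo and worker names are illustrative:

import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager


async def demo(max_concurrent: int) -> None:
    # 0 (or negative) means "unlimited": no semaphore is created at all.
    semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None

    @asynccontextmanager
    async def maybe_semaphore() -> AsyncIterator[None]:
        # Acquire the semaphore only when concurrency is capped.
        if semaphore:
            async with semaphore:
                yield
        else:
            yield

    async def worker() -> None:
        async with maybe_semaphore():
            await asyncio.sleep(0.01)

    await asyncio.gather(*(worker() for _ in range(10)))


asyncio.run(demo(max_concurrent=3))
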
evaldeck/graders/base.py
CHANGED
@@ -113,7 +113,7 @@ class CompositeGrader(BaseGrader):
         # Handle any exceptions
         grade_results: list[GradeResult] = []
         for i, result in enumerate(results):
-            if isinstance(result,
+            if isinstance(result, BaseException):
                 grade_results.append(
                     GradeResult.error_result(self.graders[i].name, f"Grader error: {result}")
                 )
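
The widened isinstance(result, BaseException) check reflects what asyncio.gather(..., return_exceptions=True) actually returns: a mixed list of values and raw exception objects, typed as BaseException because not every raisable thing is an Exception. A small runnable sketch; the flaky task is illustrative:

import asyncio


async def flaky(i: int) -> int:
    if i % 2:
        raise ValueError(f"task {i} failed")
    return i


async def main() -> None:
    results = await asyncio.gather(
        *(flaky(i) for i in range(4)), return_exceptions=True
    )
    for i, result in enumerate(results):
        # gather() hands back exception objects alongside real results,
        # so narrow with isinstance before using the value.
        if isinstance(result, BaseException):
            print(f"task {i} errored: {result}")
        else:
            print(f"task {i} returned: {result}")


asyncio.run(main())
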
evaldeck/graders/code.py
CHANGED
@@ -508,7 +508,7 @@ class CustomGrader(BaseGrader):
         self.func = func
         self.module_name = module
         self.function_name = function
-        self._loaded_func: Callable | None = None
+        self._loaded_func: Callable[..., GradeResult] | None = None
 
     def _get_func(self) -> Callable[[Trace, EvalCase], GradeResult]:
         """Get the grading function."""
@@ -550,7 +550,7 @@ class CustomGrader(BaseGrader):
         try:
             func = self._get_func()
             if asyncio.iscoroutinefunction(func):
-                return await func(trace, test_case)
+                return await func(trace, test_case)  # type: ignore[no-any-return]
             else:
                 return await asyncio.to_thread(func, trace, test_case)
         except Exception as e:
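
CustomGrader accepts either a sync or async user function: coroutine functions are awaited on the event loop, while plain functions are handed to asyncio.to_thread so they cannot stall it. A standalone sketch of that dispatch; all names here are illustrative:

import asyncio
from collections.abc import Callable
from typing import Any


async def call_flexibly(func: Callable[..., Any], *args: Any) -> Any:
    if asyncio.iscoroutinefunction(func):
        # Async callables run on the event loop directly.
        return await func(*args)
    # Sync callables run in a worker thread so the loop keeps making progress.
    return await asyncio.to_thread(func, *args)


def sync_grader(x: int) -> int:
    return x * 2


async def async_grader(x: int) -> int:
    await asyncio.sleep(0)
    return x * 3


async def main() -> None:
    print(await call_flexibly(sync_grader, 5))   # 10
    print(await call_flexibly(async_grader, 5))  # 15


asyncio.run(main())
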
evaldeck/graders/llm.py
CHANGED
@@ -165,6 +165,7 @@ REASON: Your explanation
         """Call Anthropic API (sync)."""
         try:
             from anthropic import Anthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -176,12 +177,17 @@ REASON: Your explanation
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     async def _call_anthropic_async(self, prompt: str) -> str:
         """Call Anthropic API (async)."""
         try:
             from anthropic import AsyncAnthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -193,7 +199,11 @@ REASON: Your explanation
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     def _parse_response(self, response: str) -> tuple[GradeStatus, str, float | None]:
         """Parse LLM response to extract verdict.
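
The Messages API returns a list of content blocks rather than a bare string, and non-text blocks (tool use, for example) can appear, so scanning for the first TextBlock is safer than indexing content[0]. A hedged sketch of the same extraction, assuming the anthropic package is installed; first_text is a hypothetical helper, not evaldeck API:

from collections.abc import Iterable

from anthropic.types import TextBlock


def first_text(content: Iterable[object]) -> str:
    # Return the first TextBlock's text; tool-use and other block types are
    # skipped, and an empty string signals "no text in the reply".
    for block in content:
        if isinstance(block, TextBlock):
            return block.text
    return ""

# Hypothetical usage against a Messages API response object:
# text = first_text(response.content)
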
evaldeck/integrations/langchain.py
CHANGED
@@ -5,7 +5,9 @@ Provides automatic instrumentation and trace capture for LangChain/LangGraph age
 
 from __future__ import annotations
 
-
+import threading
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from evaldeck.trace import Trace
@@ -16,12 +18,17 @@ class LangChainIntegration:
 
     Automatically sets up OpenTelemetry tracing and provides a wrapper
     that invokes the agent and returns a Trace.
+
+    Thread-safe: uses thread-local storage to track traces per thread,
+    allowing parallel test execution.
     """
 
     def __init__(self) -> None:
         self._processor: Any = None
         self._agent: Any = None
         self._initialized = False
+        self._lock = threading.Lock()
+        self._local = threading.local()
 
     def setup(self, agent_factory: Callable[[], Any]) -> None:
         """Set up instrumentation and create the agent.
@@ -46,7 +53,7 @@ class LangChainIntegration:
         # Set up OTel tracing
         self._processor = setup_otel_tracing()
 
-        # Instrument LangChain
+        # Instrument LangChain (only once)
         LangChainInstrumentor().instrument()
 
         # Create the agent
@@ -56,6 +63,9 @@ class LangChainIntegration:
     def run(self, input: str) -> Trace:
         """Run the agent and return a trace.
 
+        Note: Agent invocations are serialized (one at a time) to ensure
+        clean trace capture. Evaluations (grading) can still run in parallel.
+
         Args:
             input: The input string to send to the agent.
 
@@ -65,18 +75,30 @@ class LangChainIntegration:
         if not self._initialized:
             raise RuntimeError("Integration not initialized. Call setup() first.")
 
-        #
-
+        # Serialize agent invocations to ensure clean trace capture
+        # (OTel trace IDs can get mixed when agents run truly in parallel)
+        with self._lock:
+            # Record traces before
+            traces_before = set(self._processor._traces.keys())
+
+            # Invoke the agent
+            self._invoke_agent(input)
+
+            # Find the new trace created by this invocation
+            traces_after = set(self._processor._traces.keys())
+            new_trace_ids = traces_after - traces_before
+
+            if not new_trace_ids:
+                raise RuntimeError("No trace captured from agent execution")
 
-
-
+            # Get the trace
+            trace_id = new_trace_ids.pop()
+            trace: Trace | None = self._processor.get_trace(trace_id)
 
-
-
-        if trace is None:
-            raise RuntimeError("No trace captured from agent execution")
+            if trace is None:
+                raise RuntimeError("No trace captured from agent execution")
 
-
+            return trace
 
     def _invoke_agent(self, input: str) -> Any:
         """Invoke the agent with the appropriate format.
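
The capture trick in run is generic: snapshot the set of known trace IDs, invoke the workload under a lock, and any ID present afterwards but not before must belong to this invocation. A framework-free sketch of the idea; the TraceStore class is hypothetical:

import threading
import uuid


class TraceStore:
    """Hypothetical stand-in for the span processor's trace registry."""

    def __init__(self) -> None:
        self.traces: dict[str, str] = {}

    def record(self, payload: str) -> None:
        self.traces[uuid.uuid4().hex] = payload


store = TraceStore()
lock = threading.Lock()


def run_and_capture(payload: str) -> str:
    # Serializing invocations guarantees at most one new trace per call.
    with lock:
        before = set(store.traces)
        store.record(payload)  # stands in for invoking the agent
        new_ids = set(store.traces) - before
        if not new_ids:
            raise RuntimeError("no trace captured")
        return store.traces[new_ids.pop()]


print(run_and_capture("hello"))
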
evaldeck/integrations/opentelemetry.py
CHANGED
@@ -139,9 +139,12 @@ class EvaldeckSpanProcessor(SpanProcessor):
         trace.input = str(attrs.get("input.value", trace.input or ""))
         trace.output = attrs.get("output.value")
         trace.status = self._map_trace_status(span)
-
-
-
+        if span.start_time is not None:
+            trace.started_at = self._ns_to_datetime(span.start_time)
+        if span.end_time is not None:
+            trace.completed_at = self._ns_to_datetime(span.end_time)
+        if span.start_time is not None and span.end_time is not None:
+            trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
 
         # Extract agent/framework info
         if "llm.system" in attrs:
@@ -150,9 +153,7 @@ class EvaldeckSpanProcessor(SpanProcessor):
         trace.metadata["otel_trace_id"] = format(span.context.trace_id, "032x")
         trace.metadata["otel_root_span_id"] = format(span.context.span_id, "016x")
 
-    def _span_to_step(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step | None:
+    def _span_to_step(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step | None:
         """Convert an OpenTelemetry span to an Evaldeck Step."""
 
         if kind == SPAN_KIND_LLM:
@@ -196,11 +197,7 @@ class EvaldeckSpanProcessor(SpanProcessor):
 
     def _convert_tool_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
         """Convert a TOOL span to a Step."""
-        tool_name = (
-            attrs.get("tool.name")
-            or attrs.get("tool_call.function.name")
-            or "unknown_tool"
-        )
+        tool_name = attrs.get("tool.name") or attrs.get("tool_call.function.name") or "unknown_tool"
 
         tool_args = self._parse_json(
             attrs.get("tool.parameters")
@@ -222,9 +219,7 @@ class EvaldeckSpanProcessor(SpanProcessor):
             },
         )
 
-    def _convert_retrieval_span(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step:
+    def _convert_retrieval_span(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step:
         """Convert EMBEDDING/RETRIEVER/RERANKER spans to tool call Steps."""
         return Step(
             type=StepType.TOOL_CALL,
@@ -322,12 +317,16 @@ class EvaldeckSpanProcessor(SpanProcessor):
     def _extract_error(self, span: ReadableSpan) -> str | None:
         """Extract error message from span if present."""
         if span.status.status_code == StatusCode.ERROR:
-
+            desc: str | None = span.status.description
+            return desc
         return None
 
     def _calc_duration_ms(self, span: ReadableSpan) -> float:
         """Calculate span duration in milliseconds."""
-
+        if span.start_time is None or span.end_time is None:
+            return 0.0
+        duration: float = (span.end_time - span.start_time) / 1_000_000
+        return duration
 
     def _ns_to_datetime(self, ns: int) -> datetime:
         """Convert nanoseconds timestamp to datetime."""
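
OpenTelemetry spans expose start_time and end_time as optional integer nanoseconds since the Unix epoch, hence the None guards above. The conversions are plain arithmetic; a small sketch:

from datetime import datetime, timezone


def ns_to_datetime(ns: int) -> datetime:
    # Span timestamps are nanoseconds since the Unix epoch.
    return datetime.fromtimestamp(ns / 1_000_000_000, tz=timezone.utc)


def duration_ms(start_ns: int | None, end_ns: int | None) -> float:
    if start_ns is None or end_ns is None:
        return 0.0
    return (end_ns - start_ns) / 1_000_000  # ns -> ms


start, end = 1_700_000_000_000_000_000, 1_700_000_000_250_000_000
print(ns_to_datetime(start).isoformat())  # 2023-11-14T22:13:20+00:00
print(duration_ms(start, end))            # 250.0
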
evaldeck/test_case.py
CHANGED
@@ -109,7 +109,8 @@ class EvalCase(BaseModel):
 
     def to_yaml(self) -> str:
         """Convert test case to YAML string."""
-
+        result: str = yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)
+        return result
 
 
 class EvalSuite(BaseModel):
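
yaml.dump is untyped unless stubs are installed, which is why types-pyyaml joins the dev extras below; binding the result to an annotated local gives mypy a checked str return. A sketch of the same pattern:

from typing import Any

import yaml


def to_yaml(data: dict[str, Any]) -> str:
    # Annotating the local gives mypy a verified str return
    # (the types-pyyaml stubs make yaml.dump's type precise).
    result: str = yaml.dump(data, default_flow_style=False)
    return result


print(to_yaml({"name": "greeting", "input": "hello"}))
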
{evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evaldeck
-Version: 0.1.3
+Version: 0.1.4
 Summary: The evaluation framework for AI agents. Pytest for agents.
 Project-URL: Homepage, https://github.com/tantra-run/evaldeck-py
 Project-URL: Documentation, https://tantra-run.github.io/evaldeck-py/
@@ -41,6 +41,7 @@ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
 Requires-Dist: twine>=5.0; extra == 'dev'
+Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs-autorefs>=0.5; extra == 'docs'
 Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
evaldeck-0.1.4.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
+evaldeck/__init__.py,sha256=SF9kMDGuf3UHMHrMeT8vBPzdLUtEqTjTAlRk6Fry9b0,1877
+evaldeck/cli.py,sha256=GPMzppuiK4ITF-Nsa3M-5mlupMlmcsOlNldHkDIlZ3s,12136
+evaldeck/config.py,sha256=-3WxA2yYZB1MGu69G2QsQTvzI7PGRx-ejSHk-Sy7igo,5937
+evaldeck/evaluator.py,sha256=ei0cohsNh0pcauZ5KEHnm-xsgRSErnEGQ9Vv-Ewdyis,20242
+evaldeck/results.py,sha256=gygFnuh2cZdZv5ygxDB-Lksv_9N5sAj2HFkEXRgTnqQ,6039
+evaldeck/test_case.py,sha256=zRaNZjXjMDaBd_kkhSviZNdToPClyI23sSjAcWizoUs,5113
+evaldeck/trace.py,sha256=erVrdJyfUilutM1z6NioIp8FVbeCh5XP6VhGtbwAClU,5787
+evaldeck/graders/__init__.py,sha256=M418zN3y06Dn449oUtBZLjwSw_tiUzCx3xvRriFhSd8,882
+evaldeck/graders/base.py,sha256=bJpoCj89en5ZVH5rL1-Yfj1CEMrA00jqb8Hu7DCOeLo,4825
+evaldeck/graders/code.py,sha256=oNyARPDrF-xptftyZsqGB8NuMy0jRrL4Z4Gy7uZqMFc,18061
+evaldeck/graders/llm.py,sha256=ACcNiUt0KZFifHEDwPbj0qv4YFZhSQOUNgjCia1Y8Gc,12105
+evaldeck/integrations/__init__.py,sha256=PwvBNsNCRBsFUVC5hd2vGk7r4DntuPZIDGMYJOojJfg,1131
+evaldeck/integrations/langchain.py,sha256=sZ3aRFzbgCfAvPugEOHXu9f3dsHQHepW29JaOCWkNS4,4780
+evaldeck/integrations/opentelemetry.py,sha256=1BhmXG_gGiUSN26UmC-S-HTw239Zpf6Gf08TxWrjCj4,15403
+evaldeck/metrics/__init__.py,sha256=jXTIx5k9f1CjwS-9jc25YLeodhencoUOLfbP8qvcbbw,551
+evaldeck/metrics/base.py,sha256=ibUQNfbkQEXTX1x8SqmFWelWAF1DQ785LXP1KYIZWUk,1790
+evaldeck/metrics/builtin.py,sha256=ghdqeZRN51PhLeG8bGnPW2NNoPUAaeD05HtYlWw5yQM,5399
+evaldeck-0.1.4.dist-info/METADATA,sha256=6e0AgA3g426honozJbTY0IUYPMIZmKzR_hSQ7CbGyx0,8829
+evaldeck-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evaldeck-0.1.4.dist-info/entry_points.txt,sha256=wtyPiDMSTKf41ShIbQC5X8USDn68OybGecpTaMNaGts,47
+evaldeck-0.1.4.dist-info/licenses/LICENSE,sha256=sEp2tzjeTY9bP_jb1TWAGV4yvxNhVngHpJNglJkT9YA,10770
+evaldeck-0.1.4.dist-info/RECORD,,
evaldeck-0.1.3.dist-info/RECORD
DELETED
@@ -1,22 +0,0 @@
-evaldeck/__init__.py,sha256=SF9kMDGuf3UHMHrMeT8vBPzdLUtEqTjTAlRk6Fry9b0,1877
-evaldeck/cli.py,sha256=Khrl2CRkrYP18b1mG7sot82t-Glm4YAuNJxNkbRjuGU,10655
-evaldeck/config.py,sha256=0Ge9ZWxV_xZ68vzFkDy_IZTyPB_TtDDJtd_gN6tRyoY,5911
-evaldeck/evaluator.py,sha256=bOIL7vaafYieUD7oZWAs6cZCK9ILCp0Wh3OfLKNhXFQ,20115
-evaldeck/results.py,sha256=gygFnuh2cZdZv5ygxDB-Lksv_9N5sAj2HFkEXRgTnqQ,6039
-evaldeck/test_case.py,sha256=cy3Qfcuh4h1BlLPAncthzx3ILabtnnqN76MNhoA_9j8,5084
-evaldeck/trace.py,sha256=erVrdJyfUilutM1z6NioIp8FVbeCh5XP6VhGtbwAClU,5787
-evaldeck/graders/__init__.py,sha256=M418zN3y06Dn449oUtBZLjwSw_tiUzCx3xvRriFhSd8,882
-evaldeck/graders/base.py,sha256=CvLq_AQQfQzdrb4Hs1q6gcKB05e0qfWn31fxXir8T-k,4821
-evaldeck/graders/code.py,sha256=t2rfaB-U9LZnwtyCQ8NIW3Qxrb9aGVlgzgTU8oOHJuM,18012
-evaldeck/graders/llm.py,sha256=nWMPacy-wTLKcE-PnIBdWyD1OHpXKNaTOyF1eicbdK0,11725
-evaldeck/integrations/__init__.py,sha256=PwvBNsNCRBsFUVC5hd2vGk7r4DntuPZIDGMYJOojJfg,1131
-evaldeck/integrations/langchain.py,sha256=TYbtHgFOjUpGgLihW4Tnqyyq-AiC_9dy8gc7a0_7kIM,3839
-evaldeck/integrations/opentelemetry.py,sha256=j518FXsD0pqMNF4TvO97elX9oDiK_VaKxXd243q8dLE,15164
-evaldeck/metrics/__init__.py,sha256=jXTIx5k9f1CjwS-9jc25YLeodhencoUOLfbP8qvcbbw,551
-evaldeck/metrics/base.py,sha256=ibUQNfbkQEXTX1x8SqmFWelWAF1DQ785LXP1KYIZWUk,1790
-evaldeck/metrics/builtin.py,sha256=ghdqeZRN51PhLeG8bGnPW2NNoPUAaeD05HtYlWw5yQM,5399
-evaldeck-0.1.3.dist-info/METADATA,sha256=9vJ9PR4y-KaUfGjKSNn24T9Hx5hRdiMH9HrBN_nM6oE,8780
-evaldeck-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-evaldeck-0.1.3.dist-info/entry_points.txt,sha256=wtyPiDMSTKf41ShIbQC5X8USDn68OybGecpTaMNaGts,47
-evaldeck-0.1.3.dist-info/licenses/LICENSE,sha256=sEp2tzjeTY9bP_jb1TWAGV4yvxNhVngHpJNglJkT9YA,10770
-evaldeck-0.1.3.dist-info/RECORD,,
{evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/WHEEL
File without changes

{evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/entry_points.txt
File without changes

{evaldeck-0.1.3.dist-info → evaldeck-0.1.4.dist-info}/licenses/LICENSE
File without changes