evaldeck 0.1.3__tar.gz → 0.1.4__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (26)
  1. {evaldeck-0.1.3 → evaldeck-0.1.4}/.gitignore +3 -1
  2. {evaldeck-0.1.3 → evaldeck-0.1.4}/.pre-commit-config.yaml +4 -1
  3. {evaldeck-0.1.3 → evaldeck-0.1.4}/PKG-INFO +2 -1
  4. {evaldeck-0.1.3 → evaldeck-0.1.4}/pyproject.toml +2 -1
  5. evaldeck-0.1.4/scripts/publish.sh +25 -0
  6. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/cli.py +46 -11
  7. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/config.py +6 -4
  8. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/evaluator.py +10 -9
  9. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/base.py +1 -1
  10. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/code.py +2 -2
  11. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/llm.py +12 -2
  12. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/langchain.py +33 -11
  13. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/opentelemetry.py +15 -16
  14. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/test_case.py +2 -1
  15. {evaldeck-0.1.3 → evaldeck-0.1.4}/CONTRIBUTING.md +0 -0
  16. {evaldeck-0.1.3 → evaldeck-0.1.4}/LICENSE +0 -0
  17. {evaldeck-0.1.3 → evaldeck-0.1.4}/README.md +0 -0
  18. {evaldeck-0.1.3 → evaldeck-0.1.4}/mkdocs.yml +0 -0
  19. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/__init__.py +0 -0
  20. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/__init__.py +0 -0
  21. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/__init__.py +0 -0
  22. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/__init__.py +0 -0
  23. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/base.py +0 -0
  24. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/builtin.py +0 -0
  25. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/results.py +0 -0
  26. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/trace.py +0 -0

{evaldeck-0.1.3 → evaldeck-0.1.4}/.gitignore

@@ -90,4 +90,6 @@ evaldeck_results/
 Thumbs.db
 
 # Internal
-internal/
+internal/
+.claude/
+site/

{evaldeck-0.1.3 → evaldeck-0.1.4}/.pre-commit-config.yaml

@@ -23,5 +23,8 @@ repos:
           - pydantic>=2.0
           - click>=8.0
           - types-PyYAML
-        args: [--ignore-missing-imports]
+          - openai>=1.0
+          - anthropic>=0.18
+          - opentelemetry-sdk>=1.20
+          - openinference-instrumentation-langchain>=0.1
         files: ^src/

{evaldeck-0.1.3 → evaldeck-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evaldeck
-Version: 0.1.3
+Version: 0.1.4
 Summary: The evaluation framework for AI agents. Pytest for agents.
 Project-URL: Homepage, https://github.com/tantra-run/evaldeck-py
 Project-URL: Documentation, https://tantra-run.github.io/evaldeck-py/
@@ -41,6 +41,7 @@ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
 Requires-Dist: twine>=5.0; extra == 'dev'
+Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs-autorefs>=0.5; extra == 'docs'
 Requires-Dist: mkdocs-material>=9.5; extra == 'docs'

{evaldeck-0.1.3 → evaldeck-0.1.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "evaldeck"
-version = "0.1.3"
+version = "0.1.4"
 description = "The evaluation framework for AI agents. Pytest for agents."
 readme = "README.md"
 license = "Apache-2.0"
@@ -51,6 +51,7 @@ dev = [
     "pytest-cov>=4.0",
     "ruff>=0.1",
     "mypy>=1.0",
+    "types-PyYAML>=6.0",
     "pre-commit>=3.0",
     "build>=1.0",
     "twine>=5.0",

evaldeck-0.1.4/scripts/publish.sh (new file)

@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+
+# Check for PYPI_API_TOKEN
+if [ -z "$PYPI_API_TOKEN" ]; then
+    echo "Error: PYPI_API_TOKEN environment variable not set"
+    exit 1
+fi
+
+# Get script directory and project root
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+cd "$PROJECT_ROOT"
+
+echo "Cleaning old build artifacts..."
+rm -rf dist/ build/ *.egg-info src/*.egg-info
+
+echo "Building package..."
+python -m build
+
+echo "Uploading to PyPI..."
+python -m twine upload dist/* -u __token__ -p "$PYPI_API_TOKEN"
+
+echo "Done!"

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/cli.py

@@ -2,12 +2,14 @@
 
 from __future__ import annotations
 
+import logging
 import sys
 from pathlib import Path
 
 import click
 from rich import box
 from rich.console import Console
+from rich.logging import RichHandler
 from rich.panel import Panel
 from rich.table import Table
 
@@ -15,6 +17,23 @@ from evaldeck.config import EvaldeckConfig, generate_default_config, generate_ex
 from evaldeck.results import EvaluationResult, GradeStatus, RunResult
 
 console = Console()
+logger = logging.getLogger("evaldeck")
+
+
+def setup_logging(verbose: bool) -> None:
+    """Configure logging with rich handler."""
+    # Only configure evaldeck logger, not root (to avoid noise from other libraries)
+    handler = RichHandler(console=console, show_time=False, show_path=False)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Suppress noisy loggers
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("openai").setLevel(logging.WARNING)
+    logging.getLogger("anthropic").setLevel(logging.WARNING)
+    logging.getLogger("httpcore").setLevel(logging.WARNING)
 
 
 @click.group()
@@ -93,11 +112,14 @@ def run(
     workers: int | None,
 ) -> None:
     """Run evaluations."""
+    setup_logging(verbose)
+
     try:
         # Load config
         cfg = EvaldeckConfig.load(config)
+        logger.debug(f"Loaded config: test_dir={cfg.test_dir}, agent={cfg.agent.module}")
     except FileNotFoundError:
-        console.print("[red]No evaldeck.yaml found. Run 'evaldeck init' first.[/red]")
+        logger.error("No evaldeck.yaml found. Run 'evaldeck init' first.")
         sys.exit(1)
 
     console.print("[bold]Evaldeck[/bold] - Running evaluations...\n")
@@ -130,8 +152,8 @@
 
     # Check if agent is configured
     if not cfg.agent.module or not cfg.agent.function:
-        console.print("[yellow]No agent configured in evaldeck.yaml[/yellow]")
-        console.print("Running in dry-run mode (no agent execution)\n")
+        logger.warning("No agent configured in evaldeck.yaml")
+        logger.info("Running in dry-run mode (no agent execution)\n")
 
         # Show what would be run
         for s in suites:
@@ -140,6 +162,10 @@
                 console.print(f" - {tc.name}")
         sys.exit(0)
 
+    logger.debug(f"Agent: {cfg.agent.module}.{cfg.agent.function}")
+    if cfg.agent.framework:
+        logger.debug(f"Framework: {cfg.agent.framework}")
+
     # Run evaluations
     def on_result(result: EvaluationResult) -> None:
        """Print result as it completes."""
@@ -153,9 +179,20 @@
        duration = f"({result.duration_ms:.1f}ms)" if result.duration_ms else ""
        console.print(f" {icon} {result.test_case_name} {duration}")
 
-       if verbose and not result.passed:
-           for grade in result.failed_grades:
-               console.print(f" [dim]└─ {grade.grader_name}: {grade.message}[/dim]")
+       if verbose:
+           # Show all grades in verbose mode
+           for grade in result.grades:
+               if grade.passed:
+                   grade_icon = "[green]✓[/green]"
+               else:
+                   grade_icon = "[red]✗[/red]"
+               msg = grade.message or grade.status.value
+               console.print(f" [dim]{grade_icon} {grade.grader_name}: {msg}[/dim]")
+
+               # Show extra details for LLM graders
+               if grade.details and "raw_response" in grade.details:
+                   response_preview = grade.details["raw_response"][:150].replace("\n", " ")
+                   logger.debug(f" LLM response: {response_preview}...")
 
     # Show concurrency info
     effective_workers = workers if workers is not None else cfg.execution.workers
@@ -174,14 +211,12 @@
             max_concurrent=workers,
         )
     except ValueError as e:
-        console.print(f"[red]Error: {e}[/red]")
+        logger.error(f"Error: {e}")
         sys.exit(1)
     except Exception as e:
-        console.print(f"[red]Evaluation error: {e}[/red]")
+        logger.error(f"Evaluation error: {e}")
         if verbose:
-            import traceback
-
-            console.print(traceback.format_exc())
+            logger.exception("Full traceback:")
        sys.exit(1)
 
     # Print summary
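
The cli.py changes above route errors and verbose output through a package-scoped logger instead of bare console.print calls. For reference, a minimal standalone sketch of that pattern: a named logger with a RichHandler attached and noisy HTTP-client loggers raised to WARNING. The logger name "demo" and the __main__ usage below are illustrative assumptions, not evaldeck code.

import logging

from rich.console import Console
from rich.logging import RichHandler

console = Console()
logger = logging.getLogger("demo")  # hypothetical package logger, not evaldeck's


def setup_logging(verbose: bool = False) -> None:
    """Attach a RichHandler to this package's logger only; leave the root logger alone."""
    handler = RichHandler(console=console, show_time=False, show_path=False)
    handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)

    # Third-party HTTP clients are chatty at INFO/DEBUG; raise their threshold
    # so verbose runs stay focused on the package's own messages.
    for noisy in ("httpx", "httpcore", "openai", "anthropic"):
        logging.getLogger(noisy).setLevel(logging.WARNING)


if __name__ == "__main__":
    setup_logging(verbose=True)
    logger.debug("debug output rendered through RichHandler")
    logger.info("info output rendered through RichHandler")

Configuring only the named logger, rather than calling logging.basicConfig, is what keeps other libraries' log records out of the CLI output.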

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/config.py

@@ -69,10 +69,12 @@ class EvaldeckConfig(BaseModel):
     execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
 
     # Legacy execution defaults (deprecated, use execution instead)
-    defaults: dict[str, Any] = Field(default_factory=lambda: {
-        "timeout": 30,
-        "retries": 0,
-    })
+    defaults: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "timeout": 30,
+            "retries": 0,
+        }
+    )
 
     # Grader configuration
     graders: GraderDefaults = Field(default_factory=GraderDefaults)

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/evaluator.py

@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import asyncio
-from collections.abc import Awaitable, Callable
+from collections.abc import AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from datetime import datetime
 from typing import TYPE_CHECKING, Any
@@ -32,6 +32,7 @@ from evaldeck.results import (
     EvaluationResult,
     GradeResult,
     GradeStatus,
+    MetricResult,
     RunResult,
     SuiteResult,
 )
@@ -266,7 +267,7 @@ class Evaluator:
         )
 
         # Run graders concurrently
-        async def run_grader(grader):
+        async def run_grader(grader: BaseGrader) -> GradeResult:
             try:
                 return await grader.grade_async(trace, test_case)
             except Exception as e:
@@ -278,7 +279,7 @@
             result.add_grade(grade)
 
         # Calculate metrics concurrently (supports async custom metrics)
-        async def run_metric(metric):
+        async def run_metric(metric: BaseMetric) -> MetricResult | None:
             try:
                 return await metric.calculate_async(trace, test_case)
             except Exception:
@@ -348,7 +349,7 @@
         semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
 
         @asynccontextmanager
-        async def maybe_semaphore():
+        async def maybe_semaphore() -> AsyncIterator[None]:
             """Context manager that optionally acquires semaphore."""
             if semaphore:
                 async with semaphore:
@@ -371,11 +372,11 @@
         # Add results in original order
         results_by_index: dict[int, EvaluationResult] = {}
         for item in results:
-            if isinstance(item, Exception):
+            if isinstance(item, BaseException):
                 # This shouldn't happen since _evaluate_single_async catches exceptions
                 continue
-            index, result = item
-            results_by_index[index] = result
+            idx, res = item
+            results_by_index[idx] = res
 
         for i in range(len(suite.test_cases)):
             if i in results_by_index:
@@ -414,7 +415,7 @@
             trace = await agent_func(test_case.input)  # type: ignore
         else:
             # Run sync function in thread pool to not block event loop
-            trace = await asyncio.to_thread(agent_func, test_case.input)  # type: ignore
+            trace = await asyncio.to_thread(agent_func, test_case.input)
 
         # Use async evaluate to run graders concurrently
         return await self.evaluate_async(trace, test_case)
@@ -584,4 +585,4 @@ class EvaluationRunner:
         else:
             raise ValueError(f"Unknown framework: {agent_config.framework}")
 
-        return func
+        return func  # type: ignore[no-any-return]
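
The evaluator.py edits are mostly typing around two asyncio patterns: an optional-semaphore context manager, whose @asynccontextmanager generator is correctly annotated as returning AsyncIterator[None], and result handling for asyncio.gather(..., return_exceptions=True), where items may be BaseException subclasses (e.g. CancelledError), not just Exception. A self-contained sketch of both, with made-up task names:

import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

semaphore: asyncio.Semaphore | None = asyncio.Semaphore(2)  # None would disable the limit


@asynccontextmanager
async def maybe_semaphore() -> AsyncIterator[None]:
    """Acquire the semaphore if one is configured, otherwise do nothing."""
    if semaphore is not None:
        async with semaphore:
            yield
    else:
        yield


async def work(i: int) -> tuple[int, str]:
    async with maybe_semaphore():
        await asyncio.sleep(0.01)
        return i, f"result-{i}"


async def main() -> None:
    results = await asyncio.gather(*(work(i) for i in range(5)), return_exceptions=True)
    by_index: dict[int, str] = {}
    for item in results:
        # gather(return_exceptions=True) can hand back BaseException subclasses
        # such as CancelledError, so the guard is wider than Exception.
        if isinstance(item, BaseException):
            continue
        idx, res = item
        by_index[idx] = res
    print(by_index)


asyncio.run(main())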

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/base.py

@@ -113,7 +113,7 @@ class CompositeGrader(BaseGrader):
         # Handle any exceptions
         grade_results: list[GradeResult] = []
         for i, result in enumerate(results):
-            if isinstance(result, Exception):
+            if isinstance(result, BaseException):
                 grade_results.append(
                     GradeResult.error_result(self.graders[i].name, f"Grader error: {result}")
                 )

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/code.py

@@ -508,7 +508,7 @@ class CustomGrader(BaseGrader):
         self.func = func
         self.module_name = module
         self.function_name = function
-        self._loaded_func: Callable | None = None
+        self._loaded_func: Callable[..., GradeResult] | None = None
 
     def _get_func(self) -> Callable[[Trace, EvalCase], GradeResult]:
         """Get the grading function."""
@@ -550,7 +550,7 @@
         try:
             func = self._get_func()
             if asyncio.iscoroutinefunction(func):
-                return await func(trace, test_case)
+                return await func(trace, test_case)  # type: ignore[no-any-return]
             else:
                 return await asyncio.to_thread(func, trace, test_case)
         except Exception as e:
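
The CustomGrader typing above wraps a user-supplied grading callable that may be sync or async; grade_async dispatches on asyncio.iscoroutinefunction and pushes sync callables into a worker thread. The dispatch pattern on its own, with placeholder grading functions (names here are hypothetical):

import asyncio
from collections.abc import Awaitable, Callable


def sync_grader(output: str) -> bool:
    return "ok" in output


async def async_grader(output: str) -> bool:
    await asyncio.sleep(0)
    return "ok" in output


async def grade(func: Callable[[str], bool | Awaitable[bool]], output: str) -> bool:
    """Run a sync or async grading callable without blocking the event loop."""
    if asyncio.iscoroutinefunction(func):
        return await func(output)
    # Sync callables go to a thread so a slow grader cannot stall other tasks.
    return await asyncio.to_thread(func, output)


async def main() -> None:
    print(await grade(sync_grader, "ok result"))   # True
    print(await grade(async_grader, "not great"))  # False


asyncio.run(main())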

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/llm.py

@@ -165,6 +165,7 @@ REASON: Your explanation
         """Call Anthropic API (sync)."""
         try:
             from anthropic import Anthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -176,12 +177,17 @@
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-        return response.content[0].text
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     async def _call_anthropic_async(self, prompt: str) -> str:
         """Call Anthropic API (async)."""
         try:
             from anthropic import AsyncAnthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -193,7 +199,11 @@
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-        return response.content[0].text
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     def _parse_response(self, response: str) -> tuple[GradeStatus, str, float | None]:
         """Parse LLM response to extract verdict.

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/langchain.py

@@ -5,7 +5,9 @@ Provides automatic instrumentation and trace capture for LangChain/LangGraph age
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Callable
+import threading
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from evaldeck.trace import Trace
@@ -16,12 +18,17 @@ class LangChainIntegration:
 
     Automatically sets up OpenTelemetry tracing and provides a wrapper
     that invokes the agent and returns a Trace.
+
+    Thread-safe: uses thread-local storage to track traces per thread,
+    allowing parallel test execution.
     """
 
     def __init__(self) -> None:
         self._processor: Any = None
         self._agent: Any = None
         self._initialized = False
+        self._lock = threading.Lock()
+        self._local = threading.local()
 
     def setup(self, agent_factory: Callable[[], Any]) -> None:
         """Set up instrumentation and create the agent.
@@ -46,7 +53,7 @@
         # Set up OTel tracing
         self._processor = setup_otel_tracing()
 
-        # Instrument LangChain
+        # Instrument LangChain (only once)
         LangChainInstrumentor().instrument()
 
         # Create the agent
@@ -56,6 +63,9 @@
     def run(self, input: str) -> Trace:
         """Run the agent and return a trace.
 
+        Note: Agent invocations are serialized (one at a time) to ensure
+        clean trace capture. Evaluations (grading) can still run in parallel.
+
         Args:
             input: The input string to send to the agent.
 
@@ -65,18 +75,30 @@
         if not self._initialized:
             raise RuntimeError("Integration not initialized. Call setup() first.")
 
-        # Reset processor for fresh trace
-        self._processor.reset()
+        # Serialize agent invocations to ensure clean trace capture
+        # (OTel trace IDs can get mixed when agents run truly in parallel)
+        with self._lock:
+            # Record traces before
+            traces_before = set(self._processor._traces.keys())
+
+            # Invoke the agent
+            self._invoke_agent(input)
+
+            # Find the new trace created by this invocation
+            traces_after = set(self._processor._traces.keys())
+            new_trace_ids = traces_after - traces_before
+
+            if not new_trace_ids:
+                raise RuntimeError("No trace captured from agent execution")
 
-        # Invoke the agent - auto-detect format
-        self._invoke_agent(input)
+            # Get the trace
+            trace_id = new_trace_ids.pop()
+            trace: Trace | None = self._processor.get_trace(trace_id)
 
-        # Get and return trace
-        trace = self._processor.get_latest_trace()
-        if trace is None:
-            raise RuntimeError("No trace captured from agent execution")
+            if trace is None:
+                raise RuntimeError("No trace captured from agent execution")
 
-        return trace
+            return trace
 
     def _invoke_agent(self, input: str) -> Any:
         """Invoke the agent with the appropriate format.

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/opentelemetry.py

@@ -139,9 +139,12 @@ class EvaldeckSpanProcessor(SpanProcessor):
         trace.input = str(attrs.get("input.value", trace.input or ""))
         trace.output = attrs.get("output.value")
         trace.status = self._map_trace_status(span)
-        trace.started_at = self._ns_to_datetime(span.start_time)
-        trace.completed_at = self._ns_to_datetime(span.end_time)
-        trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
+        if span.start_time is not None:
+            trace.started_at = self._ns_to_datetime(span.start_time)
+        if span.end_time is not None:
+            trace.completed_at = self._ns_to_datetime(span.end_time)
+        if span.start_time is not None and span.end_time is not None:
+            trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
 
         # Extract agent/framework info
         if "llm.system" in attrs:
@@ -150,9 +153,7 @@
         trace.metadata["otel_trace_id"] = format(span.context.trace_id, "032x")
         trace.metadata["otel_root_span_id"] = format(span.context.span_id, "016x")
 
-    def _span_to_step(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step | None:
+    def _span_to_step(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step | None:
         """Convert an OpenTelemetry span to an Evaldeck Step."""
 
         if kind == SPAN_KIND_LLM:
@@ -196,11 +197,7 @@
 
     def _convert_tool_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
         """Convert a TOOL span to a Step."""
-        tool_name = (
-            attrs.get("tool.name")
-            or attrs.get("tool_call.function.name")
-            or "unknown_tool"
-        )
+        tool_name = attrs.get("tool.name") or attrs.get("tool_call.function.name") or "unknown_tool"
 
         tool_args = self._parse_json(
             attrs.get("tool.parameters")
@@ -222,9 +219,7 @@
             },
         )
 
-    def _convert_retrieval_span(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step:
+    def _convert_retrieval_span(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step:
         """Convert EMBEDDING/RETRIEVER/RERANKER spans to tool call Steps."""
         return Step(
             type=StepType.TOOL_CALL,
@@ -322,12 +317,16 @@
     def _extract_error(self, span: ReadableSpan) -> str | None:
         """Extract error message from span if present."""
         if span.status.status_code == StatusCode.ERROR:
-            return span.status.description
+            desc: str | None = span.status.description
+            return desc
         return None
 
     def _calc_duration_ms(self, span: ReadableSpan) -> float:
         """Calculate span duration in milliseconds."""
-        return (span.end_time - span.start_time) / 1_000_000
+        if span.start_time is None or span.end_time is None:
+            return 0.0
+        duration: float = (span.end_time - span.start_time) / 1_000_000
+        return duration
 
     def _ns_to_datetime(self, ns: int) -> datetime:
         """Convert nanoseconds timestamp to datetime."""

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/test_case.py

@@ -109,7 +109,8 @@ class EvalCase(BaseModel):
 
     def to_yaml(self) -> str:
         """Convert test case to YAML string."""
-        return yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)
+        result: str = yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)
+        return result
 
 
 class EvalSuite(BaseModel):