evaldeck 0.1.3__tar.gz → 0.1.4__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (26)
  1. {evaldeck-0.1.3 → evaldeck-0.1.4}/.gitignore +3 -1
  2. {evaldeck-0.1.3 → evaldeck-0.1.4}/.pre-commit-config.yaml +4 -1
  3. {evaldeck-0.1.3 → evaldeck-0.1.4}/PKG-INFO +2 -1
  4. {evaldeck-0.1.3 → evaldeck-0.1.4}/pyproject.toml +2 -1
  5. evaldeck-0.1.4/scripts/publish.sh +25 -0
  6. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/cli.py +46 -11
  7. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/config.py +6 -4
  8. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/evaluator.py +10 -9
  9. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/base.py +1 -1
  10. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/code.py +2 -2
  11. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/llm.py +12 -2
  12. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/langchain.py +33 -11
  13. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/opentelemetry.py +15 -16
  14. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/test_case.py +2 -1
  15. {evaldeck-0.1.3 → evaldeck-0.1.4}/CONTRIBUTING.md +0 -0
  16. {evaldeck-0.1.3 → evaldeck-0.1.4}/LICENSE +0 -0
  17. {evaldeck-0.1.3 → evaldeck-0.1.4}/README.md +0 -0
  18. {evaldeck-0.1.3 → evaldeck-0.1.4}/mkdocs.yml +0 -0
  19. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/__init__.py +0 -0
  20. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/__init__.py +0 -0
  21. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/__init__.py +0 -0
  22. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/__init__.py +0 -0
  23. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/base.py +0 -0
  24. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/metrics/builtin.py +0 -0
  25. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/results.py +0 -0
  26. {evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/trace.py +0 -0

{evaldeck-0.1.3 → evaldeck-0.1.4}/.gitignore

@@ -90,4 +90,6 @@ evaldeck_results/
 Thumbs.db
 
 # Internal
-internal/
+internal/
+.claude/
+site/

{evaldeck-0.1.3 → evaldeck-0.1.4}/.pre-commit-config.yaml

@@ -23,5 +23,8 @@ repos:
           - pydantic>=2.0
           - click>=8.0
           - types-PyYAML
-        args: [--ignore-missing-imports]
+          - openai>=1.0
+          - anthropic>=0.18
+          - opentelemetry-sdk>=1.20
+          - openinference-instrumentation-langchain>=0.1
         files: ^src/

{evaldeck-0.1.3 → evaldeck-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evaldeck
-Version: 0.1.3
+Version: 0.1.4
 Summary: The evaluation framework for AI agents. Pytest for agents.
 Project-URL: Homepage, https://github.com/tantra-run/evaldeck-py
 Project-URL: Documentation, https://tantra-run.github.io/evaldeck-py/
@@ -41,6 +41,7 @@ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
 Requires-Dist: twine>=5.0; extra == 'dev'
+Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs-autorefs>=0.5; extra == 'docs'
 Requires-Dist: mkdocs-material>=9.5; extra == 'docs'

{evaldeck-0.1.3 → evaldeck-0.1.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "evaldeck"
-version = "0.1.3"
+version = "0.1.4"
 description = "The evaluation framework for AI agents. Pytest for agents."
 readme = "README.md"
 license = "Apache-2.0"
@@ -51,6 +51,7 @@ dev = [
     "pytest-cov>=4.0",
     "ruff>=0.1",
     "mypy>=1.0",
+    "types-PyYAML>=6.0",
     "pre-commit>=3.0",
     "build>=1.0",
     "twine>=5.0",

evaldeck-0.1.4/scripts/publish.sh (new file)

@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+
+# Check for PYPI_API_TOKEN
+if [ -z "$PYPI_API_TOKEN" ]; then
+    echo "Error: PYPI_API_TOKEN environment variable not set"
+    exit 1
+fi
+
+# Get script directory and project root
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+cd "$PROJECT_ROOT"
+
+echo "Cleaning old build artifacts..."
+rm -rf dist/ build/ *.egg-info src/*.egg-info
+
+echo "Building package..."
+python -m build
+
+echo "Uploading to PyPI..."
+python -m twine upload dist/* -u __token__ -p "$PYPI_API_TOKEN"
+
+echo "Done!"

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/cli.py

@@ -2,12 +2,14 @@
 
 from __future__ import annotations
 
+import logging
 import sys
 from pathlib import Path
 
 import click
 from rich import box
 from rich.console import Console
+from rich.logging import RichHandler
 from rich.panel import Panel
 from rich.table import Table
 
@@ -15,6 +17,23 @@ from evaldeck.config import EvaldeckConfig, generate_default_config, generate_ex
 from evaldeck.results import EvaluationResult, GradeStatus, RunResult
 
 console = Console()
+logger = logging.getLogger("evaldeck")
+
+
+def setup_logging(verbose: bool) -> None:
+    """Configure logging with rich handler."""
+    # Only configure evaldeck logger, not root (to avoid noise from other libraries)
+    handler = RichHandler(console=console, show_time=False, show_path=False)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Suppress noisy loggers
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("openai").setLevel(logging.WARNING)
+    logging.getLogger("anthropic").setLevel(logging.WARNING)
+    logging.getLogger("httpcore").setLevel(logging.WARNING)
 
 
 @click.group()
@@ -93,11 +112,14 @@ def run(
     workers: int | None,
 ) -> None:
     """Run evaluations."""
+    setup_logging(verbose)
+
     try:
         # Load config
         cfg = EvaldeckConfig.load(config)
+        logger.debug(f"Loaded config: test_dir={cfg.test_dir}, agent={cfg.agent.module}")
     except FileNotFoundError:
-        console.print("[red]No evaldeck.yaml found. Run 'evaldeck init' first.[/red]")
+        logger.error("No evaldeck.yaml found. Run 'evaldeck init' first.")
         sys.exit(1)
 
     console.print("[bold]Evaldeck[/bold] - Running evaluations...\n")
@@ -130,8 +152,8 @@
 
     # Check if agent is configured
     if not cfg.agent.module or not cfg.agent.function:
-        console.print("[yellow]No agent configured in evaldeck.yaml[/yellow]")
-        console.print("Running in dry-run mode (no agent execution)\n")
+        logger.warning("No agent configured in evaldeck.yaml")
+        logger.info("Running in dry-run mode (no agent execution)\n")
 
         # Show what would be run
         for s in suites:
@@ -140,6 +162,10 @@
                 console.print(f" - {tc.name}")
         sys.exit(0)
 
+    logger.debug(f"Agent: {cfg.agent.module}.{cfg.agent.function}")
+    if cfg.agent.framework:
+        logger.debug(f"Framework: {cfg.agent.framework}")
+
     # Run evaluations
     def on_result(result: EvaluationResult) -> None:
        """Print result as it completes."""
@@ -153,9 +179,20 @@
        duration = f"({result.duration_ms:.1f}ms)" if result.duration_ms else ""
        console.print(f" {icon} {result.test_case_name} {duration}")
 
-       if verbose and not result.passed:
-           for grade in result.failed_grades:
-               console.print(f" [dim]└─ {grade.grader_name}: {grade.message}[/dim]")
+       if verbose:
+           # Show all grades in verbose mode
+           for grade in result.grades:
+               if grade.passed:
+                   grade_icon = "[green]✓[/green]"
+               else:
+                   grade_icon = "[red]✗[/red]"
+               msg = grade.message or grade.status.value
+               console.print(f" [dim]{grade_icon} {grade.grader_name}: {msg}[/dim]")
+
+               # Show extra details for LLM graders
+               if grade.details and "raw_response" in grade.details:
+                   response_preview = grade.details["raw_response"][:150].replace("\n", " ")
+                   logger.debug(f" LLM response: {response_preview}...")
 
     # Show concurrency info
     effective_workers = workers if workers is not None else cfg.execution.workers
@@ -174,14 +211,12 @@
             max_concurrent=workers,
         )
     except ValueError as e:
-        console.print(f"[red]Error: {e}[/red]")
+        logger.error(f"Error: {e}")
         sys.exit(1)
     except Exception as e:
-        console.print(f"[red]Evaluation error: {e}[/red]")
+        logger.error(f"Evaluation error: {e}")
         if verbose:
-            import traceback
-
-            console.print(traceback.format_exc())
+            logger.exception("Full traceback:")
        sys.exit(1)
 
     # Print summary
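
The cli.py changes above route errors and verbose output through a package-scoped logger instead of bare console.print calls. For reference, a minimal standalone sketch of that pattern: a named logger with a RichHandler attached and noisy HTTP-client loggers raised to WARNING. The logger name "demo" and the __main__ usage below are illustrative assumptions, not evaldeck code.

import logging

from rich.console import Console
from rich.logging import RichHandler

console = Console()
logger = logging.getLogger("demo")  # hypothetical package logger, not evaldeck's


def setup_logging(verbose: bool = False) -> None:
    """Attach a RichHandler to this package's logger only; leave the root logger alone."""
    handler = RichHandler(console=console, show_time=False, show_path=False)
    handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)

    # Third-party HTTP clients are chatty at INFO/DEBUG; raise their threshold
    # so verbose runs stay focused on the package's own messages.
    for noisy in ("httpx", "httpcore", "openai", "anthropic"):
        logging.getLogger(noisy).setLevel(logging.WARNING)


if __name__ == "__main__":
    setup_logging(verbose=True)
    logger.debug("debug output rendered through RichHandler")
    logger.info("info output rendered through RichHandler")

Configuring only the named logger, rather than calling logging.basicConfig, is what keeps other libraries' log records out of the CLI output.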

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/config.py

@@ -69,10 +69,12 @@ class EvaldeckConfig(BaseModel):
     execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
 
     # Legacy execution defaults (deprecated, use execution instead)
-    defaults: dict[str, Any] = Field(default_factory=lambda: {
-        "timeout": 30,
-        "retries": 0,
-    })
+    defaults: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "timeout": 30,
+            "retries": 0,
+        }
+    )
 
     # Grader configuration
     graders: GraderDefaults = Field(default_factory=GraderDefaults)

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/evaluator.py

@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import asyncio
-from collections.abc import Awaitable, Callable
+from collections.abc import AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from datetime import datetime
 from typing import TYPE_CHECKING, Any
@@ -32,6 +32,7 @@ from evaldeck.results import (
     EvaluationResult,
     GradeResult,
     GradeStatus,
+    MetricResult,
     RunResult,
     SuiteResult,
 )
@@ -266,7 +267,7 @@ class Evaluator:
         )
 
         # Run graders concurrently
-        async def run_grader(grader):
+        async def run_grader(grader: BaseGrader) -> GradeResult:
             try:
                 return await grader.grade_async(trace, test_case)
             except Exception as e:
@@ -278,7 +279,7 @@
             result.add_grade(grade)
 
         # Calculate metrics concurrently (supports async custom metrics)
-        async def run_metric(metric):
+        async def run_metric(metric: BaseMetric) -> MetricResult | None:
             try:
                 return await metric.calculate_async(trace, test_case)
             except Exception:
@@ -348,7 +349,7 @@
         semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
 
         @asynccontextmanager
-        async def maybe_semaphore():
+        async def maybe_semaphore() -> AsyncIterator[None]:
             """Context manager that optionally acquires semaphore."""
             if semaphore:
                 async with semaphore:
@@ -371,11 +372,11 @@
         # Add results in original order
         results_by_index: dict[int, EvaluationResult] = {}
         for item in results:
-            if isinstance(item, Exception):
+            if isinstance(item, BaseException):
                 # This shouldn't happen since _evaluate_single_async catches exceptions
                 continue
-            index, result = item
-            results_by_index[index] = result
+            idx, res = item
+            results_by_index[idx] = res
 
         for i in range(len(suite.test_cases)):
             if i in results_by_index:
@@ -414,7 +415,7 @@
             trace = await agent_func(test_case.input)  # type: ignore
         else:
             # Run sync function in thread pool to not block event loop
-            trace = await asyncio.to_thread(agent_func, test_case.input)  # type: ignore
+            trace = await asyncio.to_thread(agent_func, test_case.input)
 
         # Use async evaluate to run graders concurrently
         return await self.evaluate_async(trace, test_case)
@@ -584,4 +585,4 @@ class EvaluationRunner:
         else:
             raise ValueError(f"Unknown framework: {agent_config.framework}")
 
-        return func
+        return func  # type: ignore[no-any-return]
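
The evaluator.py edits are mostly typing around two asyncio patterns: an optional-semaphore context manager, whose @asynccontextmanager generator is correctly annotated as returning AsyncIterator[None], and result handling for asyncio.gather(..., return_exceptions=True), where items may be BaseException subclasses (e.g. CancelledError), not just Exception. A self-contained sketch of both, with made-up task names:

import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

semaphore: asyncio.Semaphore | None = asyncio.Semaphore(2)  # None would disable the limit


@asynccontextmanager
async def maybe_semaphore() -> AsyncIterator[None]:
    """Acquire the semaphore if one is configured, otherwise do nothing."""
    if semaphore is not None:
        async with semaphore:
            yield
    else:
        yield


async def work(i: int) -> tuple[int, str]:
    async with maybe_semaphore():
        await asyncio.sleep(0.01)
        return i, f"result-{i}"


async def main() -> None:
    results = await asyncio.gather(*(work(i) for i in range(5)), return_exceptions=True)
    by_index: dict[int, str] = {}
    for item in results:
        # gather(return_exceptions=True) can hand back BaseException subclasses
        # such as CancelledError, so the guard is wider than Exception.
        if isinstance(item, BaseException):
            continue
        idx, res = item
        by_index[idx] = res
    print(by_index)


asyncio.run(main())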

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/base.py

@@ -113,7 +113,7 @@ class CompositeGrader(BaseGrader):
         # Handle any exceptions
         grade_results: list[GradeResult] = []
         for i, result in enumerate(results):
-            if isinstance(result, Exception):
+            if isinstance(result, BaseException):
                 grade_results.append(
                     GradeResult.error_result(self.graders[i].name, f"Grader error: {result}")
                 )

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/code.py

@@ -508,7 +508,7 @@ class CustomGrader(BaseGrader):
         self.func = func
         self.module_name = module
         self.function_name = function
-        self._loaded_func: Callable | None = None
+        self._loaded_func: Callable[..., GradeResult] | None = None
 
     def _get_func(self) -> Callable[[Trace, EvalCase], GradeResult]:
         """Get the grading function."""
@@ -550,7 +550,7 @@
         try:
             func = self._get_func()
             if asyncio.iscoroutinefunction(func):
-                return await func(trace, test_case)
+                return await func(trace, test_case)  # type: ignore[no-any-return]
             else:
                 return await asyncio.to_thread(func, trace, test_case)
         except Exception as e:
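
The CustomGrader typing above wraps a user-supplied grading callable that may be sync or async; grade_async dispatches on asyncio.iscoroutinefunction and pushes sync callables into a worker thread. The dispatch pattern on its own, with placeholder grading functions (names here are hypothetical):

import asyncio
from collections.abc import Awaitable, Callable


def sync_grader(output: str) -> bool:
    return "ok" in output


async def async_grader(output: str) -> bool:
    await asyncio.sleep(0)
    return "ok" in output


async def grade(func: Callable[[str], bool | Awaitable[bool]], output: str) -> bool:
    """Run a sync or async grading callable without blocking the event loop."""
    if asyncio.iscoroutinefunction(func):
        return await func(output)
    # Sync callables go to a thread so a slow grader cannot stall other tasks.
    return await asyncio.to_thread(func, output)


async def main() -> None:
    print(await grade(sync_grader, "ok result"))   # True
    print(await grade(async_grader, "not great"))  # False


asyncio.run(main())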

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/graders/llm.py

@@ -165,6 +165,7 @@ REASON: Your explanation
         """Call Anthropic API (sync)."""
         try:
             from anthropic import Anthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -176,12 +177,17 @@
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-        return response.content[0].text
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     async def _call_anthropic_async(self, prompt: str) -> str:
         """Call Anthropic API (async)."""
         try:
             from anthropic import AsyncAnthropic
+            from anthropic.types import TextBlock
         except ImportError:
             raise ImportError(
                 "Anthropic package not installed. Run: pip install evaldeck[anthropic]"
@@ -193,7 +199,11 @@
             max_tokens=1024,
             messages=[{"role": "user", "content": prompt}],
         )
-        return response.content[0].text
+        # Extract text from first TextBlock
+        for block in response.content:
+            if isinstance(block, TextBlock):
+                return block.text
+        return ""
 
     def _parse_response(self, response: str) -> tuple[GradeStatus, str, float | None]:
         """Parse LLM response to extract verdict.

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/langchain.py

@@ -5,7 +5,9 @@ Provides automatic instrumentation and trace capture for LangChain/LangGraph age
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Callable
+import threading
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from evaldeck.trace import Trace
@@ -16,12 +18,17 @@ class LangChainIntegration:
 
     Automatically sets up OpenTelemetry tracing and provides a wrapper
     that invokes the agent and returns a Trace.
+
+    Thread-safe: uses thread-local storage to track traces per thread,
+    allowing parallel test execution.
     """
 
     def __init__(self) -> None:
         self._processor: Any = None
         self._agent: Any = None
         self._initialized = False
+        self._lock = threading.Lock()
+        self._local = threading.local()
 
     def setup(self, agent_factory: Callable[[], Any]) -> None:
         """Set up instrumentation and create the agent.
@@ -46,7 +53,7 @@
         # Set up OTel tracing
         self._processor = setup_otel_tracing()
 
-        # Instrument LangChain
+        # Instrument LangChain (only once)
         LangChainInstrumentor().instrument()
 
         # Create the agent
@@ -56,6 +63,9 @@
     def run(self, input: str) -> Trace:
         """Run the agent and return a trace.
 
+        Note: Agent invocations are serialized (one at a time) to ensure
+        clean trace capture. Evaluations (grading) can still run in parallel.
+
         Args:
             input: The input string to send to the agent.
 
@@ -65,18 +75,30 @@
         if not self._initialized:
             raise RuntimeError("Integration not initialized. Call setup() first.")
 
-        # Reset processor for fresh trace
-        self._processor.reset()
+        # Serialize agent invocations to ensure clean trace capture
+        # (OTel trace IDs can get mixed when agents run truly in parallel)
+        with self._lock:
+            # Record traces before
+            traces_before = set(self._processor._traces.keys())
+
+            # Invoke the agent
+            self._invoke_agent(input)
+
+            # Find the new trace created by this invocation
+            traces_after = set(self._processor._traces.keys())
+            new_trace_ids = traces_after - traces_before
+
+            if not new_trace_ids:
+                raise RuntimeError("No trace captured from agent execution")
 
-        # Invoke the agent - auto-detect format
-        self._invoke_agent(input)
+            # Get the trace
+            trace_id = new_trace_ids.pop()
+            trace: Trace | None = self._processor.get_trace(trace_id)
 
-        # Get and return trace
-        trace = self._processor.get_latest_trace()
-        if trace is None:
-            raise RuntimeError("No trace captured from agent execution")
+            if trace is None:
+                raise RuntimeError("No trace captured from agent execution")
 
-        return trace
+            return trace
 
     def _invoke_agent(self, input: str) -> Any:
         """Invoke the agent with the appropriate format.

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/integrations/opentelemetry.py

@@ -139,9 +139,12 @@ class EvaldeckSpanProcessor(SpanProcessor):
         trace.input = str(attrs.get("input.value", trace.input or ""))
         trace.output = attrs.get("output.value")
         trace.status = self._map_trace_status(span)
-        trace.started_at = self._ns_to_datetime(span.start_time)
-        trace.completed_at = self._ns_to_datetime(span.end_time)
-        trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
+        if span.start_time is not None:
+            trace.started_at = self._ns_to_datetime(span.start_time)
+        if span.end_time is not None:
+            trace.completed_at = self._ns_to_datetime(span.end_time)
+        if span.start_time is not None and span.end_time is not None:
+            trace.duration_ms = (span.end_time - span.start_time) / 1_000_000
 
         # Extract agent/framework info
         if "llm.system" in attrs:
@@ -150,9 +153,7 @@
         trace.metadata["otel_trace_id"] = format(span.context.trace_id, "032x")
         trace.metadata["otel_root_span_id"] = format(span.context.span_id, "016x")
 
-    def _span_to_step(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step | None:
+    def _span_to_step(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step | None:
         """Convert an OpenTelemetry span to an Evaldeck Step."""
 
         if kind == SPAN_KIND_LLM:
@@ -196,11 +197,7 @@
 
     def _convert_tool_span(self, span: ReadableSpan, attrs: dict[str, Any]) -> Step:
         """Convert a TOOL span to a Step."""
-        tool_name = (
-            attrs.get("tool.name")
-            or attrs.get("tool_call.function.name")
-            or "unknown_tool"
-        )
+        tool_name = attrs.get("tool.name") or attrs.get("tool_call.function.name") or "unknown_tool"
 
         tool_args = self._parse_json(
             attrs.get("tool.parameters")
@@ -222,9 +219,7 @@
             },
         )
 
-    def _convert_retrieval_span(
-        self, span: ReadableSpan, kind: str, attrs: dict[str, Any]
-    ) -> Step:
+    def _convert_retrieval_span(self, span: ReadableSpan, kind: str, attrs: dict[str, Any]) -> Step:
         """Convert EMBEDDING/RETRIEVER/RERANKER spans to tool call Steps."""
         return Step(
             type=StepType.TOOL_CALL,
@@ -322,12 +317,16 @@
     def _extract_error(self, span: ReadableSpan) -> str | None:
         """Extract error message from span if present."""
         if span.status.status_code == StatusCode.ERROR:
-            return span.status.description
+            desc: str | None = span.status.description
+            return desc
         return None
 
     def _calc_duration_ms(self, span: ReadableSpan) -> float:
         """Calculate span duration in milliseconds."""
-        return (span.end_time - span.start_time) / 1_000_000
+        if span.start_time is None or span.end_time is None:
+            return 0.0
+        duration: float = (span.end_time - span.start_time) / 1_000_000
+        return duration
 
     def _ns_to_datetime(self, ns: int) -> datetime:
         """Convert nanoseconds timestamp to datetime."""

{evaldeck-0.1.3 → evaldeck-0.1.4}/src/evaldeck/test_case.py

@@ -109,7 +109,8 @@ class EvalCase(BaseModel):
 
     def to_yaml(self) -> str:
         """Convert test case to YAML string."""
-        return yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)
+        result: str = yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)
+        return result
 
 
 class EvalSuite(BaseModel):