evaldeck-0.1.0-py3-none-any.whl

evaldeck/results.py ADDED
@@ -0,0 +1,211 @@
"""Evaluation result data models."""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class GradeStatus(str, Enum):
    """Status of a grading result."""

    PASS = "pass"
    FAIL = "fail"
    ERROR = "error"
    SKIP = "skip"

class GradeResult(BaseModel):
    """Result from a single grader."""

    grader_name: str
    status: GradeStatus
    score: float | None = None  # 0.0 to 1.0
    message: str | None = None
    details: dict[str, Any] = Field(default_factory=dict)

    # For debugging
    expected: Any | None = None
    actual: Any | None = None

    @property
    def passed(self) -> bool:
        """Check if this grade passed."""
        return self.status == GradeStatus.PASS

    @classmethod
    def passed_result(
        cls, grader_name: str, message: str | None = None, **kwargs: Any
    ) -> GradeResult:
        """Create a passing result."""
        return cls(grader_name=grader_name, status=GradeStatus.PASS, message=message, **kwargs)

    @classmethod
    def failed_result(
        cls,
        grader_name: str,
        message: str,
        expected: Any = None,
        actual: Any = None,
        **kwargs: Any,
    ) -> GradeResult:
        """Create a failing result."""
        return cls(
            grader_name=grader_name,
            status=GradeStatus.FAIL,
            message=message,
            expected=expected,
            actual=actual,
            **kwargs,
        )

    @classmethod
    def error_result(cls, grader_name: str, message: str, **kwargs: Any) -> GradeResult:
        """Create an error result."""
        return cls(grader_name=grader_name, status=GradeStatus.ERROR, message=message, **kwargs)

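For context, a grader built on these models returns one of the factory results above. A minimal sketch (the length_check grader and its threshold are illustrative, not part of the package):

from evaldeck.results import GradeResult

def grade_output_length(output: str, max_chars: int = 500) -> GradeResult:
    # Hypothetical grader: pass when the agent's output stays within a length budget.
    if len(output) <= max_chars:
        return GradeResult.passed_result("length_check", score=1.0)
    return GradeResult.failed_result(
        "length_check",
        message=f"output too long: {len(output)} > {max_chars} chars",
        expected=f"<= {max_chars} chars",
        actual=len(output),
        score=0.0,
    )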
class MetricResult(BaseModel):
    """Result from a metric calculation."""

    metric_name: str
    value: float
    unit: str | None = None
    details: dict[str, Any] = Field(default_factory=dict)

class EvaluationResult(BaseModel):
    """Complete result of evaluating a single test case."""

    test_case_name: str
    status: GradeStatus
    grades: list[GradeResult] = Field(default_factory=list)
    metrics: list[MetricResult] = Field(default_factory=list)

    # Execution info
    duration_ms: float | None = None
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None

    # For debugging
    trace_id: str | None = None
    error: str | None = None

    @property
    def passed(self) -> bool:
        """Check if the evaluation passed."""
        return self.status == GradeStatus.PASS

    @property
    def failed_grades(self) -> list[GradeResult]:
        """Get all failed grades."""
        return [g for g in self.grades if g.status == GradeStatus.FAIL]

    @property
    def pass_rate(self) -> float:
        """Calculate pass rate across all grades."""
        if not self.grades:
            return 0.0
        passed = sum(1 for g in self.grades if g.passed)
        return passed / len(self.grades)

    def add_grade(self, grade: GradeResult) -> None:
        """Add a grade result."""
        self.grades.append(grade)
        # Update overall status: an error takes precedence over a failure
        if grade.status == GradeStatus.ERROR:
            self.status = GradeStatus.ERROR
        elif grade.status == GradeStatus.FAIL and self.status != GradeStatus.ERROR:
            self.status = GradeStatus.FAIL

    def add_metric(self, metric: MetricResult) -> None:
        """Add a metric result."""
        self.metrics.append(metric)

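The rollup in add_grade means a single error outranks any failure. A small sketch of that behavior (the test case and grader names are invented):

from evaldeck.results import EvaluationResult, GradeResult, GradeStatus

result = EvaluationResult(test_case_name="refund-flow", status=GradeStatus.PASS)
result.add_grade(GradeResult.passed_result("tool_check"))
result.add_grade(GradeResult.failed_result("contains_check", message="missing 'refund'"))
result.add_grade(GradeResult.error_result("llm_judge", message="judge timed out"))

assert result.status == GradeStatus.ERROR  # error wins over fail
assert result.pass_rate == 1 / 3           # one of three grades passed
assert len(result.failed_grades) == 1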
class SuiteResult(BaseModel):
    """Result of evaluating a test suite."""

    suite_name: str
    results: list[EvaluationResult] = Field(default_factory=list)
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None

    @property
    def total(self) -> int:
        """Total number of test cases."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of passed test cases."""
        return sum(1 for r in self.results if r.passed)

    @property
    def failed(self) -> int:
        """Number of failed test cases."""
        return sum(1 for r in self.results if r.status == GradeStatus.FAIL)

    @property
    def errors(self) -> int:
        """Number of errored test cases."""
        return sum(1 for r in self.results if r.status == GradeStatus.ERROR)

    @property
    def pass_rate(self) -> float:
        """Overall pass rate."""
        if not self.results:
            return 0.0
        return self.passed / self.total

    @property
    def duration_ms(self) -> float:
        """Total duration in milliseconds."""
        return sum(r.duration_ms or 0 for r in self.results)

    def add_result(self, result: EvaluationResult) -> None:
        """Add an evaluation result."""
        self.results.append(result)

class RunResult(BaseModel):
    """Result of a complete evaluation run (multiple suites)."""

    suites: list[SuiteResult] = Field(default_factory=list)
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None
    config: dict[str, Any] = Field(default_factory=dict)

    @property
    def total(self) -> int:
        """Total test cases across all suites."""
        return sum(s.total for s in self.suites)

    @property
    def passed(self) -> int:
        """Total passed across all suites."""
        return sum(s.passed for s in self.suites)

    @property
    def failed(self) -> int:
        """Total failed across all suites."""
        return sum(s.failed for s in self.suites)

    @property
    def pass_rate(self) -> float:
        """Overall pass rate."""
        if self.total == 0:
            return 0.0
        return self.passed / self.total

    @property
    def all_passed(self) -> bool:
        """Check if all tests passed."""
        return self.passed == self.total

    def add_suite(self, suite: SuiteResult) -> None:
        """Add a suite result."""
        self.suites.append(suite)
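Taken together, the models roll up one level at a time: grades into an EvaluationResult, results into a SuiteResult, suites into a RunResult. A sketch of the aggregation (the suite and test names are invented):

from evaldeck.results import EvaluationResult, GradeStatus, RunResult, SuiteResult

suite = SuiteResult(suite_name="smoke")
suite.add_result(EvaluationResult(test_case_name="t1", status=GradeStatus.PASS))
suite.add_result(EvaluationResult(test_case_name="t2", status=GradeStatus.FAIL))

run = RunResult()
run.add_suite(suite)
print(run.total, run.passed, f"{run.pass_rate:.0%}")  # 2 1 50%
print(run.all_passed)  # False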
evaldeck/test_case.py ADDED
@@ -0,0 +1,162 @@
"""Test case data models for defining agent evaluations."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field

class ExpectedBehavior(BaseModel):
    """Expected behavior for an agent test case."""

    # Tool expectations
    tools_called: list[str] | None = None
    tools_not_called: list[str] | None = None
    tool_call_order: list[str] | None = None

    # Output expectations
    output_contains: list[str] | None = None
    output_not_contains: list[str] | None = None
    output_equals: str | None = None
    output_matches: str | None = None  # Regex pattern

    # Execution expectations
    max_steps: int | None = None
    min_steps: int | None = None
    task_completed: bool | None = None

    # Custom assertions (for code-based graders)
    custom: dict[str, Any] | None = None

class GraderConfig(BaseModel):
    """Configuration for a grader."""

    type: str  # "contains", "tool_called", "llm", "custom", etc.
    params: dict[str, Any] = Field(default_factory=dict)

    # For LLM graders
    prompt: str | None = None
    model: str | None = None
    threshold: float | None = None

    # For custom graders
    module: str | None = None
    function: str | None = None

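A grader config is deliberately loose: type selects the grader and everything else is optional. Two sketches using type strings from the comment above (the exact registered grader names are not confirmed by this file; the params, prompt, and model values are invented):

from evaldeck.test_case import GraderConfig

contains = GraderConfig(type="contains", params={"text": "refund issued"})
judge = GraderConfig(
    type="llm",
    prompt="Did the agent actually issue the refund?",
    model="gpt-4o-mini",
    threshold=0.8,
)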
class EvalCase(BaseModel):
    """A test case for evaluating an agent.

    Test cases define an input to send to the agent and the expected
    behavior/output to validate against.
    """

    name: str
    description: str | None = None
    input: str
    expected: ExpectedBehavior = Field(default_factory=ExpectedBehavior)
    graders: list[GraderConfig] = Field(default_factory=list)

    # Execution config
    timeout: float | None = None
    retries: int | None = None

    # Metadata
    tags: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)

    # Reference data (for grading)
    reference_output: str | None = None
    reference_tools: list[str] | None = None

    @classmethod
    def from_yaml(cls, path: str | Path) -> EvalCase:
        """Load a test case from a YAML file."""
        with open(path) as f:
            data = yaml.safe_load(f)
        return cls._from_dict(data)

    @classmethod
    def from_yaml_string(cls, content: str) -> EvalCase:
        """Load a test case from a YAML string."""
        data = yaml.safe_load(content)
        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: dict[str, Any]) -> EvalCase:
        """Create test case from dictionary, handling nested structures."""
        # Handle expected behavior
        if "expected" in data and isinstance(data["expected"], dict):
            data["expected"] = ExpectedBehavior(**data["expected"])

        # Handle graders
        if "graders" in data:
            graders = []
            for g in data["graders"]:
                if isinstance(g, dict):
                    graders.append(GraderConfig(**g))
                else:
                    graders.append(g)
            data["graders"] = graders

        return cls(**data)

    def to_yaml(self) -> str:
        """Convert test case to YAML string."""
        return yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)

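Given the fields above, a YAML test case might look like the following. This is a hedged sketch: the keys mirror the model fields, but the tool names, tags, and grader type are invented.

from evaldeck.test_case import EvalCase

case = EvalCase.from_yaml_string("""
name: refund-happy-path
description: Agent should look up the order and issue a refund
input: "Please refund order #1234"
expected:
  tools_called: [lookup_order, issue_refund]
  output_contains: [refund]
  max_steps: 6
graders:
  - type: tool_called
    params: {name: issue_refund}
tags: [smoke, refunds]
""")
assert case.expected.tools_called == ["lookup_order", "issue_refund"]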
class EvalSuite(BaseModel):
    """A collection of test cases."""

    name: str
    description: str | None = None
    test_cases: list[EvalCase] = Field(default_factory=list)

    # Suite-level defaults
    defaults: dict[str, Any] = Field(default_factory=dict)
    tags: list[str] = Field(default_factory=list)

    @classmethod
    def from_directory(cls, path: str | Path, name: str | None = None) -> EvalSuite:
        """Load all test cases from a directory."""
        path = Path(path)
        if not path.is_dir():
            raise ValueError(f"Path is not a directory: {path}")

        test_cases = []
        for file in sorted(path.glob("*.yaml")):
            if file.name.startswith("_"):
                continue
            try:
                test_cases.append(EvalCase.from_yaml(file))
            except Exception as e:
                raise ValueError(f"Failed to load {file}: {e}") from e

        # Also pick up the .yml spelling; each extension is loaded in sorted order
        for file in sorted(path.glob("*.yml")):
            if file.name.startswith("_"):
                continue
            try:
                test_cases.append(EvalCase.from_yaml(file))
            except Exception as e:
                raise ValueError(f"Failed to load {file}: {e}") from e

        return cls(
            name=name or path.name,
            test_cases=test_cases,
        )

    def filter_by_tags(self, tags: list[str]) -> EvalSuite:
        """Return a new suite with only test cases matching the given tags."""
        filtered = [tc for tc in self.test_cases if any(t in tc.tags for t in tags)]
        return EvalSuite(
            name=self.name,
            description=self.description,
            test_cases=filtered,
            defaults=self.defaults,
            tags=self.tags,
        )
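Loading and slicing a suite then looks like this (the directory path and tag are invented):

from evaldeck.test_case import EvalSuite

suite = EvalSuite.from_directory("evals/refunds")
smoke = suite.filter_by_tags(["smoke"])
print(f"{suite.name}: {len(smoke.test_cases)}/{len(suite.test_cases)} cases tagged smoke")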
evaldeck/trace.py ADDED
@@ -0,0 +1,215 @@
"""Trace data models for capturing agent execution."""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field

class StepType(str, Enum):
    """Type of step in an agent trace."""

    LLM_CALL = "llm_call"
    TOOL_CALL = "tool_call"
    REASONING = "reasoning"
    HUMAN_INPUT = "human_input"


class StepStatus(str, Enum):
    """Status of a step execution."""

    SUCCESS = "success"
    FAILURE = "failure"
    PENDING = "pending"


class TraceStatus(str, Enum):
    """Status of the overall trace execution."""

    SUCCESS = "success"
    FAILURE = "failure"
    TIMEOUT = "timeout"
    ERROR = "error"

class TokenUsage(BaseModel):
    """Token usage for an LLM call."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    @property
    def cost_estimate(self) -> float | None:
        """Estimate cost based on token usage. Returns None if model unknown."""
        # Placeholder: no pricing table is implemented yet, so this always returns None.
        return None

class Step(BaseModel):
    """A single step in an agent's execution trace.

    Steps can represent LLM calls, tool calls, reasoning steps, or human input.
    """

    id: str = Field(default_factory=lambda: "")
    type: StepType
    timestamp: datetime = Field(default_factory=datetime.now)
    status: StepStatus = StepStatus.SUCCESS

    # For LLM calls
    model: str | None = None
    input: str | None = None
    output: str | None = None
    tokens: TokenUsage | None = None

    # For tool calls
    tool_name: str | None = None
    tool_args: dict[str, Any] | None = None
    tool_result: Any | None = None

    # For reasoning steps
    reasoning_text: str | None = None

    # Metadata
    parent_id: str | None = None
    error: str | None = None
    duration_ms: float | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    def model_post_init(self, __context: Any) -> None:
        """Generate ID if not provided."""
        if not self.id:
            import uuid

            self.id = str(uuid.uuid4())[:8]

    @classmethod
    def llm_call(
        cls,
        model: str,
        input: str,
        output: str,
        tokens: TokenUsage | None = None,
        **kwargs: Any,
    ) -> Step:
        """Create an LLM call step."""
        return cls(
            type=StepType.LLM_CALL,
            model=model,
            input=input,
            output=output,
            tokens=tokens,
            **kwargs,
        )

    @classmethod
    def tool_call(
        cls,
        tool_name: str,
        tool_args: dict[str, Any] | None = None,
        tool_result: Any = None,
        **kwargs: Any,
    ) -> Step:
        """Create a tool call step."""
        return cls(
            type=StepType.TOOL_CALL,
            tool_name=tool_name,
            tool_args=tool_args or {},
            tool_result=tool_result,
            **kwargs,
        )

    @classmethod
    def reasoning(cls, text: str, **kwargs: Any) -> Step:
        """Create a reasoning step."""
        return cls(
            type=StepType.REASONING,
            reasoning_text=text,
            **kwargs,
        )

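The factory constructors keep trace-building terse. A sketch (the model name, tool name, and values are invented):

from evaldeck.trace import Step, TokenUsage

lookup = Step.tool_call(
    tool_name="lookup_order",
    tool_args={"order_id": "1234"},
    tool_result={"status": "shipped"},
    duration_ms=42.0,
)
answer = Step.llm_call(
    model="gpt-4o-mini",
    input="Where is order 1234?",
    output="The order shipped yesterday.",
    tokens=TokenUsage(prompt_tokens=120, completion_tokens=15, total_tokens=135),
)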
class Trace(BaseModel):
    """Complete execution trace of an agent.

    A trace captures everything that happened during an agent's execution,
    from the initial input to the final output, including all intermediate
    steps (LLM calls, tool calls, reasoning).
    """

    id: str = Field(default_factory=lambda: "")
    input: str
    output: str | None = None
    status: TraceStatus = TraceStatus.SUCCESS
    steps: list[Step] = Field(default_factory=list)

    # Timing
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None
    duration_ms: float | None = None

    # Metadata
    metadata: dict[str, Any] = Field(default_factory=dict)
    framework: str | None = None
    agent_name: str | None = None

    def model_post_init(self, __context: Any) -> None:
        """Generate ID if not provided."""
        if not self.id:
            import uuid

            self.id = str(uuid.uuid4())[:8]

    @property
    def tool_calls(self) -> list[Step]:
        """Get all tool call steps."""
        return [s for s in self.steps if s.type == StepType.TOOL_CALL]

    @property
    def llm_calls(self) -> list[Step]:
        """Get all LLM call steps."""
        return [s for s in self.steps if s.type == StepType.LLM_CALL]

    @property
    def tools_called(self) -> list[str]:
        """Get list of tool names that were called."""
        return [s.tool_name for s in self.tool_calls if s.tool_name]

    @property
    def total_tokens(self) -> int:
        """Get total tokens used across all LLM calls."""
        total = 0
        for step in self.llm_calls:
            if step.tokens:
                total += step.tokens.total_tokens
        return total

    @property
    def step_count(self) -> int:
        """Get total number of steps."""
        return len(self.steps)

    def add_step(self, step: Step) -> None:
        """Add a step to the trace."""
        self.steps.append(step)

    def complete(self, output: str, status: TraceStatus = TraceStatus.SUCCESS) -> None:
        """Mark the trace as complete."""
        self.output = output
        self.status = status
        self.completed_at = datetime.now()
        if self.started_at:
            delta = self.completed_at - self.started_at
            self.duration_ms = delta.total_seconds() * 1000

    def to_dict(self) -> dict[str, Any]:
        """Convert trace to dictionary."""
        return self.model_dump(mode="json")

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Trace:
        """Create trace from dictionary."""
        return cls.model_validate(data)
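End to end, a framework adapter would populate a trace roughly like this. A hedged sketch, not an adapter shipped in this file; the framework name, agent name, and values are invented:

from evaldeck.trace import Step, TokenUsage, Trace, TraceStatus

trace = Trace(input="Where is order 1234?", agent_name="support-bot", framework="langgraph")
trace.add_step(Step.llm_call(
    model="gpt-4o-mini",
    input=trace.input,
    output="Let me check that order.",
    tokens=TokenUsage(total_tokens=135),
))
trace.add_step(Step.tool_call(tool_name="lookup_order", tool_args={"order_id": "1234"}))
trace.complete("Your order shipped yesterday.")

assert trace.tools_called == ["lookup_order"]
assert trace.total_tokens == 135
assert trace.status is TraceStatus.SUCCESS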