evaldeck-0.1.0-py3-none-any.whl

evaldeck/results.py ADDED
@@ -0,0 +1,211 @@
"""Evaluation result data models."""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class GradeStatus(str, Enum):
    """Status of a grading result."""

    PASS = "pass"
    FAIL = "fail"
    ERROR = "error"
    SKIP = "skip"

class GradeResult(BaseModel):
    """Result from a single grader."""

    grader_name: str
    status: GradeStatus
    score: float | None = None  # 0.0 to 1.0
    message: str | None = None
    details: dict[str, Any] = Field(default_factory=dict)

    # For debugging
    expected: Any | None = None
    actual: Any | None = None

    @property
    def passed(self) -> bool:
        """Check if this grade passed."""
        return self.status == GradeStatus.PASS

    @classmethod
    def passed_result(
        cls, grader_name: str, message: str | None = None, **kwargs: Any
    ) -> GradeResult:
        """Create a passing result."""
        return cls(grader_name=grader_name, status=GradeStatus.PASS, message=message, **kwargs)

    @classmethod
    def failed_result(
        cls,
        grader_name: str,
        message: str,
        expected: Any = None,
        actual: Any = None,
        **kwargs: Any,
    ) -> GradeResult:
        """Create a failing result."""
        return cls(
            grader_name=grader_name,
            status=GradeStatus.FAIL,
            message=message,
            expected=expected,
            actual=actual,
            **kwargs,
        )

    @classmethod
    def error_result(cls, grader_name: str, message: str, **kwargs: Any) -> GradeResult:
        """Create an error result."""
        return cls(grader_name=grader_name, status=GradeStatus.ERROR, message=message, **kwargs)

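For context, a grader built on these models returns one of the factory results above. A minimal sketch (the length_check grader and its threshold are illustrative, not part of the package):

from evaldeck.results import GradeResult

def grade_output_length(output: str, max_chars: int = 500) -> GradeResult:
    # Hypothetical grader: pass when the agent's output stays within a length budget.
    if len(output) <= max_chars:
        return GradeResult.passed_result("length_check", score=1.0)
    return GradeResult.failed_result(
        "length_check",
        message=f"output too long: {len(output)} > {max_chars} chars",
        expected=f"<= {max_chars} chars",
        actual=len(output),
        score=0.0,
    )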
class MetricResult(BaseModel):
    """Result from a metric calculation."""

    metric_name: str
    value: float
    unit: str | None = None
    details: dict[str, Any] = Field(default_factory=dict)

class EvaluationResult(BaseModel):
    """Complete result of evaluating a single test case."""

    test_case_name: str
    status: GradeStatus
    grades: list[GradeResult] = Field(default_factory=list)
    metrics: list[MetricResult] = Field(default_factory=list)

    # Execution info
    duration_ms: float | None = None
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None

    # For debugging
    trace_id: str | None = None
    error: str | None = None

    @property
    def passed(self) -> bool:
        """Check if the evaluation passed."""
        return self.status == GradeStatus.PASS

    @property
    def failed_grades(self) -> list[GradeResult]:
        """Get all failed grades."""
        return [g for g in self.grades if g.status == GradeStatus.FAIL]

    @property
    def pass_rate(self) -> float:
        """Calculate pass rate across all grades."""
        if not self.grades:
            return 0.0
        passed = sum(1 for g in self.grades if g.passed)
        return passed / len(self.grades)

    def add_grade(self, grade: GradeResult) -> None:
        """Add a grade result."""
        self.grades.append(grade)
        # Update overall status: an error takes precedence over a failure
        if grade.status == GradeStatus.ERROR:
            self.status = GradeStatus.ERROR
        elif grade.status == GradeStatus.FAIL and self.status != GradeStatus.ERROR:
            self.status = GradeStatus.FAIL

    def add_metric(self, metric: MetricResult) -> None:
        """Add a metric result."""
        self.metrics.append(metric)

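The rollup in add_grade means a single error outranks any failure. A small sketch of that behavior (the test case and grader names are invented):

from evaldeck.results import EvaluationResult, GradeResult, GradeStatus

result = EvaluationResult(test_case_name="refund-flow", status=GradeStatus.PASS)
result.add_grade(GradeResult.passed_result("tool_check"))
result.add_grade(GradeResult.failed_result("contains_check", message="missing 'refund'"))
result.add_grade(GradeResult.error_result("llm_judge", message="judge timed out"))

assert result.status == GradeStatus.ERROR  # error wins over fail
assert result.pass_rate == 1 / 3           # one of three grades passed
assert len(result.failed_grades) == 1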
class SuiteResult(BaseModel):
    """Result of evaluating a test suite."""

    suite_name: str
    results: list[EvaluationResult] = Field(default_factory=list)
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None

    @property
    def total(self) -> int:
        """Total number of test cases."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of passed test cases."""
        return sum(1 for r in self.results if r.passed)

    @property
    def failed(self) -> int:
        """Number of failed test cases."""
        return sum(1 for r in self.results if r.status == GradeStatus.FAIL)

    @property
    def errors(self) -> int:
        """Number of errored test cases."""
        return sum(1 for r in self.results if r.status == GradeStatus.ERROR)

    @property
    def pass_rate(self) -> float:
        """Overall pass rate."""
        if not self.results:
            return 0.0
        return self.passed / self.total

    @property
    def duration_ms(self) -> float:
        """Total duration in milliseconds."""
        return sum(r.duration_ms or 0 for r in self.results)

    def add_result(self, result: EvaluationResult) -> None:
        """Add an evaluation result."""
        self.results.append(result)

class RunResult(BaseModel):
    """Result of a complete evaluation run (multiple suites)."""

    suites: list[SuiteResult] = Field(default_factory=list)
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None
    config: dict[str, Any] = Field(default_factory=dict)

    @property
    def total(self) -> int:
        """Total test cases across all suites."""
        return sum(s.total for s in self.suites)

    @property
    def passed(self) -> int:
        """Total passed across all suites."""
        return sum(s.passed for s in self.suites)

    @property
    def failed(self) -> int:
        """Total failed across all suites."""
        return sum(s.failed for s in self.suites)

    @property
    def pass_rate(self) -> float:
        """Overall pass rate."""
        if self.total == 0:
            return 0.0
        return self.passed / self.total

    @property
    def all_passed(self) -> bool:
        """Check if all tests passed."""
        return self.passed == self.total

    def add_suite(self, suite: SuiteResult) -> None:
        """Add a suite result."""
        self.suites.append(suite)
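Taken together, the models roll up one level at a time: grades into an EvaluationResult, results into a SuiteResult, suites into a RunResult. A sketch of the aggregation (the suite and test names are invented):

from evaldeck.results import EvaluationResult, GradeStatus, RunResult, SuiteResult

suite = SuiteResult(suite_name="smoke")
suite.add_result(EvaluationResult(test_case_name="t1", status=GradeStatus.PASS))
suite.add_result(EvaluationResult(test_case_name="t2", status=GradeStatus.FAIL))

run = RunResult()
run.add_suite(suite)
print(run.total, run.passed, f"{run.pass_rate:.0%}")  # 2 1 50%
print(run.all_passed)  # False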
evaldeck/test_case.py ADDED
@@ -0,0 +1,162 @@
"""Test case data models for defining agent evaluations."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field

class ExpectedBehavior(BaseModel):
    """Expected behavior for an agent test case."""

    # Tool expectations
    tools_called: list[str] | None = None
    tools_not_called: list[str] | None = None
    tool_call_order: list[str] | None = None

    # Output expectations
    output_contains: list[str] | None = None
    output_not_contains: list[str] | None = None
    output_equals: str | None = None
    output_matches: str | None = None  # Regex pattern

    # Execution expectations
    max_steps: int | None = None
    min_steps: int | None = None
    task_completed: bool | None = None

    # Custom assertions (for code-based graders)
    custom: dict[str, Any] | None = None

class GraderConfig(BaseModel):
    """Configuration for a grader."""

    type: str  # "contains", "tool_called", "llm", "custom", etc.
    params: dict[str, Any] = Field(default_factory=dict)

    # For LLM graders
    prompt: str | None = None
    model: str | None = None
    threshold: float | None = None

    # For custom graders
    module: str | None = None
    function: str | None = None

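A grader config is deliberately loose: type selects the grader and everything else is optional. Two sketches using type strings from the comment above (the exact registered grader names are not confirmed by this file; the params, prompt, and model values are invented):

from evaldeck.test_case import GraderConfig

contains = GraderConfig(type="contains", params={"text": "refund issued"})
judge = GraderConfig(
    type="llm",
    prompt="Did the agent actually issue the refund?",
    model="gpt-4o-mini",
    threshold=0.8,
)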
class EvalCase(BaseModel):
    """A test case for evaluating an agent.

    Test cases define an input to send to the agent and the expected
    behavior/output to validate against.
    """

    name: str
    description: str | None = None
    input: str
    expected: ExpectedBehavior = Field(default_factory=ExpectedBehavior)
    graders: list[GraderConfig] = Field(default_factory=list)

    # Execution config
    timeout: float | None = None
    retries: int | None = None

    # Metadata
    tags: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)

    # Reference data (for grading)
    reference_output: str | None = None
    reference_tools: list[str] | None = None

    @classmethod
    def from_yaml(cls, path: str | Path) -> EvalCase:
        """Load a test case from a YAML file."""
        with open(path) as f:
            data = yaml.safe_load(f)
        return cls._from_dict(data)

    @classmethod
    def from_yaml_string(cls, content: str) -> EvalCase:
        """Load a test case from a YAML string."""
        data = yaml.safe_load(content)
        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: dict[str, Any]) -> EvalCase:
        """Create test case from dictionary, handling nested structures."""
        # Handle expected behavior
        if "expected" in data and isinstance(data["expected"], dict):
            data["expected"] = ExpectedBehavior(**data["expected"])

        # Handle graders
        if "graders" in data:
            graders = []
            for g in data["graders"]:
                if isinstance(g, dict):
                    graders.append(GraderConfig(**g))
                else:
                    graders.append(g)
            data["graders"] = graders

        return cls(**data)

    def to_yaml(self) -> str:
        """Convert test case to YAML string."""
        return yaml.dump(self.model_dump(exclude_none=True), default_flow_style=False)

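Given the fields above, a YAML test case might look like the following. This is a hedged sketch: the keys mirror the model fields, but the tool names, tags, and grader type are invented.

from evaldeck.test_case import EvalCase

case = EvalCase.from_yaml_string("""
name: refund-happy-path
description: Agent should look up the order and issue a refund
input: "Please refund order #1234"
expected:
  tools_called: [lookup_order, issue_refund]
  output_contains: [refund]
  max_steps: 6
graders:
  - type: tool_called
    params: {name: issue_refund}
tags: [smoke, refunds]
""")
assert case.expected.tools_called == ["lookup_order", "issue_refund"]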
class EvalSuite(BaseModel):
    """A collection of test cases."""

    name: str
    description: str | None = None
    test_cases: list[EvalCase] = Field(default_factory=list)

    # Suite-level defaults
    defaults: dict[str, Any] = Field(default_factory=dict)
    tags: list[str] = Field(default_factory=list)

    @classmethod
    def from_directory(cls, path: str | Path, name: str | None = None) -> EvalSuite:
        """Load all test cases from a directory."""
        path = Path(path)
        if not path.is_dir():
            raise ValueError(f"Path is not a directory: {path}")

        test_cases = []
        for file in sorted(path.glob("*.yaml")):
            if file.name.startswith("_"):
                continue
            try:
                test_cases.append(EvalCase.from_yaml(file))
            except Exception as e:
                raise ValueError(f"Failed to load {file}: {e}") from e

        # Also pick up the .yml spelling; each extension is loaded in sorted order
        for file in sorted(path.glob("*.yml")):
            if file.name.startswith("_"):
                continue
            try:
                test_cases.append(EvalCase.from_yaml(file))
            except Exception as e:
                raise ValueError(f"Failed to load {file}: {e}") from e

        return cls(
            name=name or path.name,
            test_cases=test_cases,
        )

    def filter_by_tags(self, tags: list[str]) -> EvalSuite:
        """Return a new suite with only test cases matching the given tags."""
        filtered = [tc for tc in self.test_cases if any(t in tc.tags for t in tags)]
        return EvalSuite(
            name=self.name,
            description=self.description,
            test_cases=filtered,
            defaults=self.defaults,
            tags=self.tags,
        )
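Loading and slicing a suite then looks like this (the directory path and tag are invented):

from evaldeck.test_case import EvalSuite

suite = EvalSuite.from_directory("evals/refunds")
smoke = suite.filter_by_tags(["smoke"])
print(f"{suite.name}: {len(smoke.test_cases)}/{len(suite.test_cases)} cases tagged smoke")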
evaldeck/trace.py ADDED
@@ -0,0 +1,215 @@
"""Trace data models for capturing agent execution."""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field

class StepType(str, Enum):
    """Type of step in an agent trace."""

    LLM_CALL = "llm_call"
    TOOL_CALL = "tool_call"
    REASONING = "reasoning"
    HUMAN_INPUT = "human_input"


class StepStatus(str, Enum):
    """Status of a step execution."""

    SUCCESS = "success"
    FAILURE = "failure"
    PENDING = "pending"


class TraceStatus(str, Enum):
    """Status of the overall trace execution."""

    SUCCESS = "success"
    FAILURE = "failure"
    TIMEOUT = "timeout"
    ERROR = "error"

class TokenUsage(BaseModel):
    """Token usage for an LLM call."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    @property
    def cost_estimate(self) -> float | None:
        """Estimate cost based on token usage. Returns None if model unknown."""
        # Placeholder: no pricing table is implemented yet, so this always returns None.
        return None

class Step(BaseModel):
    """A single step in an agent's execution trace.

    Steps can represent LLM calls, tool calls, reasoning steps, or human input.
    """

    id: str = Field(default_factory=lambda: "")
    type: StepType
    timestamp: datetime = Field(default_factory=datetime.now)
    status: StepStatus = StepStatus.SUCCESS

    # For LLM calls
    model: str | None = None
    input: str | None = None
    output: str | None = None
    tokens: TokenUsage | None = None

    # For tool calls
    tool_name: str | None = None
    tool_args: dict[str, Any] | None = None
    tool_result: Any | None = None

    # For reasoning steps
    reasoning_text: str | None = None

    # Metadata
    parent_id: str | None = None
    error: str | None = None
    duration_ms: float | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    def model_post_init(self, __context: Any) -> None:
        """Generate ID if not provided."""
        if not self.id:
            import uuid

            self.id = str(uuid.uuid4())[:8]

    @classmethod
    def llm_call(
        cls,
        model: str,
        input: str,
        output: str,
        tokens: TokenUsage | None = None,
        **kwargs: Any,
    ) -> Step:
        """Create an LLM call step."""
        return cls(
            type=StepType.LLM_CALL,
            model=model,
            input=input,
            output=output,
            tokens=tokens,
            **kwargs,
        )

    @classmethod
    def tool_call(
        cls,
        tool_name: str,
        tool_args: dict[str, Any] | None = None,
        tool_result: Any = None,
        **kwargs: Any,
    ) -> Step:
        """Create a tool call step."""
        return cls(
            type=StepType.TOOL_CALL,
            tool_name=tool_name,
            tool_args=tool_args or {},
            tool_result=tool_result,
            **kwargs,
        )

    @classmethod
    def reasoning(cls, text: str, **kwargs: Any) -> Step:
        """Create a reasoning step."""
        return cls(
            type=StepType.REASONING,
            reasoning_text=text,
            **kwargs,
        )

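The factory constructors keep trace-building terse. A sketch (the model name, tool name, and values are invented):

from evaldeck.trace import Step, TokenUsage

lookup = Step.tool_call(
    tool_name="lookup_order",
    tool_args={"order_id": "1234"},
    tool_result={"status": "shipped"},
    duration_ms=42.0,
)
answer = Step.llm_call(
    model="gpt-4o-mini",
    input="Where is order 1234?",
    output="The order shipped yesterday.",
    tokens=TokenUsage(prompt_tokens=120, completion_tokens=15, total_tokens=135),
)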
class Trace(BaseModel):
    """Complete execution trace of an agent.

    A trace captures everything that happened during an agent's execution,
    from the initial input to the final output, including all intermediate
    steps (LLM calls, tool calls, reasoning).
    """

    id: str = Field(default_factory=lambda: "")
    input: str
    output: str | None = None
    status: TraceStatus = TraceStatus.SUCCESS
    steps: list[Step] = Field(default_factory=list)

    # Timing
    started_at: datetime = Field(default_factory=datetime.now)
    completed_at: datetime | None = None
    duration_ms: float | None = None

    # Metadata
    metadata: dict[str, Any] = Field(default_factory=dict)
    framework: str | None = None
    agent_name: str | None = None

    def model_post_init(self, __context: Any) -> None:
        """Generate ID if not provided."""
        if not self.id:
            import uuid

            self.id = str(uuid.uuid4())[:8]

    @property
    def tool_calls(self) -> list[Step]:
        """Get all tool call steps."""
        return [s for s in self.steps if s.type == StepType.TOOL_CALL]

    @property
    def llm_calls(self) -> list[Step]:
        """Get all LLM call steps."""
        return [s for s in self.steps if s.type == StepType.LLM_CALL]

    @property
    def tools_called(self) -> list[str]:
        """Get list of tool names that were called."""
        return [s.tool_name for s in self.tool_calls if s.tool_name]

    @property
    def total_tokens(self) -> int:
        """Get total tokens used across all LLM calls."""
        total = 0
        for step in self.llm_calls:
            if step.tokens:
                total += step.tokens.total_tokens
        return total

    @property
    def step_count(self) -> int:
        """Get total number of steps."""
        return len(self.steps)

    def add_step(self, step: Step) -> None:
        """Add a step to the trace."""
        self.steps.append(step)

    def complete(self, output: str, status: TraceStatus = TraceStatus.SUCCESS) -> None:
        """Mark the trace as complete."""
        self.output = output
        self.status = status
        self.completed_at = datetime.now()
        if self.started_at:
            delta = self.completed_at - self.started_at
            self.duration_ms = delta.total_seconds() * 1000

    def to_dict(self) -> dict[str, Any]:
        """Convert trace to dictionary."""
        return self.model_dump(mode="json")

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Trace:
        """Create trace from dictionary."""
        return cls.model_validate(data)
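End to end, a framework adapter would populate a trace roughly like this. A hedged sketch, not an adapter shipped in this file; the framework name, agent name, and values are invented:

from evaldeck.trace import Step, TokenUsage, Trace, TraceStatus

trace = Trace(input="Where is order 1234?", agent_name="support-bot", framework="langgraph")
trace.add_step(Step.llm_call(
    model="gpt-4o-mini",
    input=trace.input,
    output="Let me check that order.",
    tokens=TokenUsage(total_tokens=135),
))
trace.add_step(Step.tool_call(tool_name="lookup_order", tool_args={"order_id": "1234"}))
trace.complete("Your order shipped yesterday.")

assert trace.tools_called == ["lookup_order"]
assert trace.total_tokens == 135
assert trace.status is TraceStatus.SUCCESS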