prela-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prela/__init__.py +394 -0
- prela/_version.py +3 -0
- prela/contrib/CLI.md +431 -0
- prela/contrib/README.md +118 -0
- prela/contrib/__init__.py +5 -0
- prela/contrib/cli.py +1063 -0
- prela/contrib/explorer.py +571 -0
- prela/core/__init__.py +64 -0
- prela/core/clock.py +98 -0
- prela/core/context.py +228 -0
- prela/core/replay.py +403 -0
- prela/core/sampler.py +178 -0
- prela/core/span.py +295 -0
- prela/core/tracer.py +498 -0
- prela/evals/__init__.py +94 -0
- prela/evals/assertions/README.md +484 -0
- prela/evals/assertions/__init__.py +78 -0
- prela/evals/assertions/base.py +90 -0
- prela/evals/assertions/multi_agent.py +625 -0
- prela/evals/assertions/semantic.py +223 -0
- prela/evals/assertions/structural.py +443 -0
- prela/evals/assertions/tool.py +380 -0
- prela/evals/case.py +370 -0
- prela/evals/n8n/__init__.py +69 -0
- prela/evals/n8n/assertions.py +450 -0
- prela/evals/n8n/runner.py +497 -0
- prela/evals/reporters/README.md +184 -0
- prela/evals/reporters/__init__.py +32 -0
- prela/evals/reporters/console.py +251 -0
- prela/evals/reporters/json.py +176 -0
- prela/evals/reporters/junit.py +278 -0
- prela/evals/runner.py +525 -0
- prela/evals/suite.py +316 -0
- prela/exporters/__init__.py +27 -0
- prela/exporters/base.py +189 -0
- prela/exporters/console.py +443 -0
- prela/exporters/file.py +322 -0
- prela/exporters/http.py +394 -0
- prela/exporters/multi.py +154 -0
- prela/exporters/otlp.py +388 -0
- prela/instrumentation/ANTHROPIC.md +297 -0
- prela/instrumentation/LANGCHAIN.md +480 -0
- prela/instrumentation/OPENAI.md +59 -0
- prela/instrumentation/__init__.py +49 -0
- prela/instrumentation/anthropic.py +1436 -0
- prela/instrumentation/auto.py +129 -0
- prela/instrumentation/base.py +436 -0
- prela/instrumentation/langchain.py +959 -0
- prela/instrumentation/llamaindex.py +719 -0
- prela/instrumentation/multi_agent/__init__.py +48 -0
- prela/instrumentation/multi_agent/autogen.py +357 -0
- prela/instrumentation/multi_agent/crewai.py +404 -0
- prela/instrumentation/multi_agent/langgraph.py +299 -0
- prela/instrumentation/multi_agent/models.py +203 -0
- prela/instrumentation/multi_agent/swarm.py +231 -0
- prela/instrumentation/n8n/__init__.py +68 -0
- prela/instrumentation/n8n/code_node.py +534 -0
- prela/instrumentation/n8n/models.py +336 -0
- prela/instrumentation/n8n/webhook.py +489 -0
- prela/instrumentation/openai.py +1198 -0
- prela/license.py +245 -0
- prela/replay/__init__.py +31 -0
- prela/replay/comparison.py +390 -0
- prela/replay/engine.py +1227 -0
- prela/replay/loader.py +231 -0
- prela/replay/result.py +196 -0
- prela-0.1.0.dist-info/METADATA +399 -0
- prela-0.1.0.dist-info/RECORD +71 -0
- prela-0.1.0.dist-info/WHEEL +4 -0
- prela-0.1.0.dist-info/entry_points.txt +2 -0
- prela-0.1.0.dist-info/licenses/LICENSE +190 -0
prela/evals/assertions/README.md
@@ -0,0 +1,484 @@

# Evaluation Assertions

This module provides a comprehensive set of assertions for testing AI agent outputs and behaviors. Assertions are the building blocks of evaluation test cases, allowing you to verify that your agent produces expected results.

## Overview

Assertions evaluate agent outputs, expected values, and execution traces to determine if they meet specified criteria. Each assertion returns an `AssertionResult` with pass/fail status, score (for partial credit), and detailed information about the evaluation.
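
For orientation, a minimal end-to-end call looks like the sketch below (the exact `assertion_type` string and message wording come from the implementation):

```python
from prela.evals.assertions import ContainsAssertion

assertion = ContainsAssertion(text="success", case_sensitive=True)
result = assertion.evaluate(
    output="Operation completed successfully", expected=None, trace=None
)

print(result.passed)   # True
print(result.score)    # 0-1 for scored assertions (e.g. semantic similarity); may be None here
print(result.details)  # additional information about the evaluation
print(result)          # one-line summary such as "PASS [contains] ..."
```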

## Assertion Types

### Structural Assertions (`structural.py`)

Text and data format validation assertions:

#### 1. `ContainsAssertion`
Check if output contains specified text.

```python
from prela.evals.assertions import ContainsAssertion

# Case-sensitive search
assertion = ContainsAssertion(text="success", case_sensitive=True)
result = assertion.evaluate(output="Operation completed successfully", expected=None, trace=None)
assert result.passed  # True

# Case-insensitive search
assertion = ContainsAssertion(text="ERROR", case_sensitive=False)
result = assertion.evaluate(output="error occurred", expected=None, trace=None)
assert result.passed  # True
```

**Config format:**
```json
{
  "text": "success",
  "case_sensitive": true
}
```

#### 2. `NotContainsAssertion`
Check if output does NOT contain specified text.

```python
from prela.evals.assertions import NotContainsAssertion

assertion = NotContainsAssertion(text="error", case_sensitive=True)
result = assertion.evaluate(output="All tests passed!", expected=None, trace=None)
assert result.passed  # True
```

**Config format:**
```json
{
  "text": "error",
  "case_sensitive": true
}
```

#### 3. `RegexAssertion`
Match output against a regular expression pattern.

```python
from prela.evals.assertions import RegexAssertion
import re

# Phone number validation
assertion = RegexAssertion(pattern=r"\d{3}-\d{3}-\d{4}")
result = assertion.evaluate(output="Call me at 555-123-4567", expected=None, trace=None)
assert result.passed  # True
assert result.details["matched_text"] == "555-123-4567"

# Case-insensitive matching
assertion = RegexAssertion(pattern=r"hello", flags=re.IGNORECASE)
result = assertion.evaluate(output="HELLO WORLD", expected=None, trace=None)
assert result.passed  # True
```

**Config format:**
```json
{
  "pattern": "\\d{3}-\\d{3}-\\d{4}",
  "flags": 0
}
```

#### 4. `LengthAssertion`
Check if output length is within specified bounds.

```python
from prela.evals.assertions import LengthAssertion

# Min and max bounds
assertion = LengthAssertion(min_length=10, max_length=100)
result = assertion.evaluate(output="This is a medium length response.", expected=None, trace=None)
assert result.passed  # True
assert result.actual == 33  # Character count

# Min only
assertion = LengthAssertion(min_length=5)
result = assertion.evaluate(output="Hi", expected=None, trace=None)
assert not result.passed  # False (too short)

# Max only
assertion = LengthAssertion(max_length=50)
result = assertion.evaluate(output="Short text", expected=None, trace=None)
assert result.passed  # True
```

**Config format:**
```json
{
  "min_length": 10,
  "max_length": 100
}
```

#### 5. `JSONValidAssertion`
Validate that output is valid JSON, optionally matching a schema.

```python
from prela.evals.assertions import JSONValidAssertion

# Basic JSON validation
assertion = JSONValidAssertion()
result = assertion.evaluate(output='{"status": "success", "count": 42}', expected=None, trace=None)
assert result.passed  # True
assert result.actual == {"status": "success", "count": 42}

# JSON schema validation (requires jsonschema library)
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"}
    },
    "required": ["name"]
}
assertion = JSONValidAssertion(schema=schema)
result = assertion.evaluate(output='{"name": "Alice", "age": 30}', expected=None, trace=None)
assert result.passed  # True

result = assertion.evaluate(output='{"age": 30}', expected=None, trace=None)
assert not result.passed  # False (missing required field "name")
```

**Config format:**
```json
{
  "schema": {
    "type": "object",
    "properties": {
      "name": {"type": "string"}
    },
    "required": ["name"]
  }
}
```

### Tool Assertions (`tool.py`)

Assertions for verifying agent tool usage based on execution traces:

#### 6. `ToolCalledAssertion`
Check if a specific tool was called during execution.

```python
from prela.evals.assertions import ToolCalledAssertion

assertion = ToolCalledAssertion(tool_name="web_search")
result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
assert result.passed  # True if "web_search" tool span found in trace
assert result.details["call_count"] == 2  # Number of times called
```

**Config format:**
```json
{
  "tool_name": "web_search"
}
```

#### 7. `ToolArgsAssertion`
Check if a tool was called with expected arguments.

```python
from prela.evals.assertions import ToolArgsAssertion

# Partial match (checks that expected args are present)
assertion = ToolArgsAssertion(
    tool_name="web_search",
    expected_args={"query": "Python tutorial"},
    partial_match=True
)
result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
assert result.passed  # True even if tool has additional args

# Exact match (requires exact argument match)
assertion = ToolArgsAssertion(
    tool_name="calculator",
    expected_args={"x": 5, "y": 10},
    partial_match=False
)
result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
assert result.passed  # True only if args exactly match
```

**Config format:**
```json
{
  "tool_name": "web_search",
  "expected_args": {"query": "Python"},
  "partial_match": true
}
```

#### 8. `ToolSequenceAssertion`
Check if tools were called in a specific order.

```python
from prela.evals.assertions import ToolSequenceAssertion

# Non-strict mode (other tools can appear between expected sequence)
assertion = ToolSequenceAssertion(
    sequence=["search", "calculate", "summarize"],
    strict=False
)
result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
assert result.passed  # True if tools appear in this order

# Strict mode (no other tools allowed between expected sequence)
assertion = ToolSequenceAssertion(
    sequence=["search", "calculate"],
    strict=True
)
result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
assert result.passed  # True only if exactly these tools in order
```

**Config format:**
```json
{
  "sequence": ["search", "calculate", "summarize"],
  "strict": false
}
```

### Semantic Assertions (`semantic.py`)

Embedding-based semantic similarity comparison (requires `sentence-transformers`):

#### 9. `SemanticSimilarityAssertion`
Check if output is semantically similar to expected text using embeddings.

```python
from prela.evals.assertions import SemanticSimilarityAssertion

assertion = SemanticSimilarityAssertion(
    expected_text="The weather is nice today",
    threshold=0.8,  # Minimum similarity score (0-1)
    model_name="all-MiniLM-L6-v2"  # Sentence transformer model
)

# High similarity (different wording, same meaning)
result = assertion.evaluate(output="Today has beautiful weather", expected=None, trace=None)
assert result.passed  # True
assert result.score > 0.8  # Similarity score

# Low similarity (different meaning)
result = assertion.evaluate(output="I like pizza", expected=None, trace=None)
assert not result.passed  # False
assert result.score < 0.8
```

**Installation:**
```bash
pip install sentence-transformers
```

**Config format:**
```json
{
  "expected_text": "The weather is nice today",
  "threshold": 0.8,
  "model_name": "all-MiniLM-L6-v2"
}
```

**Performance notes:**
- First use downloads the model (~80MB for all-MiniLM-L6-v2)
- Embeddings are cached in memory for repeated evaluations
- Model is shared across all instances (class-level cache)

## AssertionResult

All assertions return an `AssertionResult` object with the following fields:

```python
@dataclass
class AssertionResult:
    passed: bool              # Whether the assertion passed
    assertion_type: str       # Type of assertion (e.g., "contains", "regex")
    message: str              # Human-readable description
    score: float | None       # Optional score 0-1 for partial credit
    expected: Any             # Expected value (if applicable)
    actual: Any               # Actual value that was evaluated
    details: dict[str, Any]   # Additional evaluation details
```

### String representation

`AssertionResult` formats as a concise, readable line for console output:

```python
result = assertion.evaluate(...)
print(result)
# Output: ✓ PASS [contains] Output contains 'success'
# Output: ✗ FAIL [regex] Pattern not found
# Output: ✓ PASS [semantic_similarity] Semantically similar (score: 0.87)
```

## Creating Custom Assertions

To create a custom assertion, extend `BaseAssertion`:

```python
from prela.evals.assertions.base import BaseAssertion, AssertionResult

class CustomAssertion(BaseAssertion):
    def __init__(self, param1, param2):
        self.param1 = param1
        self.param2 = param2

    def evaluate(self, output, expected, trace):
        # Your evaluation logic here, e.g. check that param1 appears in the output
        passed = self.param1 in str(output)

        return AssertionResult(
            passed=passed,
            assertion_type="custom",
            message=f"Custom check: {passed}",
            expected=self.param1,
            actual=output,
            details={"param2": self.param2}
        )

    @classmethod
    def from_config(cls, config):
        return cls(
            param1=config["param1"],
            param2=config.get("param2", "default")
        )
```
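
Using the `CustomAssertion` sketch above, construction and evaluation then work the same way as for the built-in assertions (the parameter values here are purely illustrative):

```python
# Build directly or from a config dict
assertion = CustomAssertion(param1="expected-marker", param2="strict")
assertion = CustomAssertion.from_config({"param1": "expected-marker"})

result = assertion.evaluate(output="payload with expected-marker", expected=None, trace=None)
assert result.passed
print(result)  # PASS/FAIL line produced by AssertionResult.__str__
```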

## Config-Based Loading

All assertions support loading from configuration dictionaries:

```python
from prela.evals.assertions import ContainsAssertion

config = {
    "text": "success",
    "case_sensitive": False
}
assertion = ContainsAssertion.from_config(config)
```

This enables declarative test definitions in YAML/JSON files:

```yaml
# eval_suite.yaml
cases:
  - name: "Test successful response"
    input:
      query: "What is 2+2?"
    assertions:
      - type: contains
        config:
          text: "4"
      - type: length
        config:
          min_length: 1
          max_length: 100
      - type: tool_called
        config:
          tool_name: calculator
```
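
Within the package, files like this are presumably consumed by the suite and runner modules (`prela/evals/suite.py`, `prela/evals/runner.py`). Purely as an illustration, a hand-rolled loader could map each `type` value to an assertion class and call `from_config()`; the registry below is an assumption for the sketch, not the package's actual mapping:

```python
import yaml  # requires PyYAML

from prela.evals.assertions import (
    ContainsAssertion,
    LengthAssertion,
    ToolCalledAssertion,
)

# Assumed mapping from the YAML "type" field to assertion classes
ASSERTION_TYPES = {
    "contains": ContainsAssertion,
    "length": LengthAssertion,
    "tool_called": ToolCalledAssertion,
}

with open("eval_suite.yaml") as f:
    suite = yaml.safe_load(f)

for case in suite["cases"]:
    assertions = [
        ASSERTION_TYPES[a["type"]].from_config(a["config"])
        for a in case["assertions"]
    ]
    print(case["name"], [type(a).__name__ for a in assertions])
```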

## Integration with EvalCase

Assertions are used within `EvalCase` objects:

```python
from prela.evals import EvalCase
from prela.evals.assertions import ContainsAssertion, ToolCalledAssertion

case = EvalCase(
    name="Test calculator agent",
    input={"query": "What is 15 * 23?"},
    assertions=[
        ContainsAssertion(text="345"),
        ToolCalledAssertion(tool_name="calculator")
    ]
)

# Run the case
result = case.run(agent_function=my_agent)
assert result.passed
```

## Best Practices

1. **Combine Multiple Assertions**: Use multiple assertions to verify different aspects of agent behavior

   ```python
   assertions = [
       ContainsAssertion(text="success"),        # Check output content
       LengthAssertion(min_length=10),           # Check output length
       ToolCalledAssertion(tool_name="search"),  # Check tool usage
   ]
   ```

2. **Use Appropriate Assertion Types**:
   - Structural assertions for format validation
   - Tool assertions for agent behavior verification
   - Semantic assertions for meaning-based comparison

3. **Set Reasonable Thresholds**:
   - Semantic similarity: 0.7-0.8 for similar meaning, 0.9+ for near-identical
   - Length bounds: Consider typical output ranges

4. **Handle Optional Dependencies**:

   ```python
   try:
       from prela.evals.assertions import SemanticSimilarityAssertion
       use_semantic = True
   except ImportError:
       use_semantic = False
   ```

5. **Cache Semantic Embeddings**: The semantic assertion automatically caches embeddings. For long-running tests, clear the cache periodically:

   ```python
   from prela.evals.assertions import SemanticSimilarityAssertion

   # After processing many cases
   SemanticSimilarityAssertion.clear_cache()
   ```

## Performance Considerations

- **Structural assertions**: Microsecond-level performance, negligible overhead
- **Tool assertions**: Fast trace scanning, O(n) where n = number of spans
- **Semantic assertions**: First use downloads the model, subsequent calls are cached
  - Model loading: ~1-2 seconds
  - Embedding computation: ~10-50ms per text
  - Cached embeddings: ~1µs lookup

## Testing

Comprehensive tests are available in `tests/test_evals/test_assertions.py`:

```bash
# Run all assertion tests
pytest tests/test_evals/test_assertions.py -v

# Run a specific assertion type
pytest tests/test_evals/test_assertions.py::TestContainsAssertion -v

# Skip semantic tests (if sentence-transformers is not installed)
pytest tests/test_evals/test_assertions.py -v -k "not Semantic"
```

## Examples

See `examples/assertions_demo.py` for a comprehensive demonstration of all assertion types.

## References

- Base classes: `prela.evals.assertions.base`
- Structural: `prela.evals.assertions.structural`
- Tool: `prela.evals.assertions.tool`
- Semantic: `prela.evals.assertions.semantic`
- Tests: `tests/test_evals/test_assertions.py`

prela/evals/assertions/__init__.py
@@ -0,0 +1,78 @@

```python
"""
Assertions for evaluating AI agent outputs.

This module provides various assertion types for testing AI agent behavior:
- Structural: Text matching, regex, length, JSON validation
- Tool: Tool call verification and sequence checking
- Semantic: Embedding-based similarity comparison
"""

from __future__ import annotations

from prela.evals.assertions.base import AssertionResult, BaseAssertion
from prela.evals.assertions.structural import (
    ContainsAssertion,
    JSONValidAssertion,
    LengthAssertion,
    NotContainsAssertion,
    RegexAssertion,
)
from prela.evals.assertions.tool import (
    ToolArgsAssertion,
    ToolCalledAssertion,
    ToolSequenceAssertion,
)
from prela.evals.assertions.multi_agent import (
    AgentCollaborationAssertion,
    AgentUsedAssertion,
    ConversationTurnsAssertion,
    DelegationOccurredAssertion,
    HandoffOccurredAssertion,
    NoCircularDelegationAssertion,
    TaskCompletedAssertion,
)

# Semantic assertions are optional (require sentence-transformers)
try:
    from prela.evals.assertions.semantic import SemanticSimilarityAssertion

    __all__ = [
        "AssertionResult",
        "BaseAssertion",
        "ContainsAssertion",
        "NotContainsAssertion",
        "RegexAssertion",
        "LengthAssertion",
        "JSONValidAssertion",
        "ToolCalledAssertion",
        "ToolArgsAssertion",
        "ToolSequenceAssertion",
        "SemanticSimilarityAssertion",
        "AgentUsedAssertion",
        "TaskCompletedAssertion",
        "DelegationOccurredAssertion",
        "HandoffOccurredAssertion",
        "AgentCollaborationAssertion",
        "ConversationTurnsAssertion",
        "NoCircularDelegationAssertion",
    ]
except ImportError:
    __all__ = [
        "AssertionResult",
        "BaseAssertion",
        "ContainsAssertion",
        "NotContainsAssertion",
        "RegexAssertion",
        "LengthAssertion",
        "JSONValidAssertion",
        "ToolCalledAssertion",
        "ToolArgsAssertion",
        "ToolSequenceAssertion",
        "AgentUsedAssertion",
        "TaskCompletedAssertion",
        "DelegationOccurredAssertion",
        "HandoffOccurredAssertion",
        "AgentCollaborationAssertion",
        "ConversationTurnsAssertion",
        "NoCircularDelegationAssertion",
    ]
```

prela/evals/assertions/base.py
@@ -0,0 +1,90 @@

```python
"""
Base classes for evaluation assertions.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any

from prela.core.span import Span


@dataclass
class AssertionResult:
    """Result of an assertion evaluation.

    Attributes:
        passed: Whether the assertion passed
        assertion_type: Type of assertion (e.g., "contains", "semantic_similarity")
        message: Human-readable message describing the result
        score: Optional score between 0-1 for partial credit assertions
        expected: Expected value (if applicable)
        actual: Actual value that was evaluated
        details: Additional details about the evaluation
    """

    passed: bool
    assertion_type: str
    message: str
    score: float | None = None
    expected: Any = None
    actual: Any = None
    details: dict[str, Any] = field(default_factory=dict)

    def __str__(self) -> str:
        """Human-readable string representation."""
        status = "✓ PASS" if self.passed else "✗ FAIL"
        msg = f"{status} [{self.assertion_type}] {self.message}"
        if self.score is not None:
            msg += f" (score: {self.score:.2f})"
        return msg


class BaseAssertion(ABC):
    """Base class for all assertions.

    Assertions evaluate agent outputs and traces to determine if they meet
    expected criteria. Subclasses should implement the evaluate() method
    to perform the actual check.
    """

    @abstractmethod
    def evaluate(
        self,
        output: Any,
        expected: Any | None,
        trace: list[Span] | None,
    ) -> AssertionResult:
        """Evaluate the assertion against the output and trace.

        Args:
            output: The actual output from the agent/function under test
            expected: The expected output (format depends on assertion type)
            trace: Optional list of spans from the traced execution

        Returns:
            AssertionResult with pass/fail status and details
        """
        pass

    @classmethod
    @abstractmethod
    def from_config(cls, config: dict[str, Any]) -> BaseAssertion:
        """Create an assertion instance from configuration dict.

        Args:
            config: Configuration dictionary with assertion-specific parameters

        Returns:
            Configured assertion instance

        Raises:
            ValueError: If configuration is invalid
        """
        pass

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"{self.__class__.__name__}()"
```