prela-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. prela/__init__.py +394 -0
  2. prela/_version.py +3 -0
  3. prela/contrib/CLI.md +431 -0
  4. prela/contrib/README.md +118 -0
  5. prela/contrib/__init__.py +5 -0
  6. prela/contrib/cli.py +1063 -0
  7. prela/contrib/explorer.py +571 -0
  8. prela/core/__init__.py +64 -0
  9. prela/core/clock.py +98 -0
  10. prela/core/context.py +228 -0
  11. prela/core/replay.py +403 -0
  12. prela/core/sampler.py +178 -0
  13. prela/core/span.py +295 -0
  14. prela/core/tracer.py +498 -0
  15. prela/evals/__init__.py +94 -0
  16. prela/evals/assertions/README.md +484 -0
  17. prela/evals/assertions/__init__.py +78 -0
  18. prela/evals/assertions/base.py +90 -0
  19. prela/evals/assertions/multi_agent.py +625 -0
  20. prela/evals/assertions/semantic.py +223 -0
  21. prela/evals/assertions/structural.py +443 -0
  22. prela/evals/assertions/tool.py +380 -0
  23. prela/evals/case.py +370 -0
  24. prela/evals/n8n/__init__.py +69 -0
  25. prela/evals/n8n/assertions.py +450 -0
  26. prela/evals/n8n/runner.py +497 -0
  27. prela/evals/reporters/README.md +184 -0
  28. prela/evals/reporters/__init__.py +32 -0
  29. prela/evals/reporters/console.py +251 -0
  30. prela/evals/reporters/json.py +176 -0
  31. prela/evals/reporters/junit.py +278 -0
  32. prela/evals/runner.py +525 -0
  33. prela/evals/suite.py +316 -0
  34. prela/exporters/__init__.py +27 -0
  35. prela/exporters/base.py +189 -0
  36. prela/exporters/console.py +443 -0
  37. prela/exporters/file.py +322 -0
  38. prela/exporters/http.py +394 -0
  39. prela/exporters/multi.py +154 -0
  40. prela/exporters/otlp.py +388 -0
  41. prela/instrumentation/ANTHROPIC.md +297 -0
  42. prela/instrumentation/LANGCHAIN.md +480 -0
  43. prela/instrumentation/OPENAI.md +59 -0
  44. prela/instrumentation/__init__.py +49 -0
  45. prela/instrumentation/anthropic.py +1436 -0
  46. prela/instrumentation/auto.py +129 -0
  47. prela/instrumentation/base.py +436 -0
  48. prela/instrumentation/langchain.py +959 -0
  49. prela/instrumentation/llamaindex.py +719 -0
  50. prela/instrumentation/multi_agent/__init__.py +48 -0
  51. prela/instrumentation/multi_agent/autogen.py +357 -0
  52. prela/instrumentation/multi_agent/crewai.py +404 -0
  53. prela/instrumentation/multi_agent/langgraph.py +299 -0
  54. prela/instrumentation/multi_agent/models.py +203 -0
  55. prela/instrumentation/multi_agent/swarm.py +231 -0
  56. prela/instrumentation/n8n/__init__.py +68 -0
  57. prela/instrumentation/n8n/code_node.py +534 -0
  58. prela/instrumentation/n8n/models.py +336 -0
  59. prela/instrumentation/n8n/webhook.py +489 -0
  60. prela/instrumentation/openai.py +1198 -0
  61. prela/license.py +245 -0
  62. prela/replay/__init__.py +31 -0
  63. prela/replay/comparison.py +390 -0
  64. prela/replay/engine.py +1227 -0
  65. prela/replay/loader.py +231 -0
  66. prela/replay/result.py +196 -0
  67. prela-0.1.0.dist-info/METADATA +399 -0
  68. prela-0.1.0.dist-info/RECORD +71 -0
  69. prela-0.1.0.dist-info/WHEEL +4 -0
  70. prela-0.1.0.dist-info/entry_points.txt +2 -0
  71. prela-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,484 @@
1
+ # Evaluation Assertions
2
+
3
+ This module provides a comprehensive set of assertions for testing AI agent outputs and behaviors. Assertions are the building blocks of evaluation test cases, allowing you to verify that your agent produces expected results.
4
+
5
+ ## Overview
6
+
7
+ Assertions evaluate agent outputs, expected values, and execution traces to determine if they meet specified criteria. Each assertion returns an `AssertionResult` with pass/fail status, score (for partial credit), and detailed information about the evaluation.
8
+
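+ For example, the basic usage pattern looks like this (a minimal sketch using the `ContainsAssertion` described below):
+
+ ```python
+ from prela.evals.assertions import ContainsAssertion
+
+ # Every assertion exposes the same evaluate(output, expected, trace) interface.
+ assertion = ContainsAssertion(text="success", case_sensitive=True)
+ result = assertion.evaluate(output="Operation was a success", expected=None, trace=None)
+
+ print(result.passed)  # True
+ print(result)          # e.g. ✓ PASS [contains] ... (see AssertionResult below)
+ ```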
9
+ ## Assertion Types
10
+
11
+ ### Structural Assertions (`structural.py`)
12
+
13
+ Text and data format validation assertions:
14
+
15
+ #### 1. `ContainsAssertion`
16
+ Check if output contains specified text.
17
+
18
+ ```python
19
+ from prela.evals.assertions import ContainsAssertion
20
+
21
+ # Case-sensitive search
22
+ assertion = ContainsAssertion(text="success", case_sensitive=True)
23
+ result = assertion.evaluate(output="Operation completed successfully", expected=None, trace=None)
24
+ assert result.passed # True
25
+
26
+ # Case-insensitive search
27
+ assertion = ContainsAssertion(text="ERROR", case_sensitive=False)
28
+ result = assertion.evaluate(output="error occurred", expected=None, trace=None)
29
+ assert result.passed # True
30
+ ```
31
+
32
+ **Config format:**
33
+ ```json
34
+ {
35
+ "text": "success",
36
+ "case_sensitive": true
37
+ }
38
+ ```
39
+
40
+ #### 2. `NotContainsAssertion`
41
+ Check if output does NOT contain specified text.
42
+
43
+ ```python
44
+ from prela.evals.assertions import NotContainsAssertion
45
+
46
+ assertion = NotContainsAssertion(text="error", case_sensitive=True)
47
+ result = assertion.evaluate(output="All tests passed!", expected=None, trace=None)
48
+ assert result.passed # True
49
+ ```
50
+
51
+ **Config format:**
52
+ ```json
53
+ {
54
+ "text": "error",
55
+ "case_sensitive": true
56
+ }
57
+ ```
58
+
59
+ #### 3. `RegexAssertion`
60
+ Match output against a regular expression pattern.
61
+
62
+ ```python
63
+ from prela.evals.assertions import RegexAssertion
64
+ import re
65
+
66
+ # Phone number validation
67
+ assertion = RegexAssertion(pattern=r"\d{3}-\d{3}-\d{4}")
68
+ result = assertion.evaluate(output="Call me at 555-123-4567", expected=None, trace=None)
69
+ assert result.passed # True
70
+ assert result.details["matched_text"] == "555-123-4567"
71
+
72
+ # Case-insensitive matching
73
+ assertion = RegexAssertion(pattern=r"hello", flags=re.IGNORECASE)
74
+ result = assertion.evaluate(output="HELLO WORLD", expected=None, trace=None)
75
+ assert result.passed # True
76
+ ```
77
+
78
+ **Config format:**
79
+ ```json
80
+ {
81
+ "pattern": "\\d{3}-\\d{3}-\\d{4}",
82
+ "flags": 0
83
+ }
84
+ ```
85
+
86
+ #### 4. `LengthAssertion`
87
+ Check if output length is within specified bounds.
88
+
89
+ ```python
90
+ from prela.evals.assertions import LengthAssertion
91
+
92
+ # Min and max bounds
93
+ assertion = LengthAssertion(min_length=10, max_length=100)
94
+ result = assertion.evaluate(output="This is a medium length response.", expected=None, trace=None)
95
+ assert result.passed # True
96
+ assert result.actual == 33 # Character count
97
+
98
+ # Min only
99
+ assertion = LengthAssertion(min_length=5)
100
+ result = assertion.evaluate(output="Hi", expected=None, trace=None)
101
+ assert not result.passed # False (too short)
102
+
103
+ # Max only
104
+ assertion = LengthAssertion(max_length=50)
105
+ result = assertion.evaluate(output="Short text", expected=None, trace=None)
106
+ assert result.passed # True
107
+ ```
108
+
109
+ **Config format:**
110
+ ```json
111
+ {
112
+ "min_length": 10,
113
+ "max_length": 100
114
+ }
115
+ ```
116
+
117
+ #### 5. `JSONValidAssertion`
118
+ Validate that output is valid JSON, optionally matching a schema.
119
+
120
+ ```python
121
+ from prela.evals.assertions import JSONValidAssertion
122
+
123
+ # Basic JSON validation
124
+ assertion = JSONValidAssertion()
125
+ result = assertion.evaluate(output='{"status": "success", "count": 42}', expected=None, trace=None)
126
+ assert result.passed # True
127
+ assert result.actual == {"status": "success", "count": 42}
128
+
129
+ # JSON schema validation (requires jsonschema library)
130
+ schema = {
131
+ "type": "object",
132
+ "properties": {
133
+ "name": {"type": "string"},
134
+ "age": {"type": "number"}
135
+ },
136
+ "required": ["name"]
137
+ }
138
+ assertion = JSONValidAssertion(schema=schema)
139
+ result = assertion.evaluate(output='{"name": "Alice", "age": 30}', expected=None, trace=None)
140
+ assert result.passed # True
141
+
142
+ result = assertion.evaluate(output='{"age": 30}', expected=None, trace=None)
143
+ assert not result.passed # False (missing required field "name")
144
+ ```
145
+
146
+ **Config format:**
147
+ ```json
148
+ {
149
+ "schema": {
150
+ "type": "object",
151
+ "properties": {
152
+ "name": {"type": "string"}
153
+ },
154
+ "required": ["name"]
155
+ }
156
+ }
157
+ ```
158
+
159
+ ### Tool Assertions (`tool.py`)
160
+
161
+ Assertions for verifying agent tool usage based on execution traces:
162
+
163
+ #### 6. `ToolCalledAssertion`
164
+ Check if a specific tool was called during execution.
165
+
166
+ ```python
167
+ from prela.evals.assertions import ToolCalledAssertion
168
+
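+ # trace_spans: assumed here to be the list of Span objects captured from a
+ # traced agent run (the `trace` argument accepted by BaseAssertion.evaluate)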
169
+ assertion = ToolCalledAssertion(tool_name="web_search")
170
+ result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
171
+ assert result.passed # True if "web_search" tool span found in trace
172
+ assert result.details["call_count"] == 2 # Number of times called
173
+ ```
174
+
175
+ **Config format:**
176
+ ```json
177
+ {
178
+ "tool_name": "web_search"
179
+ }
180
+ ```
181
+
182
+ #### 7. `ToolArgsAssertion`
183
+ Check if a tool was called with expected arguments.
184
+
185
+ ```python
186
+ from prela.evals.assertions import ToolArgsAssertion
187
+
188
+ # Partial match (checks that expected args are present)
189
+ assertion = ToolArgsAssertion(
190
+ tool_name="web_search",
191
+ expected_args={"query": "Python tutorial"},
192
+ partial_match=True
193
+ )
194
+ result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
195
+ assert result.passed # True even if tool has additional args
196
+
197
+ # Exact match (requires exact argument match)
198
+ assertion = ToolArgsAssertion(
199
+ tool_name="calculator",
200
+ expected_args={"x": 5, "y": 10},
201
+ partial_match=False
202
+ )
203
+ result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
204
+ assert result.passed # True only if args exactly match
205
+ ```
206
+
207
+ **Config format:**
208
+ ```json
209
+ {
210
+ "tool_name": "web_search",
211
+ "expected_args": {"query": "Python"},
212
+ "partial_match": true
213
+ }
214
+ ```
215
+
216
+ #### 8. `ToolSequenceAssertion`
217
+ Check if tools were called in a specific order.
218
+
219
+ ```python
220
+ from prela.evals.assertions import ToolSequenceAssertion
221
+
222
+ # Non-strict mode (other tools can appear between expected sequence)
223
+ assertion = ToolSequenceAssertion(
224
+ sequence=["search", "calculate", "summarize"],
225
+ strict=False
226
+ )
227
+ result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
228
+ assert result.passed # True if tools appear in this order
229
+
230
+ # Strict mode (no other tools allowed between expected sequence)
231
+ assertion = ToolSequenceAssertion(
232
+ sequence=["search", "calculate"],
233
+ strict=True
234
+ )
235
+ result = assertion.evaluate(output=None, expected=None, trace=trace_spans)
236
+ assert result.passed # True only if exactly these tools were called, in this order
237
+ ```
238
+
239
+ **Config format:**
240
+ ```json
241
+ {
242
+ "sequence": ["search", "calculate", "summarize"],
243
+ "strict": false
244
+ }
245
+ ```
246
+
247
+ ### Semantic Assertions (`semantic.py`)
248
+
249
+ Embedding-based semantic similarity comparison (requires `sentence-transformers`):
250
+
251
+ #### 9. `SemanticSimilarityAssertion`
252
+ Check if output is semantically similar to expected text using embeddings.
253
+
254
+ ```python
255
+ from prela.evals.assertions import SemanticSimilarityAssertion
256
+
257
+ assertion = SemanticSimilarityAssertion(
258
+ expected_text="The weather is nice today",
259
+ threshold=0.8, # Minimum similarity score (0-1)
260
+ model_name="all-MiniLM-L6-v2" # Sentence transformer model
261
+ )
262
+
263
+ # High similarity (different wording, same meaning)
264
+ result = assertion.evaluate(output="Today has beautiful weather", expected=None, trace=None)
265
+ assert result.passed # True
266
+ assert result.score > 0.8 # Similarity score
267
+
268
+ # Low similarity (different meaning)
269
+ result = assertion.evaluate(output="I like pizza", expected=None, trace=None)
270
+ assert not result.passed # False
271
+ assert result.score < 0.8
272
+ ```
273
+
274
+ **Installation:**
275
+ ```bash
276
+ pip install sentence-transformers
277
+ ```
278
+
279
+ **Config format:**
280
+ ```json
281
+ {
282
+ "expected_text": "The weather is nice today",
283
+ "threshold": 0.8,
284
+ "model_name": "all-MiniLM-L6-v2"
285
+ }
286
+ ```
287
+
288
+ **Performance notes:**
289
+ - First use downloads the model (~80MB for all-MiniLM-L6-v2)
290
+ - Embeddings are cached in memory for repeated evaluations
291
+ - Model is shared across all instances (class-level cache)
292
+
293
+ ## AssertionResult
294
+
295
+ All assertions return an `AssertionResult` object with the following fields:
296
+
297
+ ```python
298
+ @dataclass
299
+ class AssertionResult:
300
+ passed: bool # Whether the assertion passed
301
+ assertion_type: str # Type of assertion (e.g., "contains", "regex")
302
+ message: str # Human-readable description
303
+ score: float | None # Optional score 0-1 for partial credit
304
+ expected: Any # Expected value (if applicable)
305
+ actual: Any # Actual value that was evaluated
306
+ details: dict[str, Any] # Additional evaluation details
307
+ ```
308
+
309
+ ### String representation
310
+
311
+ `AssertionResult` has a readable string representation for console output:
312
+
313
+ ```python
314
+ result = assertion.evaluate(...)
315
+ print(result)
316
+ # Output: ✓ PASS [contains] Output contains 'success'
317
+ # Output: ✗ FAIL [regex] Pattern not found
318
+ # Output: ✓ PASS [semantic_similarity] Semantically similar (score: 0.87)
319
+ ```
320
+
321
+ ## Creating Custom Assertions
322
+
323
+ To create a custom assertion, extend `BaseAssertion`:
324
+
325
+ ```python
326
+ from prela.evals.assertions.base import BaseAssertion, AssertionResult
327
+
328
+ class CustomAssertion(BaseAssertion):
329
+ def __init__(self, param1, param2):
330
+ self.param1 = param1
331
+ self.param2 = param2
332
+
333
+ def evaluate(self, output, expected, trace):
334
+ # Your evaluation logic here
335
+ passed = self.param1 in str(output)  # example: check that param1 appears in the output
336
+
337
+ return AssertionResult(
338
+ passed=passed,
339
+ assertion_type="custom",
340
+ message=f"Custom check: {passed}",
341
+ expected=self.param1,
342
+ actual=output,
343
+ details={"param2": self.param2}
344
+ )
345
+
346
+ @classmethod
347
+ def from_config(cls, config):
348
+ return cls(
349
+ param1=config["param1"],
350
+ param2=config.get("param2", "default")
351
+ )
352
+ ```
353
+
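+ Once defined, the custom assertion plugs into the same flow as the built-ins. For illustration, using the example condition above:
+
+ ```python
+ assertion = CustomAssertion(param1="expected text", param2="extra")
+ result = assertion.evaluate(output="agent output containing expected text", expected=None, trace=None)
+ print(result)  # ✓ PASS [custom] Custom check: True
+ ```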
354
+ ## Config-Based Loading
355
+
356
+ All assertions support loading from configuration dictionaries:
357
+
358
+ ```python
359
+ from prela.evals.assertions import ContainsAssertion
360
+
361
+ config = {
362
+ "text": "success",
363
+ "case_sensitive": False
364
+ }
365
+ assertion = ContainsAssertion.from_config(config)
366
+ ```
367
+
368
+ This enables declarative test definitions in YAML/JSON files:
369
+
370
+ ```yaml
371
+ # eval_suite.yaml
372
+ cases:
373
+ - name: "Test successful response"
374
+ input:
375
+ query: "What is 2+2?"
376
+ assertions:
377
+ - type: contains
378
+ config:
379
+ text: "4"
380
+ - type: length
381
+ config:
382
+ min_length: 1
383
+ max_length: 100
384
+ - type: tool_called
385
+ config:
386
+ tool_name: calculator
387
+ ```
388
+
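+ The sketch below shows one way such a file could be turned into assertion objects using the documented `from_config()` constructors; it is illustrative only, and the packaged suite loader in `prela.evals.suite` may work differently:
+
+ ```python
+ import yaml  # assumes PyYAML is installed
+
+ from prela.evals.assertions import (
+     ContainsAssertion,
+     LengthAssertion,
+     ToolCalledAssertion,
+ )
+
+ # Map the YAML "type" field to assertion classes.
+ ASSERTION_TYPES = {
+     "contains": ContainsAssertion,
+     "length": LengthAssertion,
+     "tool_called": ToolCalledAssertion,
+ }
+
+ with open("eval_suite.yaml") as f:
+     suite = yaml.safe_load(f)
+
+ for case in suite["cases"]:
+     assertions = [
+         ASSERTION_TYPES[entry["type"]].from_config(entry["config"])
+         for entry in case["assertions"]
+     ]
+     print(case["name"], assertions)
+ ```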
389
+ ## Integration with EvalCase
390
+
391
+ Assertions are used within `EvalCase` objects:
392
+
393
+ ```python
394
+ from prela.evals import EvalCase
395
+ from prela.evals.assertions import ContainsAssertion, ToolCalledAssertion
396
+
397
+ case = EvalCase(
398
+ name="Test calculator agent",
399
+ input={"query": "What is 15 * 23?"},
400
+ assertions=[
401
+ ContainsAssertion(text="345"),
402
+ ToolCalledAssertion(tool_name="calculator")
403
+ ]
404
+ )
405
+
406
+ # Run the case
407
+ result = case.run(agent_function=my_agent)
408
+ assert result.passed
409
+ ```
410
+
411
+ ## Best Practices
412
+
413
+ 1. **Combine Multiple Assertions**: Use multiple assertions to verify different aspects of agent behavior
414
+
415
+ ```python
416
+ assertions = [
417
+ ContainsAssertion(text="success"), # Check output content
418
+ LengthAssertion(min_length=10), # Check output length
419
+ ToolCalledAssertion(tool_name="search"), # Check tool usage
420
+ ]
421
+ ```
422
+
423
+ 2. **Use Appropriate Assertion Types**:
424
+ - Structural assertions for format validation
425
+ - Tool assertions for agent behavior verification
426
+ - Semantic assertions for meaning-based comparison
427
+
428
+ 3. **Set Reasonable Thresholds**:
429
+ - Semantic similarity: 0.7-0.8 for similar meaning, 0.9+ for near-identical
430
+ - Length bounds: Consider typical output ranges
431
+
432
+ 4. **Handle Optional Dependencies**:
433
+ ```python
434
+ try:
435
+ from prela.evals.assertions import SemanticSimilarityAssertion
436
+ use_semantic = True
437
+ except ImportError:
438
+ use_semantic = False
439
+ ```
440
+
441
+ 5. **Cache Semantic Embeddings**: The semantic assertion automatically caches embeddings. For long-running test suites, clear the cache periodically:
442
+
443
+ ```python
444
+ from prela.evals.assertions import SemanticSimilarityAssertion
445
+
446
+ # After processing many cases
447
+ SemanticSimilarityAssertion.clear_cache()
448
+ ```
449
+
450
+ ## Performance Considerations
451
+
452
+ - **Structural assertions**: Microsecond-level performance, negligible overhead
453
+ - **Tool assertions**: Fast trace scanning, O(n) where n = number of spans
454
+ - **Semantic assertions**: First use downloads model, subsequent calls cached
455
+ - Model loading: ~1-2 seconds
456
+ - Embedding computation: ~10-50ms per text
457
+ - Cached embeddings: ~1µs lookup
458
+
459
+ ## Testing
460
+
461
+ Comprehensive tests are available in `tests/test_evals/test_assertions.py`:
462
+
463
+ ```bash
464
+ # Run all assertion tests
465
+ pytest tests/test_evals/test_assertions.py -v
466
+
467
+ # Run specific assertion type
468
+ pytest tests/test_evals/test_assertions.py::TestContainsAssertion -v
469
+
470
+ # Skip semantic tests (if sentence-transformers not installed)
471
+ pytest tests/test_evals/test_assertions.py -v -k "not semantic"
472
+ ```
473
+
474
+ ## Examples
475
+
476
+ See `examples/assertions_demo.py` for a comprehensive demonstration of all assertion types.
477
+
478
+ ## References
479
+
480
+ - Base classes: `prela.evals.assertions.base`
481
+ - Structural: `prela.evals.assertions.structural`
482
+ - Tool: `prela.evals.assertions.tool`
483
+ - Semantic: `prela.evals.assertions.semantic`
484
+ - Tests: `tests/test_evals/test_assertions.py`
@@ -0,0 +1,78 @@
1
+ """
2
+ Assertions for evaluating AI agent outputs.
3
+
4
+ This module provides various assertion types for testing AI agent behavior:
5
+ - Structural: Text matching, regex, length, JSON validation
6
+ - Tool: Tool call verification and sequence checking
7
+ - Semantic: Embedding-based similarity comparison
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from prela.evals.assertions.base import AssertionResult, BaseAssertion
13
+ from prela.evals.assertions.structural import (
14
+ ContainsAssertion,
15
+ JSONValidAssertion,
16
+ LengthAssertion,
17
+ NotContainsAssertion,
18
+ RegexAssertion,
19
+ )
20
+ from prela.evals.assertions.tool import (
21
+ ToolArgsAssertion,
22
+ ToolCalledAssertion,
23
+ ToolSequenceAssertion,
24
+ )
25
+ from prela.evals.assertions.multi_agent import (
26
+ AgentCollaborationAssertion,
27
+ AgentUsedAssertion,
28
+ ConversationTurnsAssertion,
29
+ DelegationOccurredAssertion,
30
+ HandoffOccurredAssertion,
31
+ NoCircularDelegationAssertion,
32
+ TaskCompletedAssertion,
33
+ )
34
+
35
+ # Semantic assertions are optional (require sentence-transformers)
36
+ try:
37
+ from prela.evals.assertions.semantic import SemanticSimilarityAssertion
38
+
39
+ __all__ = [
40
+ "AssertionResult",
41
+ "BaseAssertion",
42
+ "ContainsAssertion",
43
+ "NotContainsAssertion",
44
+ "RegexAssertion",
45
+ "LengthAssertion",
46
+ "JSONValidAssertion",
47
+ "ToolCalledAssertion",
48
+ "ToolArgsAssertion",
49
+ "ToolSequenceAssertion",
50
+ "SemanticSimilarityAssertion",
51
+ "AgentUsedAssertion",
52
+ "TaskCompletedAssertion",
53
+ "DelegationOccurredAssertion",
54
+ "HandoffOccurredAssertion",
55
+ "AgentCollaborationAssertion",
56
+ "ConversationTurnsAssertion",
57
+ "NoCircularDelegationAssertion",
58
+ ]
59
+ except ImportError:
60
+ __all__ = [
61
+ "AssertionResult",
62
+ "BaseAssertion",
63
+ "ContainsAssertion",
64
+ "NotContainsAssertion",
65
+ "RegexAssertion",
66
+ "LengthAssertion",
67
+ "JSONValidAssertion",
68
+ "ToolCalledAssertion",
69
+ "ToolArgsAssertion",
70
+ "ToolSequenceAssertion",
71
+ "AgentUsedAssertion",
72
+ "TaskCompletedAssertion",
73
+ "DelegationOccurredAssertion",
74
+ "HandoffOccurredAssertion",
75
+ "AgentCollaborationAssertion",
76
+ "ConversationTurnsAssertion",
77
+ "NoCircularDelegationAssertion",
78
+ ]
@@ -0,0 +1,90 @@
1
+ """
2
+ Base classes for evaluation assertions.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass, field
9
+ from typing import Any
10
+
11
+ from prela.core.span import Span
12
+
13
+
14
+ @dataclass
15
+ class AssertionResult:
16
+ """Result of an assertion evaluation.
17
+
18
+ Attributes:
19
+ passed: Whether the assertion passed
20
+ assertion_type: Type of assertion (e.g., "contains", "semantic_similarity")
21
+ message: Human-readable message describing the result
22
+ score: Optional score between 0-1 for partial credit assertions
23
+ expected: Expected value (if applicable)
24
+ actual: Actual value that was evaluated
25
+ details: Additional details about the evaluation
26
+ """
27
+
28
+ passed: bool
29
+ assertion_type: str
30
+ message: str
31
+ score: float | None = None
32
+ expected: Any = None
33
+ actual: Any = None
34
+ details: dict[str, Any] = field(default_factory=dict)
35
+
36
+ def __str__(self) -> str:
37
+ """Human-readable string representation."""
38
+ status = "✓ PASS" if self.passed else "✗ FAIL"
39
+ msg = f"{status} [{self.assertion_type}] {self.message}"
40
+ if self.score is not None:
41
+ msg += f" (score: {self.score:.2f})"
42
+ return msg
43
+
44
+
45
+ class BaseAssertion(ABC):
46
+ """Base class for all assertions.
47
+
48
+ Assertions evaluate agent outputs and traces to determine if they meet
49
+ expected criteria. Subclasses should implement the evaluate() method
50
+ to perform the actual check.
51
+ """
52
+
53
+ @abstractmethod
54
+ def evaluate(
55
+ self,
56
+ output: Any,
57
+ expected: Any | None,
58
+ trace: list[Span] | None,
59
+ ) -> AssertionResult:
60
+ """Evaluate the assertion against the output and trace.
61
+
62
+ Args:
63
+ output: The actual output from the agent/function under test
64
+ expected: The expected output (format depends on assertion type)
65
+ trace: Optional list of spans from the traced execution
66
+
67
+ Returns:
68
+ AssertionResult with pass/fail status and details
69
+ """
70
+ pass
71
+
72
+ @classmethod
73
+ @abstractmethod
74
+ def from_config(cls, config: dict[str, Any]) -> BaseAssertion:
75
+ """Create an assertion instance from configuration dict.
76
+
77
+ Args:
78
+ config: Configuration dictionary with assertion-specific parameters
79
+
80
+ Returns:
81
+ Configured assertion instance
82
+
83
+ Raises:
84
+ ValueError: If configuration is invalid
85
+ """
86
+ pass
87
+
88
+ def __repr__(self) -> str:
89
+ """Developer-friendly representation."""
90
+ return f"{self.__class__.__name__}()"