prela-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. prela/__init__.py +394 -0
  2. prela/_version.py +3 -0
  3. prela/contrib/CLI.md +431 -0
  4. prela/contrib/README.md +118 -0
  5. prela/contrib/__init__.py +5 -0
  6. prela/contrib/cli.py +1063 -0
  7. prela/contrib/explorer.py +571 -0
  8. prela/core/__init__.py +64 -0
  9. prela/core/clock.py +98 -0
  10. prela/core/context.py +228 -0
  11. prela/core/replay.py +403 -0
  12. prela/core/sampler.py +178 -0
  13. prela/core/span.py +295 -0
  14. prela/core/tracer.py +498 -0
  15. prela/evals/__init__.py +94 -0
  16. prela/evals/assertions/README.md +484 -0
  17. prela/evals/assertions/__init__.py +78 -0
  18. prela/evals/assertions/base.py +90 -0
  19. prela/evals/assertions/multi_agent.py +625 -0
  20. prela/evals/assertions/semantic.py +223 -0
  21. prela/evals/assertions/structural.py +443 -0
  22. prela/evals/assertions/tool.py +380 -0
  23. prela/evals/case.py +370 -0
  24. prela/evals/n8n/__init__.py +69 -0
  25. prela/evals/n8n/assertions.py +450 -0
  26. prela/evals/n8n/runner.py +497 -0
  27. prela/evals/reporters/README.md +184 -0
  28. prela/evals/reporters/__init__.py +32 -0
  29. prela/evals/reporters/console.py +251 -0
  30. prela/evals/reporters/json.py +176 -0
  31. prela/evals/reporters/junit.py +278 -0
  32. prela/evals/runner.py +525 -0
  33. prela/evals/suite.py +316 -0
  34. prela/exporters/__init__.py +27 -0
  35. prela/exporters/base.py +189 -0
  36. prela/exporters/console.py +443 -0
  37. prela/exporters/file.py +322 -0
  38. prela/exporters/http.py +394 -0
  39. prela/exporters/multi.py +154 -0
  40. prela/exporters/otlp.py +388 -0
  41. prela/instrumentation/ANTHROPIC.md +297 -0
  42. prela/instrumentation/LANGCHAIN.md +480 -0
  43. prela/instrumentation/OPENAI.md +59 -0
  44. prela/instrumentation/__init__.py +49 -0
  45. prela/instrumentation/anthropic.py +1436 -0
  46. prela/instrumentation/auto.py +129 -0
  47. prela/instrumentation/base.py +436 -0
  48. prela/instrumentation/langchain.py +959 -0
  49. prela/instrumentation/llamaindex.py +719 -0
  50. prela/instrumentation/multi_agent/__init__.py +48 -0
  51. prela/instrumentation/multi_agent/autogen.py +357 -0
  52. prela/instrumentation/multi_agent/crewai.py +404 -0
  53. prela/instrumentation/multi_agent/langgraph.py +299 -0
  54. prela/instrumentation/multi_agent/models.py +203 -0
  55. prela/instrumentation/multi_agent/swarm.py +231 -0
  56. prela/instrumentation/n8n/__init__.py +68 -0
  57. prela/instrumentation/n8n/code_node.py +534 -0
  58. prela/instrumentation/n8n/models.py +336 -0
  59. prela/instrumentation/n8n/webhook.py +489 -0
  60. prela/instrumentation/openai.py +1198 -0
  61. prela/license.py +245 -0
  62. prela/replay/__init__.py +31 -0
  63. prela/replay/comparison.py +390 -0
  64. prela/replay/engine.py +1227 -0
  65. prela/replay/loader.py +231 -0
  66. prela/replay/result.py +196 -0
  67. prela-0.1.0.dist-info/METADATA +399 -0
  68. prela-0.1.0.dist-info/RECORD +71 -0
  69. prela-0.1.0.dist-info/WHEEL +4 -0
  70. prela-0.1.0.dist-info/entry_points.txt +2 -0
  71. prela-0.1.0.dist-info/licenses/LICENSE +190 -0
prela/evals/case.py ADDED
@@ -0,0 +1,370 @@
+ """Eval case data structures for defining test cases.
+
+ This module provides the core data structures for defining evaluation cases:
+ - EvalInput: What goes into the agent
+ - EvalExpected: What we compare against
+ - EvalCase: Complete test case with input, expected output, and assertions
+
+ Example:
+     >>> from prela.evals import EvalCase, EvalInput, EvalExpected
+     >>> case = EvalCase(
+     ...     id="test_qa",
+     ...     name="Basic QA test",
+     ...     input=EvalInput(query="What is 2+2?"),
+     ...     expected=EvalExpected(contains=["4"])
+     ... )
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class EvalInput:
+     """Input data for an eval case.
+
+     Represents what goes into the agent being tested. Can be a simple query,
+     a list of messages, or custom context data.
+
+     Attributes:
+         query: Simple string query/prompt (for basic use cases)
+         messages: List of message dicts (for chat-based agents)
+         context: Additional context data (e.g., retrieved documents, metadata)
+
+     Example:
+         >>> # Simple query
+         >>> input1 = EvalInput(query="What is the capital of France?")
+         >>>
+         >>> # Chat messages
+         >>> input2 = EvalInput(messages=[
+         ...     {"role": "system", "content": "You are a helpful assistant"},
+         ...     {"role": "user", "content": "Hello!"}
+         ... ])
+         >>>
+         >>> # Query with context
+         >>> input3 = EvalInput(
+         ...     query="Summarize the document",
+         ...     context={"document": "Long text here..."}
+         ... )
+     """
+
+     query: str | None = None
+     messages: list[dict] | None = None
+     context: dict[str, Any] | None = None
+
+     def __post_init__(self) -> None:
+         """Validate that at least one input type is provided."""
+         if self.query is None and self.messages is None:
+             raise ValueError("EvalInput must have either 'query' or 'messages'")
+
+     def to_agent_input(self) -> dict[str, Any]:
+         """Convert to the format that the agent expects.
+
+         Returns:
+             Dictionary with all non-None input fields.
+
+         Example:
+             >>> input = EvalInput(query="Hello", context={"user_id": "123"})
+             >>> input.to_agent_input()
+             {'query': 'Hello', 'context': {'user_id': '123'}}
+         """
+         result: dict[str, Any] = {}
+
+         if self.query is not None:
+             result["query"] = self.query
+
+         if self.messages is not None:
+             result["messages"] = self.messages
+
+         if self.context is not None:
+             result["context"] = self.context
+
+         return result
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> EvalInput:
+         """Create EvalInput from dictionary.
+
+         Args:
+             data: Dictionary with 'query', 'messages', and/or 'context' keys
+
+         Returns:
+             EvalInput instance
+
+         Example:
+             >>> data = {"query": "Hello", "context": {"key": "value"}}
+             >>> input = EvalInput.from_dict(data)
+         """
+         return cls(
+             query=data.get("query"),
+             messages=data.get("messages"),
+             context=data.get("context"),
+         )
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization.
+
+         Returns:
+             Dictionary representation of the input.
+         """
+         result: dict[str, Any] = {}
+
+         if self.query is not None:
+             result["query"] = self.query
+
+         if self.messages is not None:
+             result["messages"] = self.messages
+
+         if self.context is not None:
+             result["context"] = self.context
+
+         return result
+
+
+ @dataclass
+ class EvalExpected:
+     """Expected output for an eval case.
+
+     Defines what the agent's output should look like. Supports multiple
+     validation strategies:
+     - Exact output match
+     - Contains/not_contains substring checks
+     - Tool call validation
+     - Custom metadata checks
+
+     Attributes:
+         output: Exact expected output string
+         contains: List of substrings that must appear in output
+         not_contains: List of substrings that must NOT appear in output
+         tool_calls: Expected tool calls (list of dicts with 'name', 'args', etc.)
+         metadata: Expected metadata fields (e.g., final_answer, confidence)
+
+     Example:
+         >>> # Exact match
+         >>> expected1 = EvalExpected(output="The answer is 42")
+         >>>
+         >>> # Substring checks
+         >>> expected2 = EvalExpected(
+         ...     contains=["Paris", "capital"],
+         ...     not_contains=["London", "Berlin"]
+         ... )
+         >>>
+         >>> # Tool call validation
+         >>> expected3 = EvalExpected(tool_calls=[
+         ...     {"name": "search", "args": {"query": "weather"}}
+         ... ])
+     """
+
+     output: str | None = None
+     contains: list[str] | None = None
+     not_contains: list[str] | None = None
+     tool_calls: list[dict[str, Any]] | None = None
+     metadata: dict[str, Any] | None = None
+
+     def __post_init__(self) -> None:
+         """Validate that at least one expectation is provided."""
+         if (
+             self.output is None
+             and self.contains is None
+             and self.not_contains is None
+             and self.tool_calls is None
+             and self.metadata is None
+         ):
+             raise ValueError(
+                 "EvalExpected must have at least one expectation "
+                 "(output, contains, not_contains, tool_calls, or metadata)"
+             )
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> EvalExpected:
+         """Create EvalExpected from dictionary.
+
+         Args:
+             data: Dictionary with expected output specifications
+
+         Returns:
+             EvalExpected instance
+
+         Example:
+             >>> data = {"contains": ["Paris"], "not_contains": ["London"]}
+             >>> expected = EvalExpected.from_dict(data)
+         """
+         return cls(
+             output=data.get("output"),
+             contains=data.get("contains"),
+             not_contains=data.get("not_contains"),
+             tool_calls=data.get("tool_calls"),
+             metadata=data.get("metadata"),
+         )
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization.
+
+         Returns:
+             Dictionary representation of the expected output.
+         """
+         result: dict[str, Any] = {}
+
+         if self.output is not None:
+             result["output"] = self.output
+
+         if self.contains is not None:
+             result["contains"] = self.contains
+
+         if self.not_contains is not None:
+             result["not_contains"] = self.not_contains
+
+         if self.tool_calls is not None:
+             result["tool_calls"] = self.tool_calls
+
+         if self.metadata is not None:
+             result["metadata"] = self.metadata
+
+         return result
+
+
+ @dataclass
+ class EvalCase:
+     """Complete evaluation test case.
+
+     Represents a single test case with input, expected output, and assertions.
+     Eval cases are the building blocks of eval suites.
+
+     Attributes:
+         id: Unique identifier for this test case
+         name: Human-readable test case name
+         input: Input data for the agent
+         expected: Expected output (optional, can use assertions instead)
+         assertions: List of assertion configurations (dicts with 'type', 'value', etc.)
+         tags: Tags for filtering/grouping test cases
+         timeout_seconds: Maximum execution time for this test case
+         metadata: Additional metadata for this test case
+
+     Example:
+         >>> case = EvalCase(
+         ...     id="test_basic_qa",
+         ...     name="Basic factual question",
+         ...     input=EvalInput(query="What is the capital of France?"),
+         ...     expected=EvalExpected(contains=["Paris"]),
+         ...     assertions=[
+         ...         {"type": "contains", "value": "Paris"},
+         ...         {"type": "semantic_similarity", "threshold": 0.8}
+         ...     ],
+         ...     tags=["qa", "geography"],
+         ...     timeout_seconds=10.0
+         ... )
+     """
+
+     id: str
+     name: str
+     input: EvalInput
+     expected: EvalExpected | None = None
+     assertions: list[dict[str, Any]] | None = None
+     tags: list[str] = field(default_factory=list)
+     timeout_seconds: float = 30.0
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def __post_init__(self) -> None:
+         """Validate test case configuration."""
+         if not self.id:
+             raise ValueError("EvalCase must have a non-empty 'id'")
+
+         if not self.name:
+             raise ValueError("EvalCase must have a non-empty 'name'")
+
+         if self.timeout_seconds <= 0:
+             raise ValueError("timeout_seconds must be positive")
+
+         # Validate that we have at least expected or assertions
+         if self.expected is None and (self.assertions is None or len(self.assertions) == 0):
+             raise ValueError("EvalCase must have either 'expected' or 'assertions'")
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> EvalCase:
+         """Create EvalCase from dictionary.
+
+         Args:
+             data: Dictionary with test case specification
+
+         Returns:
+             EvalCase instance
+
+         Example:
+             >>> data = {
+             ...     "id": "test_1",
+             ...     "name": "Test case 1",
+             ...     "input": {"query": "Hello"},
+             ...     "expected": {"contains": ["Hi"]},
+             ...     "tags": ["greeting"]
+             ... }
+             >>> case = EvalCase.from_dict(data)
+         """
+         # Parse input
+         input_data = data.get("input")
+         if input_data is None:
+             raise ValueError("EvalCase must have 'input' field")
+
+         if isinstance(input_data, EvalInput):
+             input_obj = input_data
+         else:
+             input_obj = EvalInput.from_dict(input_data)
+
+         # Parse expected (optional)
+         expected_data = data.get("expected")
+         expected_obj: EvalExpected | None = None
+         if expected_data is not None:
+             if isinstance(expected_data, EvalExpected):
+                 expected_obj = expected_data
+             else:
+                 expected_obj = EvalExpected.from_dict(expected_data)
+
+         return cls(
+             id=data["id"],
+             name=data["name"],
+             input=input_obj,
+             expected=expected_obj,
+             assertions=data.get("assertions"),
+             tags=data.get("tags", []),
+             timeout_seconds=data.get("timeout_seconds", 30.0),
+             metadata=data.get("metadata", {}),
+         )
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization.
+
+         Returns:
+             Dictionary representation of the test case.
+
+         Example:
+             >>> case = EvalCase(
+             ...     id="test_1",
+             ...     name="Test",
+             ...     input=EvalInput(query="Hello"),
+             ...     expected=EvalExpected(contains=["Hi"])
+             ... )
+             >>> data = case.to_dict()
+             >>> data["id"]
+             'test_1'
+         """
+         result: dict[str, Any] = {
+             "id": self.id,
+             "name": self.name,
+             "input": self.input.to_dict(),
+             "timeout_seconds": self.timeout_seconds,
+         }
+
+         if self.expected is not None:
+             result["expected"] = self.expected.to_dict()
+
+         if self.assertions is not None and len(self.assertions) > 0:
+             result["assertions"] = self.assertions
+
+         if len(self.tags) > 0:
+             result["tags"] = self.tags
+
+         if len(self.metadata) > 0:
+             result["metadata"] = self.metadata
+
+         return result
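Taken together, the module above defines a small, self-contained API; a minimal usage sketch (illustrative only, reusing just the classes and dict layout shown in the docstrings above) could look like:

    from prela.evals import EvalCase

    # Build a case from a plain dict, as a YAML/JSON suite loader might.
    case = EvalCase.from_dict({
        "id": "test_capital",
        "name": "Capital of France",
        "input": {"query": "What is the capital of France?"},
        "expected": {"contains": ["Paris"], "not_contains": ["London"]},
        "tags": ["qa"],
    })

    # Input in the shape the agent under test receives.
    agent_kwargs = case.input.to_agent_input()  # {'query': 'What is the capital of France?'}

    # Serialization round-trip: to_dict() output is valid from_dict() input.
    round_tripped = EvalCase.from_dict(case.to_dict())
    assert round_tripped.expected.contains == ["Paris"]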
prela/evals/n8n/__init__.py ADDED
@@ -0,0 +1,69 @@
+ """n8n workflow evaluation framework.
+
+ This module provides specialized evaluation tools for testing n8n workflows:
+ - N8nEvalCase: Test case for n8n workflows with trigger data
+ - N8nWorkflowEvalConfig: Configuration for n8n workflow evaluation
+ - N8nWorkflowEvalRunner: Runner for executing n8n workflow tests
+ - eval_n8n_workflow: Convenience function for quick testing
+ - n8n-specific assertions: node_completed, node_output, duration_under, etc.
+
+ Example:
+     >>> from prela.evals.n8n import eval_n8n_workflow, N8nEvalCase
+     >>> from prela.evals.n8n.assertions import node_completed, duration_under
+     >>>
+     >>> results = await eval_n8n_workflow(
+     ...     workflow_id="abc123",
+     ...     test_cases=[
+     ...         N8nEvalCase(
+     ...             id="test_1",
+     ...             name="High-intent lead",
+     ...             trigger_data={"email": "I want to buy..."},
+     ...             workflow_assertions=[
+     ...                 node_completed("Classify Intent"),
+     ...                 duration_under(5.0)
+     ...             ]
+     ...         )
+     ...     ]
+     ... )
+ """
+
+ from prela.evals.n8n.assertions import (
+     N8nAINodeTokens,
+     N8nNodeCompleted,
+     N8nNodeOutput,
+     N8nWorkflowDuration,
+     N8nWorkflowStatus,
+     duration_under,
+     node_completed,
+     node_output,
+     tokens_under,
+     workflow_completed,
+     workflow_status,
+ )
+ from prela.evals.n8n.runner import (
+     N8nEvalCase,
+     N8nWorkflowEvalConfig,
+     N8nWorkflowEvalRunner,
+     eval_n8n_workflow,
+ )
+
+ __all__ = [
+     # Runner components
+     "N8nEvalCase",
+     "N8nWorkflowEvalConfig",
+     "N8nWorkflowEvalRunner",
+     "eval_n8n_workflow",
+     # Assertion classes
+     "N8nNodeCompleted",
+     "N8nNodeOutput",
+     "N8nWorkflowDuration",
+     "N8nAINodeTokens",
+     "N8nWorkflowStatus",
+     # Convenience functions
+     "node_completed",
+     "node_output",
+     "duration_under",
+     "tokens_under",
+     "workflow_completed",
+     "workflow_status",
+ ]
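The docstring above awaits eval_n8n_workflow, so it needs an event loop; a minimal sketch of running that same example from a synchronous script (assuming only the call shape shown in the docstring, no additional parameters):

    import asyncio

    from prela.evals.n8n import (
        N8nEvalCase,
        duration_under,
        eval_n8n_workflow,
        node_completed,
    )

    async def main() -> None:
        # Same arguments as the docstring example; eval_n8n_workflow is a coroutine.
        results = await eval_n8n_workflow(
            workflow_id="abc123",
            test_cases=[
                N8nEvalCase(
                    id="test_1",
                    name="High-intent lead",
                    trigger_data={"email": "I want to buy..."},
                    workflow_assertions=[
                        node_completed("Classify Intent"),
                        duration_under(5.0),
                    ],
                )
            ],
        )
        print(results)

    if __name__ == "__main__":
        asyncio.run(main())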