prela 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prela/__init__.py +394 -0
- prela/_version.py +3 -0
- prela/contrib/CLI.md +431 -0
- prela/contrib/README.md +118 -0
- prela/contrib/__init__.py +5 -0
- prela/contrib/cli.py +1063 -0
- prela/contrib/explorer.py +571 -0
- prela/core/__init__.py +64 -0
- prela/core/clock.py +98 -0
- prela/core/context.py +228 -0
- prela/core/replay.py +403 -0
- prela/core/sampler.py +178 -0
- prela/core/span.py +295 -0
- prela/core/tracer.py +498 -0
- prela/evals/__init__.py +94 -0
- prela/evals/assertions/README.md +484 -0
- prela/evals/assertions/__init__.py +78 -0
- prela/evals/assertions/base.py +90 -0
- prela/evals/assertions/multi_agent.py +625 -0
- prela/evals/assertions/semantic.py +223 -0
- prela/evals/assertions/structural.py +443 -0
- prela/evals/assertions/tool.py +380 -0
- prela/evals/case.py +370 -0
- prela/evals/n8n/__init__.py +69 -0
- prela/evals/n8n/assertions.py +450 -0
- prela/evals/n8n/runner.py +497 -0
- prela/evals/reporters/README.md +184 -0
- prela/evals/reporters/__init__.py +32 -0
- prela/evals/reporters/console.py +251 -0
- prela/evals/reporters/json.py +176 -0
- prela/evals/reporters/junit.py +278 -0
- prela/evals/runner.py +525 -0
- prela/evals/suite.py +316 -0
- prela/exporters/__init__.py +27 -0
- prela/exporters/base.py +189 -0
- prela/exporters/console.py +443 -0
- prela/exporters/file.py +322 -0
- prela/exporters/http.py +394 -0
- prela/exporters/multi.py +154 -0
- prela/exporters/otlp.py +388 -0
- prela/instrumentation/ANTHROPIC.md +297 -0
- prela/instrumentation/LANGCHAIN.md +480 -0
- prela/instrumentation/OPENAI.md +59 -0
- prela/instrumentation/__init__.py +49 -0
- prela/instrumentation/anthropic.py +1436 -0
- prela/instrumentation/auto.py +129 -0
- prela/instrumentation/base.py +436 -0
- prela/instrumentation/langchain.py +959 -0
- prela/instrumentation/llamaindex.py +719 -0
- prela/instrumentation/multi_agent/__init__.py +48 -0
- prela/instrumentation/multi_agent/autogen.py +357 -0
- prela/instrumentation/multi_agent/crewai.py +404 -0
- prela/instrumentation/multi_agent/langgraph.py +299 -0
- prela/instrumentation/multi_agent/models.py +203 -0
- prela/instrumentation/multi_agent/swarm.py +231 -0
- prela/instrumentation/n8n/__init__.py +68 -0
- prela/instrumentation/n8n/code_node.py +534 -0
- prela/instrumentation/n8n/models.py +336 -0
- prela/instrumentation/n8n/webhook.py +489 -0
- prela/instrumentation/openai.py +1198 -0
- prela/license.py +245 -0
- prela/replay/__init__.py +31 -0
- prela/replay/comparison.py +390 -0
- prela/replay/engine.py +1227 -0
- prela/replay/loader.py +231 -0
- prela/replay/result.py +196 -0
- prela-0.1.0.dist-info/METADATA +399 -0
- prela-0.1.0.dist-info/RECORD +71 -0
- prela-0.1.0.dist-info/WHEEL +4 -0
- prela-0.1.0.dist-info/entry_points.txt +2 -0
- prela-0.1.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tool-related assertions for verifying agent tool usage.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from prela.core.span import Span, SpanType
|
|
10
|
+
from prela.evals.assertions.base import AssertionResult, BaseAssertion
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ToolCalledAssertion(BaseAssertion):
    """Assert that a specific tool was called during execution.

    The trace is scanned for spans whose ``span_type`` is ``SpanType.TOOL``
    and whose ``name`` equals ``tool_name``. The assertion passes when at
    least one such span exists.

    Example:
        >>> assertion = ToolCalledAssertion(tool_name="web_search")
        >>> result = assertion.evaluate(output=None, expected=None, trace=spans)
        >>> assert result.passed
    """

    def __init__(self, tool_name: str):
        """Initialize tool called assertion.

        Args:
            tool_name: Name of the tool that should have been called
        """
        self.tool_name = tool_name

    def evaluate(
        self,
        output: Any,
        expected: Any | None,
        trace: list[Span] | None,
    ) -> AssertionResult:
        """Check if the specified tool was called in the trace.

        Args:
            output: Not inspected by this assertion.
            expected: Not inspected by this assertion.
            trace: Spans to search; ``None`` or an empty list fails the
                assertion with a "no trace" result.

        Returns:
            An ``AssertionResult`` with ``assertion_type="tool_called"``.
            On success ``details`` holds ``call_count`` and ``span_ids``;
            on failure it holds ``call_count`` (0) and ``available_tools``,
            the names of the tool spans that were found.
        """
        if not trace:
            return AssertionResult(
                passed=False,
                assertion_type="tool_called",
                message=f"No trace available to check for tool '{self.tool_name}'",
                expected=f"tool '{self.tool_name}' called",
                actual="no trace",
                details={},
            )

        # Look for tool spans with matching name
        tool_spans = [
            span
            for span in trace
            if span.span_type == SpanType.TOOL and span.name == self.tool_name
        ]
        passed = bool(tool_spans)

        if passed:
            message = f"Tool '{self.tool_name}' was called {len(tool_spans)} time(s)"
            details = {
                "call_count": len(tool_spans),
                "span_ids": [span.span_id for span in tool_spans],
            }
        else:
            # List available tools to help debugging. Sorting keeps the
            # report stable across runs (set iteration order is not);
            # key=str avoids a TypeError should a span name not be a string.
            available_tools = {
                span.name for span in trace if span.span_type == SpanType.TOOL
            }
            message = f"Tool '{self.tool_name}' was not called"
            details = {
                "call_count": 0,
                "available_tools": sorted(available_tools, key=str),
            }

        return AssertionResult(
            passed=passed,
            assertion_type="tool_called",
            message=message,
            expected=f"tool '{self.tool_name}' called",
            actual=f"{len(tool_spans)} calls" if passed else "not called",
            details=details,
        )

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> ToolCalledAssertion:
        """Create from configuration.

        Config format:
            {
                "tool_name": "web_search"
            }

        Raises:
            ValueError: If ``"tool_name"`` is missing from ``config``.
        """
        if "tool_name" not in config:
            raise ValueError("ToolCalledAssertion requires 'tool_name' in config")

        return cls(tool_name=config["tool_name"])

    def __repr__(self) -> str:
        return f"ToolCalledAssertion(tool_name={self.tool_name!r})"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class ToolArgsAssertion(BaseAssertion):
    """Assert that a tool was called with expected arguments.

    This assertion verifies both that the tool was called and that at least
    one of its calls carried the expected argument values.

    Arguments are read from each tool span's ``attributes``: keys starting
    with ``"tool.input."`` or ``"input."`` contribute one argument each
    (named by the last dot-separated component of the key), and a dict
    stored under the plain ``"input"`` key is merged on top.

    Example:
        >>> assertion = ToolArgsAssertion(
        ...     tool_name="web_search",
        ...     expected_args={"query": "Python tutorial"}
        ... )
        >>> result = assertion.evaluate(output=None, expected=None, trace=spans)
        >>> assert result.passed
    """

    # Attribute-key prefixes that mark a single tool argument.
    _ARG_PREFIXES = ("tool.input.", "input.")

    def __init__(
        self,
        tool_name: str,
        expected_args: dict[str, Any],
        partial_match: bool = True,
    ):
        """Initialize tool args assertion.

        Args:
            tool_name: Name of the tool to check
            expected_args: Expected argument key-value pairs
            partial_match: If True, only check that expected_args are present
                (allow additional args). If False, require exact match.
        """
        self.tool_name = tool_name
        self.expected_args = expected_args
        self.partial_match = partial_match

    @classmethod
    def _extract_args(cls, span: Span) -> dict[str, Any]:
        """Collect the tool-call arguments recorded on ``span.attributes``."""
        attributes = span.attributes
        args = {
            key.split(".")[-1]: value
            for key, value in attributes.items()
            if key.startswith(cls._ARG_PREFIXES)
        }

        # Also honour a generic "input" attribute holding the whole argument
        # dict; its entries override same-named prefixed ones.
        generic_input = attributes.get("input")
        if isinstance(generic_input, dict):
            args.update(generic_input)
        return args

    def _args_match(self, actual_args: dict[str, Any]) -> bool:
        """Return True when ``actual_args`` satisfies ``expected_args``."""
        if self.partial_match:
            # Every expected pair must be present; extra args are allowed.
            # Because of .get(), an expected value of None also matches an
            # argument that is absent.
            return all(
                actual_args.get(key) == value
                for key, value in self.expected_args.items()
            )
        # Require exact match
        return actual_args == self.expected_args

    def evaluate(
        self,
        output: Any,
        expected: Any | None,
        trace: list[Span] | None,
    ) -> AssertionResult:
        """Check if tool was called with expected arguments.

        Args:
            output: Not inspected by this assertion.
            expected: Not inspected by this assertion.
            trace: Spans to search; ``None`` or an empty list fails the
                assertion with a "no trace" result.

        Returns:
            An ``AssertionResult`` with ``assertion_type="tool_args"``. It
            passes when at least one call of the tool matches. On failure
            ``actual`` and ``details["first_call_args"]`` show the arguments
            of the first call of the tool, extracted the same way as the
            arguments used for matching.
        """
        if not trace:
            return AssertionResult(
                passed=False,
                assertion_type="tool_args",
                message=f"No trace available to check tool '{self.tool_name}' arguments",
                expected=f"tool '{self.tool_name}' with args {self.expected_args}",
                actual="no trace",
                details={},
            )

        # Find tool spans with matching name
        tool_spans = [
            span
            for span in trace
            if span.span_type == SpanType.TOOL and span.name == self.tool_name
        ]

        if not tool_spans:
            return AssertionResult(
                passed=False,
                assertion_type="tool_args",
                message=f"Tool '{self.tool_name}' was not called",
                expected=f"tool '{self.tool_name}' with args {self.expected_args}",
                actual="tool not called",
                details={},
            )

        # Extract the arguments of every call once, then keep the matches.
        calls = [(span, self._extract_args(span)) for span in tool_spans]
        matches = [(span, args) for span, args in calls if self._args_match(args)]
        passed = bool(matches)

        if passed:
            message = (
                f"Tool '{self.tool_name}' was called with expected arguments "
                f"({len(matches)} time(s))"
            )
            actual = matches[0][1]
            details = {
                "match_count": len(matches),
                "span_ids": [span.span_id for span, _ in matches],
                "matched_args": [args for _, args in matches],
            }
        else:
            # Show actual args from first call for debugging
            actual = calls[0][1]
            message = f"Tool '{self.tool_name}' was called but not with expected arguments"
            details = {
                "match_count": 0,
                "call_count": len(tool_spans),
                "first_call_args": actual,
            }

        return AssertionResult(
            passed=passed,
            assertion_type="tool_args",
            message=message,
            expected=self.expected_args,
            actual=actual,
            details=details,
        )

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> ToolArgsAssertion:
        """Create from configuration.

        Config format:
            {
                "tool_name": "web_search",
                "expected_args": {"query": "Python"},
                "partial_match": true  # optional, default: true
            }

        Raises:
            ValueError: If ``"tool_name"`` or ``"expected_args"`` is missing
                from ``config``.
        """
        if "tool_name" not in config:
            raise ValueError("ToolArgsAssertion requires 'tool_name' in config")
        if "expected_args" not in config:
            raise ValueError("ToolArgsAssertion requires 'expected_args' in config")

        return cls(
            tool_name=config["tool_name"],
            expected_args=config["expected_args"],
            partial_match=config.get("partial_match", True),
        )

    def __repr__(self) -> str:
        return (
            f"ToolArgsAssertion(tool_name={self.tool_name!r}, "
            f"expected_args={self.expected_args}, "
            f"partial_match={self.partial_match})"
        )
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class ToolSequenceAssertion(BaseAssertion):
    """Assert that tools were called in a specific order.

    Tool spans (``span_type == SpanType.TOOL``) are ordered by their
    ``started_at`` value and their names are compared against the expected
    sequence. By default the expected names only need to appear in order,
    and other tool calls may appear between them.

    Example:
        >>> assertion = ToolSequenceAssertion(
        ...     sequence=["web_search", "calculator", "summarize"]
        ... )
        >>> result = assertion.evaluate(output=None, expected=None, trace=spans)
        >>> assert result.passed
    """

    def __init__(self, sequence: list[str], strict: bool = False):
        """Initialize tool sequence assertion.

        Args:
            sequence: Expected sequence of tool names
            strict: If True, the ordered list of all tool calls in the trace
                must equal ``sequence`` exactly (no additional tool calls
                before, between, or after). If False, ``sequence`` only has
                to occur as an in-order subsequence of the tool calls.

        Raises:
            ValueError: If ``sequence`` is empty.
        """
        if not sequence:
            raise ValueError("sequence cannot be empty")

        # Store an independent list: strict mode compares against a list of
        # span names, and a list never equals a tuple, so a tuple argument
        # would otherwise never match. The copy also shields the assertion
        # from later mutation of the caller's list.
        self.sequence = list(sequence)
        self.strict = strict

    def evaluate(
        self,
        output: Any,
        expected: Any | None,
        trace: list[Span] | None,
    ) -> AssertionResult:
        """Check if tools were called in the expected sequence.

        Args:
            output: Not inspected by this assertion.
            expected: Not inspected by this assertion.
            trace: Spans to inspect; ``None`` or an empty list fails the
                assertion with a "no trace" result.

        Returns:
            An ``AssertionResult`` with ``assertion_type="tool_sequence"``.
            ``actual`` is the list of tool names in start order, and
            ``details`` records ``strict`` and both sequence lengths.
        """
        if not trace:
            return AssertionResult(
                passed=False,
                assertion_type="tool_sequence",
                message="No trace available to check tool sequence",
                expected=f"sequence: {self.sequence}",
                actual="no trace",
                details={},
            )

        # Extract tool call sequence from trace (ordered by started_at).
        # sorted() is stable, so spans with equal start values keep their
        # order from the trace.
        tool_spans = [
            span
            for span in sorted(trace, key=lambda s: s.started_at)
            if span.span_type == SpanType.TOOL
        ]

        if not tool_spans:
            return AssertionResult(
                passed=False,
                assertion_type="tool_sequence",
                message="No tool calls found in trace",
                expected=f"sequence: {self.sequence}",
                actual="no tools called",
                details={},
            )

        actual_sequence = [span.name for span in tool_spans]

        if self.strict:
            # Strict mode: must match exactly
            passed = actual_sequence == self.sequence
            if passed:
                message = f"Tool sequence matches exactly: {self.sequence}"
            else:
                message = (
                    f"Tool sequence does not match. Expected {self.sequence}, "
                    f"got {actual_sequence}"
                )
        else:
            # Non-strict: walk the actual calls once, advancing through the
            # expected sequence each time its next name is seen.
            seq_idx = 0
            for tool_name in actual_sequence:
                if seq_idx < len(self.sequence) and tool_name == self.sequence[seq_idx]:
                    seq_idx += 1

            passed = seq_idx == len(self.sequence)

            if passed:
                message = f"Tool sequence {self.sequence} found in correct order"
            else:
                found = self.sequence[:seq_idx]
                missing = self.sequence[seq_idx:]
                message = f"Tool sequence incomplete. Found {found}, missing {missing}"

        return AssertionResult(
            passed=passed,
            assertion_type="tool_sequence",
            message=message,
            expected=self.sequence,
            actual=actual_sequence,
            details={
                "strict": self.strict,
                "expected_length": len(self.sequence),
                "actual_length": len(actual_sequence),
            },
        )

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> ToolSequenceAssertion:
        """Create from configuration.

        Config format:
            {
                "sequence": ["tool1", "tool2", "tool3"],
                "strict": false  # optional, default: false
            }

        Raises:
            ValueError: If ``"sequence"`` is missing from ``config`` or is
                empty.
        """
        if "sequence" not in config:
            raise ValueError("ToolSequenceAssertion requires 'sequence' in config")

        return cls(
            sequence=config["sequence"],
            strict=config.get("strict", False),
        )

    def __repr__(self) -> str:
        return f"ToolSequenceAssertion(sequence={self.sequence}, strict={self.strict})"
|