prela-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prela/__init__.py +394 -0
- prela/_version.py +3 -0
- prela/contrib/CLI.md +431 -0
- prela/contrib/README.md +118 -0
- prela/contrib/__init__.py +5 -0
- prela/contrib/cli.py +1063 -0
- prela/contrib/explorer.py +571 -0
- prela/core/__init__.py +64 -0
- prela/core/clock.py +98 -0
- prela/core/context.py +228 -0
- prela/core/replay.py +403 -0
- prela/core/sampler.py +178 -0
- prela/core/span.py +295 -0
- prela/core/tracer.py +498 -0
- prela/evals/__init__.py +94 -0
- prela/evals/assertions/README.md +484 -0
- prela/evals/assertions/__init__.py +78 -0
- prela/evals/assertions/base.py +90 -0
- prela/evals/assertions/multi_agent.py +625 -0
- prela/evals/assertions/semantic.py +223 -0
- prela/evals/assertions/structural.py +443 -0
- prela/evals/assertions/tool.py +380 -0
- prela/evals/case.py +370 -0
- prela/evals/n8n/__init__.py +69 -0
- prela/evals/n8n/assertions.py +450 -0
- prela/evals/n8n/runner.py +497 -0
- prela/evals/reporters/README.md +184 -0
- prela/evals/reporters/__init__.py +32 -0
- prela/evals/reporters/console.py +251 -0
- prela/evals/reporters/json.py +176 -0
- prela/evals/reporters/junit.py +278 -0
- prela/evals/runner.py +525 -0
- prela/evals/suite.py +316 -0
- prela/exporters/__init__.py +27 -0
- prela/exporters/base.py +189 -0
- prela/exporters/console.py +443 -0
- prela/exporters/file.py +322 -0
- prela/exporters/http.py +394 -0
- prela/exporters/multi.py +154 -0
- prela/exporters/otlp.py +388 -0
- prela/instrumentation/ANTHROPIC.md +297 -0
- prela/instrumentation/LANGCHAIN.md +480 -0
- prela/instrumentation/OPENAI.md +59 -0
- prela/instrumentation/__init__.py +49 -0
- prela/instrumentation/anthropic.py +1436 -0
- prela/instrumentation/auto.py +129 -0
- prela/instrumentation/base.py +436 -0
- prela/instrumentation/langchain.py +959 -0
- prela/instrumentation/llamaindex.py +719 -0
- prela/instrumentation/multi_agent/__init__.py +48 -0
- prela/instrumentation/multi_agent/autogen.py +357 -0
- prela/instrumentation/multi_agent/crewai.py +404 -0
- prela/instrumentation/multi_agent/langgraph.py +299 -0
- prela/instrumentation/multi_agent/models.py +203 -0
- prela/instrumentation/multi_agent/swarm.py +231 -0
- prela/instrumentation/n8n/__init__.py +68 -0
- prela/instrumentation/n8n/code_node.py +534 -0
- prela/instrumentation/n8n/models.py +336 -0
- prela/instrumentation/n8n/webhook.py +489 -0
- prela/instrumentation/openai.py +1198 -0
- prela/license.py +245 -0
- prela/replay/__init__.py +31 -0
- prela/replay/comparison.py +390 -0
- prela/replay/engine.py +1227 -0
- prela/replay/loader.py +231 -0
- prela/replay/result.py +196 -0
- prela-0.1.0.dist-info/METADATA +399 -0
- prela-0.1.0.dist-info/RECORD +71 -0
- prela-0.1.0.dist-info/WHEEL +4 -0
- prela-0.1.0.dist-info/entry_points.txt +2 -0
- prela-0.1.0.dist-info/licenses/LICENSE +190 -0
prela/evals/n8n/runner.py
@@ -0,0 +1,497 @@
"""
Evaluation runner for n8n workflows.

This module provides specialized evaluation tools for testing n8n workflows by:
1. Triggering workflows via n8n API/webhook
2. Waiting for execution completion
3. Fetching execution results
4. Running assertions on nodes and workflow outputs
"""

from __future__ import annotations

import asyncio
import time
from dataclasses import dataclass, field
from typing import Any, Optional

import httpx

from prela.core.clock import now
from prela.core.context import get_current_trace_id
from prela.core.span import SpanType
from prela.core.tracer import Tracer
from prela.evals.assertions.base import AssertionResult, BaseAssertion
from prela.evals.suite import EvalSuite


@dataclass
class N8nWorkflowEvalConfig:
    """Configuration for evaluating an n8n workflow.

    Attributes:
        workflow_id: n8n workflow ID to test
        n8n_base_url: Base URL of n8n instance (default: http://localhost:5678)
        n8n_api_key: API key for n8n authentication (optional)
        timeout_seconds: Maximum seconds to wait for workflow completion (default: 120)
        capture_traces: Whether to capture traces during execution (default: True)

    Example:
        >>> config = N8nWorkflowEvalConfig(
        ...     workflow_id="abc123",
        ...     n8n_base_url="https://n8n.example.com",
        ...     n8n_api_key="your-api-key",
        ...     timeout_seconds=60
        ... )
    """

    workflow_id: str
    n8n_base_url: str = "http://localhost:5678"
    n8n_api_key: Optional[str] = None
    timeout_seconds: int = 120
    capture_traces: bool = True


@dataclass
class N8nEvalCase:
    """Test case for an n8n workflow.

    Unlike regular EvalCase which requires EvalInput, N8nEvalCase uses trigger_data
    to start workflows and includes n8n-specific assertion capabilities.

    Attributes:
        id: Unique test case ID
        name: Human-readable test case name
        trigger_data: Data to send when triggering the workflow
        node_assertions: Mapping of node_name -> list of assertions to run on that node's output
        workflow_assertions: List of assertions to run on the complete workflow execution
        expected_output: Expected final output from the workflow (optional)
        tags: Optional tags for filtering/grouping
        timeout_seconds: Maximum execution time for this test case
        metadata: Additional metadata for this test case

    Example:
        >>> from prela.evals.assertions import ContainsAssertion
        >>> case = N8nEvalCase(
        ...     id="test_lead_scoring",
        ...     name="High-intent lead classification",
        ...     trigger_data={
        ...         "email": "I want to buy your product immediately",
        ...         "company": "ACME Corp"
        ...     },
        ...     node_assertions={
        ...         "AI Intent Classifier": [
        ...             ContainsAssertion(text="high_intent")
        ...         ],
        ...         "Lead Scorer": [
        ...             ContainsAssertion(text="score")
        ...         ]
        ...     },
        ...     expected_output={"intent": "high_intent", "score": 90}
        ... )
    """

    id: str
    name: str
    trigger_data: dict = field(default_factory=dict)
    node_assertions: Optional[dict[str, list[BaseAssertion]]] = None
    workflow_assertions: Optional[list[BaseAssertion]] = None
    expected_output: Optional[Any] = None
    tags: list[str] = field(default_factory=list)
    timeout_seconds: float = 30.0
    metadata: dict[str, Any] = field(default_factory=dict)


class N8nWorkflowEvalRunner:
    """
    Runs evaluations against n8n workflows.

    This runner triggers n8n workflows via the API, waits for completion,
    fetches results, and runs assertions on both node-level and workflow-level outputs.

    Example:
        >>> import asyncio
        >>> from prela.evals.n8n import N8nWorkflowEvalConfig, N8nWorkflowEvalRunner, N8nEvalCase
        >>>
        >>> config = N8nWorkflowEvalConfig(workflow_id="abc123")
        >>> runner = N8nWorkflowEvalRunner(config)
        >>>
        >>> case = N8nEvalCase(
        ...     id="test_1",
        ...     name="Test workflow",
        ...     trigger_data={"message": "Hello"}
        ... )
        >>>
        >>> result = asyncio.run(runner.run_case(case))
        >>> print(result["passed"])
    """

    def __init__(self, config: N8nWorkflowEvalConfig, tracer: Optional[Tracer] = None):
        """Initialize the n8n workflow evaluation runner.

        Args:
            config: Configuration for the n8n workflow evaluation
            tracer: Optional tracer for capturing execution traces
        """
        self.config = config
        self.tracer = tracer
        self.client = httpx.AsyncClient(
            base_url=config.n8n_base_url,
            headers=(
                {"X-N8N-API-KEY": config.n8n_api_key} if config.n8n_api_key else {}
            ),
            timeout=config.timeout_seconds,
        )

    async def run_case(self, case: N8nEvalCase) -> dict:
        """Run a single eval case against the n8n workflow.

        Args:
            case: The test case to run

        Returns:
            Dictionary with execution results including:
            - execution_id: n8n execution ID
            - status: Execution status (success, error, crashed)
            - duration_ms: Execution duration in milliseconds
            - node_results: Assertion results per node
            - workflow_results: Workflow-level assertion results
            - passed: Whether all assertions passed
            - output_mismatch: If expected_output provided and doesn't match

        Example:
            >>> result = await runner.run_case(case)
            >>> print(f"Passed: {result['passed']}")
            >>> print(f"Duration: {result['duration_ms']}ms")
        """
        start_time = time.perf_counter()

        # Create span if tracer available
        span = None
        if self.tracer and self.config.capture_traces:
            span = self.tracer.start_span(
                name=f"n8n.eval.{case.name}",
                span_type=SpanType.AGENT,
                attributes={
                    "eval.case_id": case.id,
                    "eval.case_name": case.name,
                    "n8n.workflow_id": self.config.workflow_id,
                },
            )

        try:
            # 1. Trigger the workflow
            execution_id = await self._trigger_workflow(case.trigger_data)

            # 2. Wait for completion
            execution_result = await self._wait_for_completion(execution_id)

            # 3. Build results structure
            duration_ms = (time.perf_counter() - start_time) * 1000

            results = {
                "execution_id": execution_id,
                "status": execution_result["status"],
                "duration_ms": duration_ms,
                "node_results": {},
                "workflow_results": [],
                "passed": True,
                "trace_id": get_current_trace_id() if self.tracer else None,
            }

            # 4. Run node-level assertions
            if case.node_assertions:
                for node_name, assertions in case.node_assertions.items():
                    node_data = self._get_node_data(execution_result, node_name)
                    node_results = []

                    for assertion in assertions:
                        try:
                            result = assertion.evaluate(
                                output=node_data, expected=None, trace=None
                            )
                            node_results.append(result)
                            if not result.passed:
                                results["passed"] = False
                        except Exception as e:
                            # Assertion evaluation failed
                            error_result = AssertionResult(
                                passed=False,
                                assertion_type="error",
                                message=f"Assertion failed: {str(e)}",
                                expected=None,
                                actual=None,
                            )
                            node_results.append(error_result)
                            results["passed"] = False

                    results["node_results"][node_name] = node_results

            # 5. Run workflow-level assertions
            if case.workflow_assertions:
                for assertion in case.workflow_assertions:
                    try:
                        result = assertion.evaluate(
                            output=execution_result, expected=None, trace=None
                        )
                        results["workflow_results"].append(result)
                        if not result.passed:
                            results["passed"] = False
                    except Exception as e:
                        error_result = AssertionResult(
                            passed=False,
                            assertion_type="error",
                            message=f"Assertion failed: {str(e)}",
                            expected=None,
                            actual=None,
                        )
                        results["workflow_results"].append(error_result)
                        results["passed"] = False

            # 6. Check expected output if provided
            if case.expected_output is not None:
                actual_output = execution_result.get("output")
                if actual_output != case.expected_output:
                    results["passed"] = False
                    results["output_mismatch"] = {
                        "expected": case.expected_output,
                        "actual": actual_output,
                    }

            # End span with success
            if span:
                span.set_attribute("eval.passed", results["passed"])
                span.set_attribute("eval.duration_ms", duration_ms)
                span.end()

            return results

        except Exception as e:
            # Execution failed
            duration_ms = (time.perf_counter() - start_time) * 1000

            if span:
                span.set_attribute("eval.passed", False)
                span.set_attribute("eval.error", str(e))
                span.end()

            return {
                "execution_id": None,
                "status": "error",
                "duration_ms": duration_ms,
                "node_results": {},
                "workflow_results": [],
                "passed": False,
                "error": str(e),
                "trace_id": get_current_trace_id() if self.tracer else None,
            }

    async def run_suite(self, suite: EvalSuite) -> dict:
        """Run a full evaluation suite against the n8n workflow.

        Args:
            suite: The evaluation suite containing test cases

        Returns:
            Dictionary with aggregated results:
            - suite_name: Name of the suite
            - total: Total number of test cases
            - passed: Number of passed test cases
            - failed: Number of failed test cases
            - cases: List of individual case results

        Example:
            >>> suite = EvalSuite(name="Lead Scoring Tests", cases=[case1, case2])
            >>> results = await runner.run_suite(suite)
            >>> print(f"Pass rate: {results['passed']}/{results['total']}")
        """
        results = {
            "suite_name": suite.name,
            "total": len(suite.cases),
            "passed": 0,
            "failed": 0,
            "cases": [],
        }

        # Run setup if provided
        if suite.setup:
            try:
                suite.setup()
            except Exception as e:
                # Setup failed, abort suite
                return {
                    **results,
                    "setup_error": str(e),
                    "failed": len(suite.cases),
                }

        # Execute each case
        for case in suite.cases:
            case_result = await self.run_case(case)
            results["cases"].append(case_result)

            if case_result["passed"]:
                results["passed"] += 1
            else:
                results["failed"] += 1

        # Run teardown if provided
        if suite.teardown:
            try:
                suite.teardown()
            except Exception as e:
                # Teardown failed, include in results
                results["teardown_error"] = str(e)

        return results

    async def _trigger_workflow(self, trigger_data: dict) -> str:
        """Trigger the n8n workflow and return execution ID.

        Args:
            trigger_data: Data to send to the workflow trigger

        Returns:
            Execution ID from n8n

        Raises:
            httpx.HTTPStatusError: If the API request fails
        """
        # Use n8n API to execute workflow
        # POST /api/v1/workflows/{workflow_id}/execute
        response = await self.client.post(
            f"/api/v1/workflows/{self.config.workflow_id}/execute",
            json={"data": trigger_data},
        )
        response.raise_for_status()
        data = response.json()
        return data["data"]["executionId"]

    async def _wait_for_completion(self, execution_id: str) -> dict:
        """Poll for workflow execution completion.

        Args:
            execution_id: n8n execution ID to poll

        Returns:
            Execution result data from n8n

        Raises:
            TimeoutError: If execution doesn't complete within timeout
            httpx.HTTPStatusError: If the API request fails
        """
        start_time = asyncio.get_event_loop().time()

        while True:
            # GET /api/v1/executions/{execution_id}
            response = await self.client.get(f"/api/v1/executions/{execution_id}")
            response.raise_for_status()
            data = response.json()["data"]

            # Check if execution completed
            if data["status"] in ["success", "error", "crashed"]:
                return data

            # Check timeout
            elapsed = asyncio.get_event_loop().time() - start_time
            if elapsed > self.config.timeout_seconds:
                raise TimeoutError(
                    f"Workflow execution {execution_id} timed out after {self.config.timeout_seconds}s"
                )

            # Wait before polling again
            await asyncio.sleep(1)

    def _get_node_data(
        self, execution_result: dict, node_name: str
    ) -> Optional[dict]:
        """Extract data for a specific node from execution result.

        Args:
            execution_result: Complete execution result from n8n (already unwrapped from response.json()["data"])
            node_name: Name of the node to extract data for

        Returns:
            Node data dictionary or None if node not found
        """
        # Try different possible structures
        # Structure 1: resultData.runData[node_name]
        if "resultData" in execution_result:
            run_data = execution_result["resultData"].get("runData", {})
            if node_name in run_data:
                return run_data[node_name]

        # Structure 2: nodes array
        for node in execution_result.get("nodes", []):
            if node.get("name") == node_name:
                return node

        return None

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()


async def eval_n8n_workflow(
    workflow_id: str,
    test_cases: list[N8nEvalCase],
    n8n_url: str = "http://localhost:5678",
    n8n_api_key: Optional[str] = None,
    timeout_seconds: int = 120,
    tracer: Optional[Tracer] = None,
) -> dict:
    """
    Quick way to run evaluations against an n8n workflow.

    This is a convenience function that sets up the configuration, runner,
    and suite, then executes all test cases.

    Args:
        workflow_id: n8n workflow ID to test
        test_cases: List of N8nEvalCase instances
        n8n_url: Base URL of n8n instance (default: http://localhost:5678)
        n8n_api_key: API key for n8n authentication (optional)
        timeout_seconds: Maximum seconds to wait for each execution (default: 120)
        tracer: Optional tracer for capturing execution traces

    Returns:
        Dictionary with evaluation results (see run_suite for structure)

    Example:
        >>> from prela.evals.n8n import eval_n8n_workflow, N8nEvalCase
        >>> from prela.evals.assertions import ContainsAssertion
        >>>
        >>> results = await eval_n8n_workflow(
        ...     workflow_id="abc123",
        ...     test_cases=[
        ...         N8nEvalCase(
        ...             id="test_1",
        ...             name="High-intent lead",
        ...             trigger_data={"email": "I want to buy..."},
        ...             node_assertions={
        ...                 "Classify Intent": [
        ...                     ContainsAssertion(substring="high")
        ...                 ]
        ...             }
        ...         )
        ...     ],
        ...     n8n_url="https://n8n.example.com",
        ...     n8n_api_key="your-api-key"
        ... )
        >>> print(f"Pass rate: {results['passed']}/{results['total']}")
    """
    config = N8nWorkflowEvalConfig(
        workflow_id=workflow_id,
        n8n_base_url=n8n_url,
        n8n_api_key=n8n_api_key,
        timeout_seconds=timeout_seconds,
    )

    runner = N8nWorkflowEvalRunner(config, tracer=tracer)

    try:
        suite = EvalSuite(name=f"n8n-{workflow_id}", cases=test_cases)
        return await runner.run_suite(suite)
    finally:
        await runner.close()
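A minimal sketch of driving the `eval_n8n_workflow` helper above from a test, assuming `pytest` with `pytest-asyncio`, a reachable n8n instance, and placeholder workflow ID and environment variable names:

```python
import os

import pytest

from prela.evals.n8n import N8nEvalCase, eval_n8n_workflow


@pytest.mark.asyncio
async def test_workflow_smoke():
    results = await eval_n8n_workflow(
        workflow_id="abc123",  # hypothetical workflow ID
        test_cases=[
            N8nEvalCase(
                id="smoke_test",
                name="Webhook round-trip",
                trigger_data={"message": "Hello"},
            ),
        ],
        n8n_url=os.environ.get("N8N_URL", "http://localhost:5678"),
        n8n_api_key=os.environ.get("N8N_API_KEY"),
    )
    # run_suite() returns aggregate counts plus per-case results.
    assert results["failed"] == 0, results["cases"]
```

Because `eval_n8n_workflow` closes the runner's HTTP client in a `finally` block, the test needs no extra cleanup.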
prela/evals/reporters/README.md
@@ -0,0 +1,184 @@
# Evaluation Reporters

Three production-ready reporters for outputting evaluation results in different formats.

## Quick Start

```python
from prela.evals import EvalRunner
from prela.evals.reporters import ConsoleReporter, JSONReporter, JUnitReporter

# Run your evaluation
runner = EvalRunner(suite, agent)
result = runner.run()

# Report to terminal
console = ConsoleReporter(verbose=True)
console.report(result)

# Save to JSON
json_reporter = JSONReporter("results.json")
json_reporter.report(result)

# Generate JUnit XML for CI
junit = JUnitReporter("junit.xml")
junit.report(result)
```

## ConsoleReporter

Beautiful terminal output with colors and tables.

**Parameters:**
- `verbose` (bool): Show detailed failure information (default: True)
- `use_colors` (bool): Use colored output via rich library (default: True)

**Example:**
```python
reporter = ConsoleReporter(verbose=True, use_colors=True)
reporter.report(result)
```

**Output:**
```
╭──────────────────────────── ✓ Test Suite ────────────────────────────╮
│ Total: 10 | Passed: 9 (90.0%) | Failed: 1                            │
│ Duration: 2.50s                                                      │
╰──────────────────────────────────────────────────────────────────────╯
```

**Use Cases:**
- Development and debugging
- Quick visual feedback
- Local testing

## JSONReporter

Structured JSON output for programmatic access.

**Parameters:**
- `output_path` (str | Path): Path to output JSON file
- `indent` (int): JSON indentation (default: 2, use None for compact)

**Example:**
```python
reporter = JSONReporter("eval_results/run_001.json", indent=2)
reporter.report(result)
```

**Output:**
```json
{
  "suite_name": "Test Suite",
  "started_at": "2026-01-27T14:30:00+00:00",
  "summary": {
    "total_cases": 10,
    "passed_cases": 9,
    "pass_rate": 0.9
  },
  "case_results": [...]
}
```

**Use Cases:**
- Data analysis
- Historical tracking
- Programmatic processing
- Integration with analytics tools
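For programmatic processing, a minimal sketch that reads the report written above, using only the `suite_name` and `summary` fields shown in the sample output (the 0.9 threshold is an arbitrary example):

```python
import json
from pathlib import Path

# Load the report written by JSONReporter("eval_results/run_001.json").
data = json.loads(Path("eval_results/run_001.json").read_text())
summary = data["summary"]

print(f"{data['suite_name']}: {summary['passed_cases']}/{summary['total_cases']} passed")

# Example gate: treat a pass rate below 90% as a failure.
if summary["pass_rate"] < 0.9:
    raise SystemExit(1)
```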

## JUnitReporter

JUnit XML format for CI/CD integration.

**Parameters:**
- `output_path` (str | Path): Path to output XML file

**Example:**
```python
reporter = JUnitReporter("test-results/junit.xml")
reporter.report(result)
```

**Output:**
```xml
<?xml version='1.0' encoding='utf-8'?>
<testsuite name="Test Suite" tests="10" failures="1" ...>
  <testcase name="Test 1" classname="Test Suite" time="0.145">
    ...
  </testcase>
</testsuite>
```

**Use Cases:**
- CI/CD integration (Jenkins, GitHub Actions, GitLab)
- Test result visualization
- Automated failure notifications
- Test trend tracking

## Using Multiple Reporters

Report to multiple outputs simultaneously:

```python
# Run evaluation once
result = runner.run()

# Report to multiple outputs
reporters = [
    ConsoleReporter(verbose=False),      # Terminal output
    JSONReporter("results/eval.json"),   # Data export
    JUnitReporter("results/junit.xml"),  # CI integration
]

for reporter in reporters:
    reporter.report(result)
```

## CI/CD Integration Examples

### GitHub Actions

```yaml
- name: Run evaluations
  run: |
    python run_evals.py

- name: Publish test results
  uses: EnricoMi/publish-unit-test-result-action@v2
  if: always()
  with:
    files: test-results/junit.xml
```

### GitLab CI

```yaml
test:
  script:
    - python run_evals.py
  artifacts:
    when: always
    reports:
      junit: test-results/junit.xml
```

### Jenkins

```groovy
stage('Test') {
    steps {
        sh 'python run_evals.py'
    }
    post {
        always {
            junit 'test-results/junit.xml'
        }
    }
}
```
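Each of the snippets above invokes a `run_evals.py` entry point that is not shown here. A minimal sketch of what such a script might look like, assembled from the Quick Start pieces (the `my_project.evals` imports are hypothetical placeholders, and the exit check reuses the `summary` field names from the JSONReporter sample):

```python
# run_evals.py -- hypothetical CI entry point; adapt the suite/agent wiring to your project.
import json
import sys
from pathlib import Path

from prela.evals import EvalRunner
from prela.evals.reporters import ConsoleReporter, JSONReporter, JUnitReporter

from my_project.evals import build_agent, build_suite  # placeholder factories

runner = EvalRunner(build_suite(), build_agent())
result = runner.run()

# Write every format in one pass: terminal summary, JSON for analysis, JUnit for CI.
ConsoleReporter(verbose=False).report(result)
JSONReporter("test-results/eval.json").report(result)
JUnitReporter("test-results/junit.xml").report(result)

# Fail the CI stage when any case failed (field names as in the JSON sample above).
summary = json.loads(Path("test-results/eval.json").read_text())["summary"]
sys.exit(0 if summary["passed_cases"] == summary["total_cases"] else 1)
```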

## See Also

- Full demo: `examples/reporters_demo.py`
- Tests: `tests/test_evals/test_reporters.py`
- Documentation: `/REPORTERS_IMPLEMENTATION_SUMMARY.md`