prela-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. prela/__init__.py +394 -0
  2. prela/_version.py +3 -0
  3. prela/contrib/CLI.md +431 -0
  4. prela/contrib/README.md +118 -0
  5. prela/contrib/__init__.py +5 -0
  6. prela/contrib/cli.py +1063 -0
  7. prela/contrib/explorer.py +571 -0
  8. prela/core/__init__.py +64 -0
  9. prela/core/clock.py +98 -0
  10. prela/core/context.py +228 -0
  11. prela/core/replay.py +403 -0
  12. prela/core/sampler.py +178 -0
  13. prela/core/span.py +295 -0
  14. prela/core/tracer.py +498 -0
  15. prela/evals/__init__.py +94 -0
  16. prela/evals/assertions/README.md +484 -0
  17. prela/evals/assertions/__init__.py +78 -0
  18. prela/evals/assertions/base.py +90 -0
  19. prela/evals/assertions/multi_agent.py +625 -0
  20. prela/evals/assertions/semantic.py +223 -0
  21. prela/evals/assertions/structural.py +443 -0
  22. prela/evals/assertions/tool.py +380 -0
  23. prela/evals/case.py +370 -0
  24. prela/evals/n8n/__init__.py +69 -0
  25. prela/evals/n8n/assertions.py +450 -0
  26. prela/evals/n8n/runner.py +497 -0
  27. prela/evals/reporters/README.md +184 -0
  28. prela/evals/reporters/__init__.py +32 -0
  29. prela/evals/reporters/console.py +251 -0
  30. prela/evals/reporters/json.py +176 -0
  31. prela/evals/reporters/junit.py +278 -0
  32. prela/evals/runner.py +525 -0
  33. prela/evals/suite.py +316 -0
  34. prela/exporters/__init__.py +27 -0
  35. prela/exporters/base.py +189 -0
  36. prela/exporters/console.py +443 -0
  37. prela/exporters/file.py +322 -0
  38. prela/exporters/http.py +394 -0
  39. prela/exporters/multi.py +154 -0
  40. prela/exporters/otlp.py +388 -0
  41. prela/instrumentation/ANTHROPIC.md +297 -0
  42. prela/instrumentation/LANGCHAIN.md +480 -0
  43. prela/instrumentation/OPENAI.md +59 -0
  44. prela/instrumentation/__init__.py +49 -0
  45. prela/instrumentation/anthropic.py +1436 -0
  46. prela/instrumentation/auto.py +129 -0
  47. prela/instrumentation/base.py +436 -0
  48. prela/instrumentation/langchain.py +959 -0
  49. prela/instrumentation/llamaindex.py +719 -0
  50. prela/instrumentation/multi_agent/__init__.py +48 -0
  51. prela/instrumentation/multi_agent/autogen.py +357 -0
  52. prela/instrumentation/multi_agent/crewai.py +404 -0
  53. prela/instrumentation/multi_agent/langgraph.py +299 -0
  54. prela/instrumentation/multi_agent/models.py +203 -0
  55. prela/instrumentation/multi_agent/swarm.py +231 -0
  56. prela/instrumentation/n8n/__init__.py +68 -0
  57. prela/instrumentation/n8n/code_node.py +534 -0
  58. prela/instrumentation/n8n/models.py +336 -0
  59. prela/instrumentation/n8n/webhook.py +489 -0
  60. prela/instrumentation/openai.py +1198 -0
  61. prela/license.py +245 -0
  62. prela/replay/__init__.py +31 -0
  63. prela/replay/comparison.py +390 -0
  64. prela/replay/engine.py +1227 -0
  65. prela/replay/loader.py +231 -0
  66. prela/replay/result.py +196 -0
  67. prela-0.1.0.dist-info/METADATA +399 -0
  68. prela-0.1.0.dist-info/RECORD +71 -0
  69. prela-0.1.0.dist-info/WHEEL +4 -0
  70. prela-0.1.0.dist-info/entry_points.txt +2 -0
  71. prela-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,497 @@
+ """
+ Evaluation runner for n8n workflows.
+
+ This module provides specialized evaluation tools for testing n8n workflows by:
+ 1. Triggering workflows via n8n API/webhook
+ 2. Waiting for execution completion
+ 3. Fetching execution results
+ 4. Running assertions on nodes and workflow outputs
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+ import httpx
+
+ from prela.core.clock import now
+ from prela.core.context import get_current_trace_id
+ from prela.core.span import SpanType
+ from prela.core.tracer import Tracer
+ from prela.evals.assertions.base import AssertionResult, BaseAssertion
+ from prela.evals.suite import EvalSuite
+
+
+ @dataclass
+ class N8nWorkflowEvalConfig:
+     """Configuration for evaluating an n8n workflow.
+
+     Attributes:
+         workflow_id: n8n workflow ID to test
+         n8n_base_url: Base URL of n8n instance (default: http://localhost:5678)
+         n8n_api_key: API key for n8n authentication (optional)
+         timeout_seconds: Maximum seconds to wait for workflow completion (default: 120)
+         capture_traces: Whether to capture traces during execution (default: True)
+
+     Example:
+         >>> config = N8nWorkflowEvalConfig(
+         ... workflow_id="abc123",
+         ... n8n_base_url="https://n8n.example.com",
+         ... n8n_api_key="your-api-key",
+         ... timeout_seconds=60
+         ... )
+     """
+
+     workflow_id: str
+     n8n_base_url: str = "http://localhost:5678"
+     n8n_api_key: Optional[str] = None
+     timeout_seconds: int = 120
+     capture_traces: bool = True
+
+
+ @dataclass
+ class N8nEvalCase:
+     """Test case for an n8n workflow.
+
+     Unlike regular EvalCase which requires EvalInput, N8nEvalCase uses trigger_data
+     to start workflows and includes n8n-specific assertion capabilities.
+
+     Attributes:
+         id: Unique test case ID
+         name: Human-readable test case name
+         trigger_data: Data to send when triggering the workflow
+         node_assertions: Mapping of node_name -> list of assertions to run on that node's output
+         workflow_assertions: List of assertions to run on the complete workflow execution
+         expected_output: Expected final output from the workflow (optional)
+         tags: Optional tags for filtering/grouping
+         timeout_seconds: Maximum execution time for this test case
+         metadata: Additional metadata for this test case
+
+     Example:
+         >>> from prela.evals.assertions import ContainsAssertion
+         >>> case = N8nEvalCase(
+         ... id="test_lead_scoring",
+         ... name="High-intent lead classification",
+         ... trigger_data={
+         ... "email": "I want to buy your product immediately",
+         ... "company": "ACME Corp"
+         ... },
+         ... node_assertions={
+         ... "AI Intent Classifier": [
+         ... ContainsAssertion(text="high_intent")
+         ... ],
+         ... "Lead Scorer": [
+         ... ContainsAssertion(text="score")
+         ... ]
+         ... },
+         ... expected_output={"intent": "high_intent", "score": 90}
+         ... )
+     """
+
+     id: str
+     name: str
+     trigger_data: dict = field(default_factory=dict)
+     node_assertions: Optional[dict[str, list[BaseAssertion]]] = None
+     workflow_assertions: Optional[list[BaseAssertion]] = None
+     expected_output: Optional[Any] = None
+     tags: list[str] = field(default_factory=list)
+     timeout_seconds: float = 30.0
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ class N8nWorkflowEvalRunner:
+     """
+     Runs evaluations against n8n workflows.
+
+     This runner triggers n8n workflows via the API, waits for completion,
+     fetches results, and runs assertions on both node-level and workflow-level outputs.
+
+     Example:
+         >>> import asyncio
+         >>> from prela.evals.n8n import N8nWorkflowEvalConfig, N8nWorkflowEvalRunner, N8nEvalCase
+         >>>
+         >>> config = N8nWorkflowEvalConfig(workflow_id="abc123")
+         >>> runner = N8nWorkflowEvalRunner(config)
+         >>>
+         >>> case = N8nEvalCase(
+         ... id="test_1",
+         ... name="Test workflow",
+         ... trigger_data={"message": "Hello"}
+         ... )
+         >>>
+         >>> result = asyncio.run(runner.run_case(case))
+         >>> print(result["passed"])
+     """
+
+     def __init__(self, config: N8nWorkflowEvalConfig, tracer: Optional[Tracer] = None):
+         """Initialize the n8n workflow evaluation runner.
+
+         Args:
+             config: Configuration for the n8n workflow evaluation
+             tracer: Optional tracer for capturing execution traces
+         """
+         self.config = config
+         self.tracer = tracer
+         self.client = httpx.AsyncClient(
+             base_url=config.n8n_base_url,
+             headers=(
+                 {"X-N8N-API-KEY": config.n8n_api_key} if config.n8n_api_key else {}
+             ),
+             timeout=config.timeout_seconds,
+         )
+
+     async def run_case(self, case: N8nEvalCase) -> dict:
+         """Run a single eval case against the n8n workflow.
+
+         Args:
+             case: The test case to run
+
+         Returns:
+             Dictionary with execution results including:
+             - execution_id: n8n execution ID
+             - status: Execution status (success, error, crashed)
+             - duration_ms: Execution duration in milliseconds
+             - node_results: Assertion results per node
+             - workflow_results: Workflow-level assertion results
+             - passed: Whether all assertions passed
+             - output_mismatch: If expected_output provided and doesn't match
+
+         Example:
+             >>> result = await runner.run_case(case)
+             >>> print(f"Passed: {result['passed']}")
+             >>> print(f"Duration: {result['duration_ms']}ms")
+         """
+         start_time = time.perf_counter()
+
+         # Create span if tracer available
+         span = None
+         if self.tracer and self.config.capture_traces:
+             span = self.tracer.start_span(
+                 name=f"n8n.eval.{case.name}",
+                 span_type=SpanType.AGENT,
+                 attributes={
+                     "eval.case_id": case.id,
+                     "eval.case_name": case.name,
+                     "n8n.workflow_id": self.config.workflow_id,
+                 },
+             )
+
+         try:
+             # 1. Trigger the workflow
+             execution_id = await self._trigger_workflow(case.trigger_data)
+
+             # 2. Wait for completion
+             execution_result = await self._wait_for_completion(execution_id)
+
+             # 3. Build results structure
+             duration_ms = (time.perf_counter() - start_time) * 1000
+
+             results = {
+                 "execution_id": execution_id,
+                 "status": execution_result["status"],
+                 "duration_ms": duration_ms,
+                 "node_results": {},
+                 "workflow_results": [],
+                 "passed": True,
+                 "trace_id": get_current_trace_id() if self.tracer else None,
+             }
+
+             # 4. Run node-level assertions
+             if case.node_assertions:
+                 for node_name, assertions in case.node_assertions.items():
+                     node_data = self._get_node_data(execution_result, node_name)
+                     node_results = []
+
+                     for assertion in assertions:
+                         try:
+                             result = assertion.evaluate(
+                                 output=node_data, expected=None, trace=None
+                             )
+                             node_results.append(result)
+                             if not result.passed:
+                                 results["passed"] = False
+                         except Exception as e:
+                             # Assertion evaluation failed
+                             error_result = AssertionResult(
+                                 passed=False,
+                                 assertion_type="error",
+                                 message=f"Assertion failed: {str(e)}",
+                                 expected=None,
+                                 actual=None,
+                             )
+                             node_results.append(error_result)
+                             results["passed"] = False
+
+                     results["node_results"][node_name] = node_results
+
+             # 5. Run workflow-level assertions
+             if case.workflow_assertions:
+                 for assertion in case.workflow_assertions:
+                     try:
+                         result = assertion.evaluate(
+                             output=execution_result, expected=None, trace=None
+                         )
+                         results["workflow_results"].append(result)
+                         if not result.passed:
+                             results["passed"] = False
+                     except Exception as e:
+                         error_result = AssertionResult(
+                             passed=False,
+                             assertion_type="error",
+                             message=f"Assertion failed: {str(e)}",
+                             expected=None,
+                             actual=None,
+                         )
+                         results["workflow_results"].append(error_result)
+                         results["passed"] = False
+
+             # 6. Check expected output if provided
+             if case.expected_output is not None:
+                 actual_output = execution_result.get("output")
+                 if actual_output != case.expected_output:
+                     results["passed"] = False
+                     results["output_mismatch"] = {
+                         "expected": case.expected_output,
+                         "actual": actual_output,
+                     }
+
+             # End span with success
+             if span:
+                 span.set_attribute("eval.passed", results["passed"])
+                 span.set_attribute("eval.duration_ms", duration_ms)
+                 span.end()
+
+             return results
+
+         except Exception as e:
+             # Execution failed
+             duration_ms = (time.perf_counter() - start_time) * 1000
+
+             if span:
+                 span.set_attribute("eval.passed", False)
+                 span.set_attribute("eval.error", str(e))
+                 span.end()
+
+             return {
+                 "execution_id": None,
+                 "status": "error",
+                 "duration_ms": duration_ms,
+                 "node_results": {},
+                 "workflow_results": [],
+                 "passed": False,
+                 "error": str(e),
+                 "trace_id": get_current_trace_id() if self.tracer else None,
+             }
+
+     async def run_suite(self, suite: EvalSuite) -> dict:
+         """Run a full evaluation suite against the n8n workflow.
+
+         Args:
+             suite: The evaluation suite containing test cases
+
+         Returns:
+             Dictionary with aggregated results:
+             - suite_name: Name of the suite
+             - total: Total number of test cases
+             - passed: Number of passed test cases
+             - failed: Number of failed test cases
+             - cases: List of individual case results
+
+         Example:
+             >>> suite = EvalSuite(name="Lead Scoring Tests", cases=[case1, case2])
+             >>> results = await runner.run_suite(suite)
+             >>> print(f"Pass rate: {results['passed']}/{results['total']}")
+         """
+         results = {
+             "suite_name": suite.name,
+             "total": len(suite.cases),
+             "passed": 0,
+             "failed": 0,
+             "cases": [],
+         }
+
+         # Run setup if provided
+         if suite.setup:
+             try:
+                 suite.setup()
+             except Exception as e:
+                 # Setup failed, abort suite
+                 return {
+                     **results,
+                     "setup_error": str(e),
+                     "failed": len(suite.cases),
+                 }
+
+         # Execute each case
+         for case in suite.cases:
+             case_result = await self.run_case(case)
+             results["cases"].append(case_result)
+
+             if case_result["passed"]:
+                 results["passed"] += 1
+             else:
+                 results["failed"] += 1
+
+         # Run teardown if provided
+         if suite.teardown:
+             try:
+                 suite.teardown()
+             except Exception as e:
+                 # Teardown failed, include in results
+                 results["teardown_error"] = str(e)
+
+         return results
+
+     async def _trigger_workflow(self, trigger_data: dict) -> str:
+         """Trigger the n8n workflow and return execution ID.
+
+         Args:
+             trigger_data: Data to send to the workflow trigger
+
+         Returns:
+             Execution ID from n8n
+
+         Raises:
+             httpx.HTTPStatusError: If the API request fails
+         """
+         # Use n8n API to execute workflow
+         # POST /api/v1/workflows/{workflow_id}/execute
+         response = await self.client.post(
+             f"/api/v1/workflows/{self.config.workflow_id}/execute",
+             json={"data": trigger_data},
+         )
+         response.raise_for_status()
+         data = response.json()
+         return data["data"]["executionId"]
+
+     async def _wait_for_completion(self, execution_id: str) -> dict:
+         """Poll for workflow execution completion.
+
+         Args:
+             execution_id: n8n execution ID to poll
+
+         Returns:
+             Execution result data from n8n
+
+         Raises:
+             TimeoutError: If execution doesn't complete within timeout
+             httpx.HTTPStatusError: If the API request fails
+         """
+         start_time = asyncio.get_event_loop().time()
+
+         while True:
+             # GET /api/v1/executions/{execution_id}
+             response = await self.client.get(f"/api/v1/executions/{execution_id}")
+             response.raise_for_status()
+             data = response.json()["data"]
+
+             # Check if execution completed
+             if data["status"] in ["success", "error", "crashed"]:
+                 return data
+
+             # Check timeout
+             elapsed = asyncio.get_event_loop().time() - start_time
+             if elapsed > self.config.timeout_seconds:
+                 raise TimeoutError(
+                     f"Workflow execution {execution_id} timed out after {self.config.timeout_seconds}s"
+                 )
+
+             # Wait before polling again
+             await asyncio.sleep(1)
+
+     def _get_node_data(
+         self, execution_result: dict, node_name: str
+     ) -> Optional[dict]:
+         """Extract data for a specific node from execution result.
+
+         Args:
+             execution_result: Complete execution result from n8n (already unwrapped from response.json()["data"])
+             node_name: Name of the node to extract data for
+
+         Returns:
+             Node data dictionary or None if node not found
+         """
+         # Try different possible structures
+         # Structure 1: resultData.runData[node_name]
+         if "resultData" in execution_result:
+             run_data = execution_result["resultData"].get("runData", {})
+             if node_name in run_data:
+                 return run_data[node_name]
+
+         # Structure 2: nodes array
+         for node in execution_result.get("nodes", []):
+             if node.get("name") == node_name:
+                 return node
+
+         return None
+
+     async def close(self):
+         """Close the HTTP client."""
+         await self.client.aclose()
+
+
+ async def eval_n8n_workflow(
+     workflow_id: str,
+     test_cases: list[N8nEvalCase],
+     n8n_url: str = "http://localhost:5678",
+     n8n_api_key: Optional[str] = None,
+     timeout_seconds: int = 120,
+     tracer: Optional[Tracer] = None,
+ ) -> dict:
+     """
+     Quick way to run evaluations against an n8n workflow.
+
+     This is a convenience function that sets up the configuration, runner,
+     and suite, then executes all test cases.
+
+     Args:
+         workflow_id: n8n workflow ID to test
+         test_cases: List of N8nEvalCase instances
+         n8n_url: Base URL of n8n instance (default: http://localhost:5678)
+         n8n_api_key: API key for n8n authentication (optional)
+         timeout_seconds: Maximum seconds to wait for each execution (default: 120)
+         tracer: Optional tracer for capturing execution traces
+
+     Returns:
+         Dictionary with evaluation results (see run_suite for structure)
+
+     Example:
+         >>> from prela.evals.n8n import eval_n8n_workflow, N8nEvalCase
+         >>> from prela.evals.assertions import ContainsAssertion
+         >>>
+         >>> results = await eval_n8n_workflow(
+         ... workflow_id="abc123",
+         ... test_cases=[
+         ... N8nEvalCase(
+         ... id="test_1",
+         ... name="High-intent lead",
+         ... trigger_data={"email": "I want to buy..."},
+         ... node_assertions={
+         ... "Classify Intent": [
+         ... ContainsAssertion(substring="high")
+         ... ]
+         ... }
+         ... )
+         ... ],
+         ... n8n_url="https://n8n.example.com",
+         ... n8n_api_key="your-api-key"
+         ... )
+         >>> print(f"Pass rate: {results['passed']}/{results['total']}")
+     """
+     config = N8nWorkflowEvalConfig(
+         workflow_id=workflow_id,
+         n8n_base_url=n8n_url,
+         n8n_api_key=n8n_api_key,
+         timeout_seconds=timeout_seconds,
+     )
+
+     runner = N8nWorkflowEvalRunner(config, tracer=tracer)
+
+     try:
+         suite = EvalSuite(name=f"n8n-{workflow_id}", cases=test_cases)
+         return await runner.run_suite(suite)
+     finally:
+         await runner.close()
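
Taken together, `prela/evals/n8n/runner.py` exposes one high-level entry point, `eval_n8n_workflow()`. Below is a minimal usage sketch based only on the docstrings above, assuming a reachable n8n instance, a hypothetical workflow ID (`"abc123"`) and node name (`"Respond"`); note the module's own docstrings show the `ContainsAssertion` keyword as both `text=` and `substring=`, so check the assertion's actual signature before relying on it.

```python
# Minimal sketch assembled from the docstrings in prela/evals/n8n/runner.py.
# The workflow id and node name are hypothetical; the ContainsAssertion
# keyword follows the eval_n8n_workflow docstring example.
import asyncio

from prela.evals.assertions import ContainsAssertion
from prela.evals.n8n import N8nEvalCase, eval_n8n_workflow


async def main() -> None:
    cases = [
        N8nEvalCase(
            id="smoke_1",
            name="Webhook responds with a greeting",
            trigger_data={"message": "Hello"},
            node_assertions={
                # Node names must match the node names in the n8n workflow.
                "Respond": [ContainsAssertion(substring="Hello")],
            },
        ),
    ]

    # Triggers the workflow once per case, polls /api/v1/executions/{id}
    # until it reaches success/error/crashed, then aggregates assertions.
    results = await eval_n8n_workflow(
        workflow_id="abc123",             # hypothetical workflow id
        test_cases=cases,
        n8n_url="http://localhost:5678",
        n8n_api_key=None,                 # set if the instance requires auth
    )
    print(f"Pass rate: {results['passed']}/{results['total']}")


if __name__ == "__main__":
    asyncio.run(main())
```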
@@ -0,0 +1,184 @@
+ # Evaluation Reporters
+
+ Three production-ready reporters for outputting evaluation results in different formats.
+
+ ## Quick Start
+
+ ```python
+ from prela.evals import EvalRunner
+ from prela.evals.reporters import ConsoleReporter, JSONReporter, JUnitReporter
+
+ # Run your evaluation
+ runner = EvalRunner(suite, agent)
+ result = runner.run()
+
+ # Report to terminal
+ console = ConsoleReporter(verbose=True)
+ console.report(result)
+
+ # Save to JSON
+ json_reporter = JSONReporter("results.json")
+ json_reporter.report(result)
+
+ # Generate JUnit XML for CI
+ junit = JUnitReporter("junit.xml")
+ junit.report(result)
+ ```
+
+ ## ConsoleReporter
+
+ Beautiful terminal output with colors and tables.
+
+ **Parameters:**
+ - `verbose` (bool): Show detailed failure information (default: True)
+ - `use_colors` (bool): Use colored output via rich library (default: True)
+
+ **Example:**
+ ```python
+ reporter = ConsoleReporter(verbose=True, use_colors=True)
+ reporter.report(result)
+ ```
+
+ **Output:**
+ ```
+ ╭──────────────────────────── ✓ Test Suite ────────────────────────────╮
+ │ Total: 10 | Passed: 9 (90.0%) | Failed: 1 │
+ │ Duration: 2.50s │
+ ╰──────────────────────────────────────────────────────────────────────╯
+ ```
+
+ **Use Cases:**
+ - Development and debugging
+ - Quick visual feedback
+ - Local testing
+
+ ## JSONReporter
+
+ Structured JSON output for programmatic access.
+
+ **Parameters:**
+ - `output_path` (str | Path): Path to output JSON file
+ - `indent` (int): JSON indentation (default: 2, use None for compact)
+
+ **Example:**
+ ```python
+ reporter = JSONReporter("eval_results/run_001.json", indent=2)
+ reporter.report(result)
+ ```
+
+ **Output:**
+ ```json
+ {
+   "suite_name": "Test Suite",
+   "started_at": "2026-01-27T14:30:00+00:00",
+   "summary": {
+     "total_cases": 10,
+     "passed_cases": 9,
+     "pass_rate": 0.9
+   },
+   "case_results": [...]
+ }
+ ```
+
+ **Use Cases:**
+ - Data analysis
+ - Historical tracking
+ - Programmatic processing
+ - Integration with analytics tools
+
+ ## JUnitReporter
+
+ JUnit XML format for CI/CD integration.
+
+ **Parameters:**
+ - `output_path` (str | Path): Path to output XML file
+
+ **Example:**
+ ```python
+ reporter = JUnitReporter("test-results/junit.xml")
+ reporter.report(result)
+ ```
+
+ **Output:**
+ ```xml
+ <?xml version='1.0' encoding='utf-8'?>
+ <testsuite name="Test Suite" tests="10" failures="1" ...>
+   <testcase name="Test 1" classname="Test Suite" time="0.145">
+     ...
+   </testcase>
+ </testsuite>
+ ```
+
+ **Use Cases:**
+ - CI/CD integration (Jenkins, GitHub Actions, GitLab)
+ - Test result visualization
+ - Automated failure notifications
+ - Test trend tracking
+
+ ## Using Multiple Reporters
+
+ Report to multiple outputs simultaneously:
+
+ ```python
+ # Run evaluation once
+ result = runner.run()
+
+ # Report to multiple outputs
+ reporters = [
+     ConsoleReporter(verbose=False), # Terminal output
+     JSONReporter("results/eval.json"), # Data export
+     JUnitReporter("results/junit.xml"), # CI integration
+ ]
+
+ for reporter in reporters:
+     reporter.report(result)
+ ```
+
+ ## CI/CD Integration Examples
+
+ ### GitHub Actions
+
+ ```yaml
+ - name: Run evaluations
+   run: |
+     python run_evals.py
+
+ - name: Publish test results
+   uses: EnricoMi/publish-unit-test-result-action@v2
+   if: always()
+   with:
+     files: test-results/junit.xml
+ ```
+
+ ### GitLab CI
+
+ ```yaml
+ test:
+   script:
+     - python run_evals.py
+   artifacts:
+     when: always
+     reports:
+       junit: test-results/junit.xml
+ ```
+
+ ### Jenkins
+
+ ```groovy
+ stage('Test') {
+     steps {
+         sh 'python run_evals.py'
+     }
+     post {
+         always {
+             junit 'test-results/junit.xml'
+         }
+     }
+ }
+ ```
+
+ ## See Also
+
+ - Full demo: `examples/reporters_demo.py`
+ - Tests: `tests/test_evals/test_reporters.py`
+ - Documentation: `/REPORTERS_IMPLEMENTATION_SUMMARY.md`
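
The CI snippets in the reporters README all invoke a `run_evals.py` script that is left to the reader. Below is a minimal sketch of what such a script could look like, assuming only the reporter API shown in the Quick Start; the `my_project.evals` import is a hypothetical placeholder for however a project builds its suite and agent.

```python
# run_evals.py -- hypothetical driver script matching the CI examples above.
# Uses only the API shown in the Quick Start; the suite/agent import below
# is a placeholder for project-specific setup.
from prela.evals import EvalRunner
from prela.evals.reporters import ConsoleReporter, JSONReporter, JUnitReporter

from my_project.evals import agent, suite  # placeholder: your suite and agent

# Run the evaluation once, then fan the single result out to every reporter.
runner = EvalRunner(suite, agent)
result = runner.run()

for reporter in (
    ConsoleReporter(verbose=False),           # terminal summary
    JSONReporter("results/eval.json"),        # machine-readable artifact
    JUnitReporter("test-results/junit.xml"),  # consumed by the CI steps above
):
    reporter.report(result)
```

In CI you would typically also exit non-zero when the run reports failures, using whichever pass/fail field the result object exposes.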