agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/sdk.py ADDED
@@ -0,0 +1,433 @@
1
+ """High-level SDK for streaming agent traces to the agentevals UI.
2
+
3
+ Wraps OpenTelemetry, WebSocket, and processor boilerplate into a simple
4
+ context manager or decorator API.
5
+
6
+ Usage (context manager — primary API):
7
+
8
+ from agentevals import AgentEvals
9
+
10
+ app = AgentEvals()
11
+
12
+ with app.session(eval_set_id="my-eval"):
13
+ result = my_agent.invoke("Hello!")
14
+
15
+ Usage (decorator — shorthand for simple agents):
16
+
17
+ app = AgentEvals(eval_set_id="my-eval")
18
+
19
+ @app.agent
20
+ def my_agent(prompt):
21
+ return llm.invoke(prompt).content
22
+
23
+ app.run(["Hello!", "Tell me a joke"])
24
+
25
+ Disabling streaming:
26
+ Pass ``streaming=False`` to skip all WebSocket/OTel setup. The context
27
+ managers become no-ops and your agent code runs without any agentevals
28
+ connection. Useful for gating on an env var so the SDK stays wired up
29
+ in code but only streams when the dev server is running::
30
+
31
+ app = AgentEvals(streaming=os.getenv("AGENTEVALS_STREAM", "1") == "1")
32
+
33
+ Provider lifecycle:
34
+ The SDK adds an ``AgentEvalsStreamingProcessor`` to the active
35
+ ``TracerProvider`` for the duration of a session. After shutdown the
36
+ processor is inert (``on_end`` short-circuits) but remains registered
37
+ because OTel's ``TracerProvider`` has no ``remove_span_processor``
38
+ API. This is harmless for typical dev workflows. If you need a clean
39
+ provider between sessions, pass a fresh ``TracerProvider`` via the
40
+ ``tracer_provider`` parameter.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import asyncio
46
+ import inspect
47
+ import logging
48
+ import os
49
+ import threading
50
+ import uuid
51
+ from collections.abc import Callable
52
+ from contextlib import asynccontextmanager, contextmanager
53
+ from dataclasses import dataclass, field
54
+ from datetime import datetime
55
+ from typing import TYPE_CHECKING, Any
56
+
57
+ if TYPE_CHECKING:
58
+ from opentelemetry.sdk._logs import LoggerProvider
59
+ from opentelemetry.sdk.trace import TracerProvider as SdkTracerProvider
60
+
61
+ from .streaming.processor import AgentEvalsLogStreamingProcessor, AgentEvalsStreamingProcessor
62
+
63
# Only the AgentEvals facade is exported; everything else here is internal.
__all__ = ["AgentEvals"]

logger = logging.getLogger(__name__)

# WebSocket endpoint used when AgentEvals is constructed without ``ws_url``.
# The connection-error message in this module points users at
# ``agentevals serve --dev`` when this endpoint cannot be reached.
_DEFAULT_WS_URL = "ws://localhost:8001/ws/traces"
68
+
69
+
70
@dataclass(slots=True)
class _OtelSetup:
    """Bundle of the OTel objects produced by ``AgentEvals._setup_otel``.

    The session context managers use it to register the processors on their
    providers when a session starts and to flush the providers when it ends.
    """

    # Provider the span processor is attached to.
    tracer_provider: SdkTracerProvider
    # Span processor that connects to the agentevals WebSocket endpoint.
    processor: AgentEvalsStreamingProcessor
    # Both log fields stay ``None`` unless ``_setup_otel`` decided a log
    # pipeline is needed (OpenAI v2 instrumentor importable).
    logger_provider: LoggerProvider | None = field(default=None)
    log_processor: AgentEvalsLogStreamingProcessor | None = field(default=None)
76
+
77
+
78
class AgentEvals:
    """High-level SDK for streaming agent traces to the agentevals UI.

    An instance stores connection defaults (WebSocket URL, eval set ID,
    metadata) and offers two ways to stream:

    * ``session()`` / ``session_async()`` context managers wrapped around
      arbitrary agent code, and
    * the ``@app.agent`` decorator plus ``run()`` for simple
      prompt-in / result-out agents.
    """

    def __init__(
        self,
        ws_url: str = _DEFAULT_WS_URL,
        eval_set_id: str | None = None,
        metadata: dict[str, Any] | None = None,
        auto_instrument: bool = True,
        capture_message_content: bool = True,
        streaming: bool = True,
    ):
        """Store connection defaults; no network or OTel work happens here.

        Args:
            ws_url: WebSocket URL handed to the streaming processor.
            eval_set_id: Default evaluation set ID; per-call values override it.
            metadata: Default session metadata; per-call metadata is merged on top.
            auto_instrument: If True, try to activate known OTel instrumentors
                whenever a session is set up.
            capture_message_content: If True, default the
                ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`` env var
                to ``"true"`` during session setup (an existing value wins).
            streaming: If False, the session context managers yield without
                setting up OTel or opening a connection.
        """
        self.ws_url = ws_url
        self.eval_set_id = eval_set_id
        self.metadata = metadata or {}
        self.auto_instrument = auto_instrument
        self.capture_message_content = capture_message_content
        self.streaming = streaming

        self._agent_fn: Callable | None = None
        self._is_async: bool = False

    def agent(self, fn: Callable) -> Callable:
        """Decorator to register the agent entry point.

        The decorated function should accept a prompt string and return a result.
        Works with both sync and async functions. The function is returned
        unchanged, so it can still be called directly.
        """
        self._agent_fn = fn
        self._is_async = inspect.iscoroutinefunction(fn)
        return fn

    def run(
        self,
        prompts: list[str] | None = None,
        interactive: bool = False,
        eval_set_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Any]:
        """Run the registered agent with streaming enabled.

        Args:
            prompts: List of prompts to run sequentially.
            interactive: If True (and no prompts are given), enter a REPL loop
                reading from stdin.
            eval_set_id: Override the eval_set_id from __init__.
            metadata: Additional metadata merged with __init__ metadata.

        Returns:
            List of agent results.

        Raises:
            RuntimeError: If no agent was registered with ``@app.agent``.
        """
        if self._agent_fn is None:
            raise RuntimeError("No agent registered. Use @app.agent to register one.")

        eff_eval_set_id = eval_set_id or self.eval_set_id
        eff_metadata = {**self.metadata, **(metadata or {})}

        if self._is_async:
            return asyncio.run(self._run_async(prompts, interactive, eff_eval_set_id, eff_metadata))
        return self._run_sync(prompts, interactive, eff_eval_set_id, eff_metadata)

    # --- Context managers (the core value) ---

    @contextmanager
    def session(
        self,
        eval_set_id: str | None = None,
        session_name: str | None = None,
        metadata: dict[str, Any] | None = None,
        tracer_provider: SdkTracerProvider | None = None,
    ):
        """Sync context manager that sets up OTel streaming.

        The streaming processor's coroutines run on a private event loop
        hosted by a daemon thread; both are torn down when the block exits,
        even if flushing the providers fails.

        Args:
            eval_set_id: Evaluation set ID for matching against a golden session.
            session_name: Custom session name (auto-generated if omitted).
            metadata: Custom metadata sent with the session.
            tracer_provider: Explicit TracerProvider to use (e.g. from StrandsTelemetry).
                Falls back to the global provider, then creates a new one.

        Yields:
            The effective session name.

        Raises:
            ConnectionError: If the processor cannot connect within 10 seconds.
        """
        eff_session_name = session_name or self._generate_session_id()

        if not self.streaming:
            logger.debug("Streaming disabled, running without agentevals connection")
            yield eff_session_name
            return

        eff_eval_set_id = eval_set_id or self.eval_set_id
        eff_metadata = {**self.metadata, **(metadata or {})}

        setup = self._setup_otel(eff_session_name, tracer_provider)

        loop = asyncio.new_event_loop()
        thread = threading.Thread(
            target=lambda: (asyncio.set_event_loop(loop), loop.run_forever()),
            daemon=True,
        )
        thread.start()

        future = None
        try:
            future = asyncio.run_coroutine_threadsafe(
                setup.processor.connect(eval_set_id=eff_eval_set_id, metadata=eff_metadata),
                loop,
            )
            future.result(timeout=10)
        except Exception as exc:
            if future is not None:
                # On timeout the coroutine is still pending; ask the loop to drop it.
                future.cancel()
            # The processor was never registered, so nothing can use the loop afterwards.
            self._stop_background_loop(loop, thread, close=True)
            raise self._connection_error(exc) from exc

        try:
            self._register_processors(setup)
            logger.info("Streaming to %s (session: %s)", self.ws_url, eff_session_name)
            yield eff_session_name
        finally:
            shutdown_ok = False
            try:
                self._flush_providers(setup)
            finally:
                # Runs even when flushing raised, so the connection and the
                # loop thread are never left behind.
                try:
                    fut = asyncio.run_coroutine_threadsafe(setup.processor.shutdown_async(), loop)
                    fut.result(timeout=10)
                    shutdown_ok = True
                except Exception as exc:
                    logger.warning("Shutdown error: %s", exc)
                finally:
                    # Only close the loop after a clean shutdown: a processor
                    # that failed to shut down stays registered on the provider
                    # and might still try to schedule work on this loop.
                    self._stop_background_loop(loop, thread, close=shutdown_ok)

    @asynccontextmanager
    async def session_async(
        self,
        eval_set_id: str | None = None,
        session_name: str | None = None,
        metadata: dict[str, Any] | None = None,
        tracer_provider: SdkTracerProvider | None = None,
    ):
        """Async context manager that sets up OTel streaming.

        Args:
            eval_set_id: Evaluation set ID for matching against a golden session.
            session_name: Custom session name (auto-generated if omitted).
            metadata: Custom metadata sent with the session.
            tracer_provider: Explicit TracerProvider to use. Falls back to the global
                provider, then creates a new one.

        Yields:
            The effective session name.

        Raises:
            ConnectionError: If the processor cannot connect.
        """
        eff_session_name = session_name or self._generate_session_id()

        if not self.streaming:
            logger.debug("Streaming disabled, running without agentevals connection")
            yield eff_session_name
            return

        eff_eval_set_id = eval_set_id or self.eval_set_id
        eff_metadata = {**self.metadata, **(metadata or {})}

        setup = self._setup_otel(eff_session_name, tracer_provider)

        try:
            await setup.processor.connect(eval_set_id=eff_eval_set_id, metadata=eff_metadata)
        except Exception as exc:
            raise self._connection_error(exc) from exc

        try:
            self._register_processors(setup)
            logger.info("Streaming to %s (session: %s)", self.ws_url, eff_session_name)
            yield eff_session_name
        finally:
            try:
                self._flush_providers(setup)
            finally:
                # Always release the connection, even when flushing raised.
                try:
                    await setup.processor.shutdown_async()
                except Exception as exc:
                    logger.warning("Shutdown error: %s", exc)

    # --- Session plumbing shared by the sync and async context managers ---

    def _connection_error(self, exc: BaseException) -> ConnectionError:
        """Build the user-facing error raised when the connect step fails."""
        return ConnectionError(
            f"[agentevals] Could not connect to {self.ws_url}. Is 'agentevals serve --dev' running?\n {exc}"
        )

    @staticmethod
    def _register_processors(setup: _OtelSetup) -> None:
        """Attach the span processor (and log processor, if any) to their providers."""
        setup.tracer_provider.add_span_processor(setup.processor)
        if setup.logger_provider and setup.log_processor:
            setup.logger_provider.add_log_record_processor(setup.log_processor)

    @staticmethod
    def _flush_providers(setup: _OtelSetup) -> None:
        """Force-flush the tracer provider and, when present, the logger provider."""
        setup.tracer_provider.force_flush()
        if setup.logger_provider:
            setup.logger_provider.force_flush()

    @staticmethod
    def _stop_background_loop(
        loop: asyncio.AbstractEventLoop,
        thread: threading.Thread,
        *,
        close: bool,
    ) -> None:
        """Stop the loop thread started by ``session()`` and optionally close the loop.

        The loop is closed only when ``close`` is True and the thread has
        actually exited; closing a loop that is still running raises.
        """
        loop.call_soon_threadsafe(loop.stop)
        thread.join(timeout=5)
        if close and not thread.is_alive():
            loop.close()

    # --- Decorator run helpers ---

    async def _run_async(self, prompts, interactive, eval_set_id, metadata):
        """Run the async agent inside ``session_async`` and return its results."""
        async with self.session_async(eval_set_id=eval_set_id, metadata=metadata):
            return await self._execute_agent_async(prompts, interactive)

    async def _execute_agent_async(self, prompts, interactive):
        """Drive the async agent: fixed prompts, else a stdin REPL, else one bare call."""
        results = []
        if prompts:
            for i, prompt in enumerate(prompts, 1):
                print(f"[{i}/{len(prompts)}] > {prompt}")
                result = await self._agent_fn(prompt)
                print(f" {result}")
                results.append(result)
        elif interactive:
            while True:
                try:
                    # Blocking read on the event-loop thread; the lint warning
                    # for this is deliberately suppressed.
                    prompt = input("> ")  # noqa: ASYNC250
                except (EOFError, KeyboardInterrupt):
                    break
                result = await self._agent_fn(prompt)
                print(result)
                results.append(result)
        else:
            # No prompts and not interactive: call the agent once with no arguments.
            result = await self._agent_fn()
            results.append(result)
        return results

    def _run_sync(self, prompts, interactive, eval_set_id, metadata):
        """Run the sync agent inside ``session`` and return its results."""
        with self.session(eval_set_id=eval_set_id, metadata=metadata):
            return self._execute_agent_sync(prompts, interactive)

    def _execute_agent_sync(self, prompts, interactive):
        """Drive the sync agent: fixed prompts, else a stdin REPL, else one bare call."""
        results = []
        if prompts:
            for i, prompt in enumerate(prompts, 1):
                print(f"[{i}/{len(prompts)}] > {prompt}")
                result = self._agent_fn(prompt)
                print(f" {result}")
                results.append(result)
        elif interactive:
            while True:
                try:
                    prompt = input("> ")
                except (EOFError, KeyboardInterrupt):
                    break
                result = self._agent_fn(prompt)
                print(result)
                results.append(result)
        else:
            # No prompts and not interactive: call the agent once with no arguments.
            result = self._agent_fn()
            results.append(result)
        return results

    # --- Internal helpers ---

    def _setup_otel(
        self,
        session_name: str,
        explicit_tracer_provider: SdkTracerProvider | None = None,
    ) -> _OtelSetup:
        """Configure OTel providers and create a streaming processor.

        Provider resolution order:
        1. ``explicit_tracer_provider`` if given
        2. Existing global ``TracerProvider`` (e.g. set by StrandsTelemetry)
        3. New ``TracerProvider`` created and set globally

        A ``LoggerProvider`` is only created when the OpenAI OTel instrumentor
        is installed, since it's the only pattern that emits message content
        via OTel log records rather than span events.

        The processors are created but not registered here; the session
        context managers register them once the connection is up.
        """
        from opentelemetry import trace
        from opentelemetry.sdk.trace import TracerProvider

        from .streaming.processor import AgentEvalsLogStreamingProcessor, AgentEvalsStreamingProcessor

        if self.capture_message_content:
            # setdefault: an explicit value already in the environment wins.
            os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "true")

        if explicit_tracer_provider is not None:
            tracer_provider = explicit_tracer_provider
        else:
            tracer_provider = trace.get_tracer_provider()
            if not isinstance(tracer_provider, TracerProvider):
                # The global provider is not an SDK provider, so install one.
                tracer_provider = TracerProvider()
                trace.set_tracer_provider(tracer_provider)

        processor = AgentEvalsStreamingProcessor(
            ws_url=self.ws_url,
            session_id=session_name,
            trace_id=uuid.uuid4().hex,
        )

        logger_provider = None
        log_processor = None
        if self._should_setup_log_provider():
            try:
                from opentelemetry._logs import get_logger_provider, set_logger_provider
                from opentelemetry.sdk._logs import LoggerProvider

                existing_lp = get_logger_provider()
                if isinstance(existing_lp, LoggerProvider):
                    logger_provider = existing_lp
                else:
                    logger_provider = LoggerProvider()
                    set_logger_provider(logger_provider)

                log_processor = AgentEvalsLogStreamingProcessor(processor)
            except ImportError:
                # Logs SDK unavailable: continue with span streaming only.
                pass

        if self.auto_instrument:
            self._auto_instrument()

        return _OtelSetup(
            tracer_provider=tracer_provider,
            processor=processor,
            logger_provider=logger_provider,
            log_processor=log_processor,
        )

    def _should_setup_log_provider(self) -> bool:
        """Check whether the OpenAI OTel instrumentor is installed.

        Only the logs-based GenAI semconv pattern (used by
        ``opentelemetry-instrumentation-openai-v2``) requires a
        ``LoggerProvider``. Strands and ADK emit content via span
        events or native attributes and don't need one.
        """
        try:
            import opentelemetry.instrumentation.openai_v2  # noqa: F401

            return True
        except ImportError:
            return False

    def _auto_instrument(self) -> None:
        """Best-effort discovery and activation of OTel instrumentors.

        Silently skips anything that isn't installed. Safe to call
        multiple times — OTel instrumentors track their own state and
        ``instrument()`` is idempotent.
        """
        found_instrumentor = False

        try:
            from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor

            OpenAIInstrumentor().instrument()
            found_instrumentor = True
        except (ImportError, RuntimeError):
            pass

        try:
            import strands  # noqa: F401

            # Strands is only configured through this env var default; no
            # instrumentor object is activated for it here.
            os.environ.setdefault("OTEL_SEMCONV_STABILITY_OPT_IN", "gen_ai_latest_experimental")
            found_instrumentor = True
        except ImportError:
            pass

        if not found_instrumentor:
            logger.warning(
                "No OTel instrumentor found. LLM calls won't produce traces. "
                "Install one, e.g.: pip install opentelemetry-instrumentation-openai-v2"
            )

    def _generate_session_id(self) -> str:
        """Return ``session-<local timestamp>-<6 hex chars>`` as a default session name."""
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        return f"session-{timestamp}-{uuid.uuid4().hex[:6]}"
@@ -0,0 +1,120 @@
1
+ """Live streaming support for agentevals."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import uuid
8
+ from contextlib import asynccontextmanager
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
@asynccontextmanager
async def enable_streaming(
    ws_url: str = "ws://localhost:8001/ws/traces",
    eval_set_id: str | None = None,
    session_name: str | None = None,
):
    """Stream OTel spans to the agentevals dev server for the duration of the block.

    Connects a streaming span processor, attaches it to the current global
    tracer provider when that provider is an SDK ``TracerProvider``, yields
    the session ID, and shuts the processor down on exit.

    Usage:
        from agentevals.streaming import enable_streaming

        async with enable_streaming("ws://localhost:8001/ws/traces", eval_set_id="my-eval"):
            # Your agent code here
            agent.invoke("...")

    Args:
        ws_url: WebSocket URL of the agentevals dev server.
        eval_set_id: Optional eval set ID passed to the processor on connect.
        session_name: Optional session name; a random ``session-<8 hex>`` ID
            is generated when omitted.

    Raises:
        ImportError: If the OpenTelemetry SDK or the processor module cannot
            be imported (logged, then re-raised).
    """
    try:
        from opentelemetry import trace
        from opentelemetry.sdk.trace import TracerProvider

        from .processor import AgentEvalsStreamingProcessor
    except ImportError:
        logger.error("opentelemetry-sdk required for streaming. Install with: pip install opentelemetry-sdk websockets")
        raise

    session_id = session_name if session_name else f"session-{uuid.uuid4().hex[:8]}"

    streaming_processor = AgentEvalsStreamingProcessor(ws_url, session_id, uuid.uuid4().hex)
    await streaming_processor.connect(eval_set_id=eval_set_id)

    provider = trace.get_tracer_provider()
    if not isinstance(provider, TracerProvider):
        # Nothing to attach to; the session still opens, but spans will not
        # reach the processor through this provider.
        logger.warning(
            "No TracerProvider found. Streaming may not work. Ensure OpenTelemetry is configured in your agent."
        )
    else:
        provider.add_span_processor(streaming_processor)

    try:
        yield session_id
    finally:
        await streaming_processor.shutdown_async()
55
+
56
+
57
def enable_streaming_sync(
    ws_url: str = "ws://localhost:8001/ws/traces",
    eval_set_id: str | None = None,
    session_name: str | None = None,
):
    """Connect a streaming processor from synchronous code without managing its lifecycle.

    .. deprecated:: 0.2.0
        Use the async :func:`enable_streaming` context manager instead.
        This function installs a new event loop as the current loop and can
        interfere with existing async code.

    The caller is responsible for the event loop and for shutting the
    returned processor down.

    Args:
        ws_url: WebSocket URL of the agentevals dev server
        eval_set_id: Optional ID of eval set to use for evaluation
        session_name: Optional custom session name

    Returns:
        The connected AgentEvalsStreamingProcessor, or ``None`` when the
        OpenTelemetry SDK / processor imports fail (an error is logged).

    Warning:
        This function is deprecated and will be removed in a future version.
        Prefer using the async version for better compatibility.
    """
    import warnings

    # stacklevel=2 attributes the warning to the caller, so this must stay
    # in the function body rather than move into a helper.
    warnings.warn(
        "enable_streaming_sync is deprecated and will be removed in a future version. "
        "Use the async enable_streaming() context manager instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    try:
        from opentelemetry import trace
        from opentelemetry.sdk.trace import TracerProvider

        from .processor import AgentEvalsStreamingProcessor
    except ImportError:
        logger.error("opentelemetry-sdk required for streaming. Install with: pip install opentelemetry-sdk websockets")
        return None

    session_id = session_name if session_name else f"session-{uuid.uuid4().hex[:8]}"
    streaming_processor = AgentEvalsStreamingProcessor(ws_url, session_id, uuid.uuid4().hex)

    # A fresh loop becomes the current loop and is left open for the caller.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(streaming_processor.connect(eval_set_id=eval_set_id))

    provider = trace.get_tracer_provider()
    if isinstance(provider, TracerProvider):
        provider.add_span_processor(streaming_processor)

    status_lines = [
        "[agentevals] Connected to dev server",
        f"[agentevals] Session: {session_id}",
    ]
    if eval_set_id:
        status_lines.append(f"[agentevals] Eval set: {eval_set_id}")
    for status_line in status_lines:
        print(status_line)

    return streaming_processor
118
+
119
+
120
+ __all__ = ["enable_streaming", "enable_streaming_sync"]