agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,806 @@
1
+ """WebSocket server for streaming OTel spans from agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import tempfile
9
+ from datetime import UTC, datetime, timedelta
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from fastapi import WebSocket, WebSocketDisconnect
14
+
15
+ from ..api.models import (
16
+ SessionInfo,
17
+ WSSessionCompleteEvent,
18
+ WSSessionStartedEvent,
19
+ WSSpanReceivedEvent,
20
+ )
21
+ from ..converter import convert_traces
22
+ from ..extraction import extract_token_usage_from_attrs, is_llm_span, parse_tool_response_content
23
+ from ..loader.base import Trace
24
+ from ..loader.otlp import OtlpJsonLoader
25
+ from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_REQUEST_MODEL
26
+ from ..utils.log_enrichment import enrich_spans_with_logs
27
+ from .incremental_processor import IncrementalInvocationExtractor
28
+ from .session import TraceSession
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ class StreamingTraceManager:
34
+ """Manages active trace sessions from WebSocket clients.
35
+
36
+ Args:
37
+ session_ttl_hours: How long to keep completed sessions in memory (default: 2 hours)
38
+ max_sessions: Maximum number of sessions to keep (default: 100)
39
+ completion_grace_seconds: Delay after root span before completing session (default: 3.0)
40
+ idle_timeout_seconds: Complete session after this many seconds of inactivity (default: 30.0)
41
+ reextraction_delay_seconds: Debounce delay for late-log re-extraction (default: 2.0)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ session_ttl_hours: int = 2,
47
+ max_sessions: int = 100,
48
+ completion_grace_seconds: float = 3.0,
49
+ idle_timeout_seconds: float = 30.0,
50
+ reextraction_delay_seconds: float = 2.0,
51
+ ):
52
+ self.sessions: dict[str, TraceSession] = {}
53
+ self.incremental_extractors: dict[str, IncrementalInvocationExtractor] = {}
54
+ self.sse_queues: list[asyncio.Queue] = []
55
+ self.session_ttl = timedelta(hours=session_ttl_hours)
56
+ self.max_sessions = max_sessions
57
+ self.completion_grace_seconds = completion_grace_seconds
58
+ self.idle_timeout_seconds = idle_timeout_seconds
59
+ self.reextraction_delay_seconds = reextraction_delay_seconds
60
+ self._cleanup_task: asyncio.Task | None = None
61
+ self._completion_timers: dict[str, asyncio.Task] = {}
62
+ self._idle_timers: dict[str, asyncio.Task] = {}
63
+ self._orphan_logs: list[dict] = []
64
+ self._orphan_log_max_age = timedelta(seconds=60)
65
+ self._active_session_for_name: dict[str, str] = {}
66
+
67
+ def register_sse_client(self) -> asyncio.Queue:
68
+ """Register a new SSE client and return its queue."""
69
+ queue: asyncio.Queue = asyncio.Queue()
70
+ self.sse_queues.append(queue)
71
+ return queue
72
+
73
+ def unregister_sse_client(self, queue: asyncio.Queue) -> None:
74
+ """Unregister an SSE client."""
75
+ if queue in self.sse_queues:
76
+ self.sse_queues.remove(queue)
77
+
78
+ def start_cleanup_task(self) -> None:
79
+ """Start the background task for cleaning up old sessions."""
80
+ if self._cleanup_task is None:
81
+ self._cleanup_task = asyncio.create_task(self._cleanup_old_sessions_loop())
82
+ logger.info("Started session cleanup task (TTL: %s, max: %d)", self.session_ttl, self.max_sessions)
83
+
84
+ async def shutdown(self) -> None:
85
+ """Gracefully shut down: close SSE clients and cancel background tasks."""
86
+ for queue in self.sse_queues:
87
+ queue.put_nowait(None)
88
+ pending = list(self._completion_timers.values()) + list(self._idle_timers.values())
89
+ if self._cleanup_task:
90
+ pending.append(self._cleanup_task)
91
+ self._cleanup_task = None
92
+ for task in pending:
93
+ task.cancel()
94
+ if pending:
95
+ await asyncio.gather(*pending, return_exceptions=True)
96
+ self._completion_timers.clear()
97
+ self._idle_timers.clear()
98
+
99
+ async def _cleanup_old_sessions_loop(self) -> None:
100
+ """Periodically clean up old sessions to prevent memory leak."""
101
+ while True:
102
+ try:
103
+ await asyncio.sleep(3600)
104
+ removed_count = self._cleanup_old_sessions()
105
+ if removed_count > 0:
106
+ logger.info("Cleaned up %d old sessions", removed_count)
107
+ except asyncio.CancelledError:
108
+ break
109
+ except Exception as exc:
110
+ logger.exception("Error in cleanup task: %s", exc)
111
+
112
+ def _cleanup_old_sessions(self) -> int:
113
+ """Remove sessions older than TTL or enforce max session limit.
114
+
115
+ Returns:
116
+ Number of sessions removed
117
+ """
118
+ now = datetime.now(UTC)
119
+ to_remove = []
120
+
121
+ for session_id, session in self.sessions.items():
122
+ age = now - session.started_at
123
+ if session.is_complete and age > self.session_ttl:
124
+ to_remove.append(session_id)
125
+
126
+ if len(self.sessions) - len(to_remove) > self.max_sessions:
127
+ sorted_sessions = sorted(
128
+ [(sid, s) for sid, s in self.sessions.items() if s.is_complete and sid not in to_remove],
129
+ key=lambda x: x[1].started_at,
130
+ )
131
+ excess_count = len(self.sessions) - len(to_remove) - self.max_sessions
132
+ for i in range(min(excess_count, len(sorted_sessions))):
133
+ to_remove.append(sorted_sessions[i][0])
134
+
135
+ for session_id in to_remove:
136
+ del self.sessions[session_id]
137
+ if session_id in self.incremental_extractors:
138
+ del self.incremental_extractors[session_id]
139
+ for key in (session_id, f"_reextract_{session_id}"):
140
+ if key in self._completion_timers:
141
+ self._completion_timers.pop(key).cancel()
142
+ if session_id in self._idle_timers:
143
+ self._idle_timers.pop(session_id).cancel()
144
+ logger.debug("Removed old session: %s", session_id)
145
+
146
+ cutoff = now - self._orphan_log_max_age
147
+ self._orphan_logs = [e for e in self._orphan_logs if e["buffered_at"] >= cutoff]
148
+
149
+ return len(to_remove)
150
+
151
+ async def broadcast_to_ui(self, event: dict) -> None:
152
+ """Broadcast event to all connected SSE clients."""
153
+ for queue in self.sse_queues:
154
+ try:
155
+ await queue.put(event)
156
+ except Exception as exc:
157
+ logger.warning("Failed to broadcast to SSE client: %s", exc)
158
+
159
+ def buffer_orphan_log(self, trace_id: str, session_name: str | None, log_event: dict) -> None:
160
+ """Buffer a log event that arrived before its session was created.
161
+
162
+ OTLP BatchLogRecordProcessor and BatchSpanProcessor flush independently.
163
+ Logs may arrive at /v1/logs before the first span arrives at /v1/traces,
164
+ at which point no session exists yet. These orphan logs are buffered and
165
+ replayed when the matching session is created.
166
+ """
167
+ self._orphan_logs.append(
168
+ {
169
+ "trace_id": trace_id,
170
+ "session_name": session_name,
171
+ "log_event": log_event,
172
+ "buffered_at": datetime.now(UTC),
173
+ }
174
+ )
175
+
176
+ def _replay_orphan_logs(self, session: TraceSession) -> list[dict]:
177
+ """Replay buffered orphan logs that match the given session.
178
+
179
+ Returns the replayed log events for further processing (e.g., incremental
180
+ extraction, broadcasting).
181
+ """
182
+ cutoff = datetime.now(UTC) - self._orphan_log_max_age
183
+ remaining = []
184
+ replayed = []
185
+
186
+ for entry in self._orphan_logs:
187
+ if entry["buffered_at"] < cutoff:
188
+ continue
189
+
190
+ matched = entry["trace_id"] in session.trace_ids or (
191
+ entry["session_name"] and self._active_session_for_name.get(entry["session_name"]) == session.session_id
192
+ )
193
+
194
+ if matched:
195
+ session.trace_ids.add(entry["trace_id"])
196
+ session.logs.append(entry["log_event"])
197
+ replayed.append(entry["log_event"])
198
+ else:
199
+ remaining.append(entry)
200
+
201
+ self._orphan_logs = remaining
202
+
203
+ if replayed:
204
+ logger.info(
205
+ "Replayed %d orphan logs into session %s",
206
+ len(replayed),
207
+ session.session_id,
208
+ )
209
+
210
+ return replayed
211
+
212
+ async def get_or_create_otlp_session(self, trace_id: str, metadata: dict) -> TraceSession:
213
+ """Get existing session for trace_id or create a new one (OTLP path).
214
+
215
+ Groups spans by session_name (from resource attributes), not by trace_id.
216
+ A single session can contain spans from multiple traces — this is common
217
+ with GenAI semconv instrumentation where each LLM call creates its own
218
+ independent trace.
219
+ """
220
+ session_name = metadata.get("session_name") or f"otlp-{trace_id[:12]}"
221
+
222
+ active_id = self._active_session_for_name.get(session_name)
223
+ if active_id:
224
+ active = self.sessions.get(active_id)
225
+ if active and not active.is_complete:
226
+ active.trace_ids.add(trace_id)
227
+ return active
228
+
229
+ existing = self.find_session_by_trace_id(trace_id)
230
+ if existing and existing.is_complete:
231
+ self._reopen_session(existing, trace_id, session_name)
232
+ return existing
233
+
234
+ session_id = session_name
235
+ if session_id in self.sessions:
236
+ counter = 2
237
+ while f"{session_name}-{counter}" in self.sessions:
238
+ counter += 1
239
+ session_id = f"{session_name}-{counter}"
240
+
241
+ session = TraceSession(
242
+ session_id=session_id,
243
+ trace_id=trace_id,
244
+ eval_set_id=metadata.get("eval_set_id"),
245
+ metadata={k: v for k, v in metadata.get("resource_attrs", {}).items() if not k.startswith("agentevals.")},
246
+ source="otlp",
247
+ trace_ids={trace_id},
248
+ )
249
+
250
+ self.sessions[session_id] = session
251
+ self._active_session_for_name[session_name] = session_id
252
+ self.incremental_extractors[session_id] = IncrementalInvocationExtractor()
253
+
254
+ replayed = self._replay_orphan_logs(session)
255
+ extractor = self.incremental_extractors.get(session_id)
256
+ if extractor and replayed:
257
+ for log_event in replayed:
258
+ updates = extractor.process_log(log_event)
259
+ for update in updates:
260
+ update["sessionId"] = session_id
261
+ await self.broadcast_to_ui(update)
262
+
263
+ await self.broadcast_to_ui(
264
+ WSSessionStartedEvent(
265
+ session=SessionInfo(
266
+ session_id=session_id,
267
+ trace_id=trace_id,
268
+ eval_set_id=metadata.get("eval_set_id"),
269
+ span_count=0,
270
+ is_complete=False,
271
+ started_at=session.started_at.isoformat(),
272
+ metadata=session.metadata,
273
+ ),
274
+ ).model_dump(by_alias=True)
275
+ )
276
+
277
+ logger.info("Auto-created OTLP session: %s (trace: %s)", session_id, trace_id)
278
+ return session
279
+
280
+ def schedule_session_completion(self, session_id: str) -> None:
281
+ """Schedule session completion after root span arrival.
282
+
283
+ Starts a 3-second grace period to allow late-arriving child spans
284
+ from the same OTLP batch to be included before finalizing.
285
+ """
286
+ if session_id in self._completion_timers:
287
+ self._completion_timers[session_id].cancel()
288
+
289
+ self._completion_timers[session_id] = asyncio.create_task(
290
+ self._delayed_complete(session_id, self.completion_grace_seconds)
291
+ )
292
+
293
+ def reset_idle_timer(self, session_id: str) -> None:
294
+ """Reset the idle timeout for an OTLP session.
295
+
296
+ Fallback completion after 30 seconds of no new spans or logs.
297
+ Primary completion uses root span detection (3-second grace period),
298
+ which handles most cases. This idle timeout catches edge cases like
299
+ agent crashes or traces that never emit a root span.
300
+ """
301
+ if session_id in self._idle_timers:
302
+ self._idle_timers[session_id].cancel()
303
+
304
+ self._idle_timers[session_id] = asyncio.create_task(
305
+ self._delayed_complete(session_id, self.idle_timeout_seconds)
306
+ )
307
+
308
+ def schedule_log_reextraction(self, session_id: str) -> None:
309
+ """Schedule re-extraction of invocations after late-arriving logs.
310
+
311
+ Logs from BatchLogRecordProcessor may arrive after span-triggered
312
+ session completion. This debounces re-extraction so multiple log
313
+ batches are coalesced into a single re-extraction pass.
314
+ """
315
+ key = f"_reextract_{session_id}"
316
+ if key in self._completion_timers:
317
+ self._completion_timers[key].cancel()
318
+
319
+ self._completion_timers[key] = asyncio.create_task(
320
+ self._delayed_reextract(session_id, self.reextraction_delay_seconds)
321
+ )
322
+
323
def _reopen_session(self, session: TraceSession, trace_id: str, session_name: str) -> None:
    """Put a completed session back into the active state.

    Used when more spans arrive for a trace_id that the session already
    contains (split-batch scenario: part of a turn's spans arrived before
    completion fired, the rest afterwards). The session is marked
    incomplete, registered again as the active session for
    ``session_name``, given a fresh incremental extractor, and its idle
    timer is restarted.
    """
    sid = session.session_id

    session.is_complete = False
    session.completed_at = None
    session.trace_ids.add(trace_id)

    self._active_session_for_name[session_name] = sid
    # Completion discarded the previous extractor, so a new one is created.
    self.incremental_extractors[sid] = IncrementalInvocationExtractor()
    self.reset_idle_timer(sid)

    logger.info(
        "Reopened session %s for trace %s (%d spans so far)",
        sid,
        trace_id,
        len(session.spans),
    )
345
+
346
+ async def _delayed_complete(self, session_id: str, delay: float) -> None:
347
+ await asyncio.sleep(delay)
348
+ await self._complete_otlp_session(session_id)
349
+
350
+ async def _delayed_reextract(self, session_id: str, delay: float) -> None:
351
+ await asyncio.sleep(delay)
352
+ await self._reextract_with_logs(session_id)
353
+
354
+ def find_session_by_trace_id(self, trace_id: str) -> TraceSession | None:
355
+ """Find a session that contains the given trace_id.
356
+
357
+ Matches both active and recently-completed sessions so that
358
+ late-arriving logs can still be associated with their session.
359
+ """
360
+ for session in self.sessions.values():
361
+ if trace_id in session.trace_ids:
362
+ return session
363
+ return None
364
+
365
+ async def _reextract_with_logs(self, session_id: str) -> None:
366
+ """Re-extract invocations after late logs arrive for a completed session."""
367
+ session = self.sessions.get(session_id)
368
+ if not session:
369
+ return
370
+
371
+ key = f"_reextract_{session_id}"
372
+ if key in self._completion_timers:
373
+ del self._completion_timers[key]
374
+
375
+ logger.info(
376
+ "Re-extracting invocations with %d late logs for session %s",
377
+ len(session.logs),
378
+ session_id,
379
+ )
380
+
381
+ invocations_data = await self._extract_invocations(session)
382
+ session.invocations = invocations_data
383
+
384
+ await self.broadcast_to_ui(
385
+ WSSessionCompleteEvent(
386
+ session_id=session_id,
387
+ invocations=invocations_data,
388
+ ).model_dump(by_alias=True)
389
+ )
390
+
391
+ async def _complete_otlp_session(self, session_id: str) -> None:
392
+ """Mark an OTLP session as complete and extract invocations.
393
+
394
+ Equivalent to the WebSocket 'session_end' handler. Idempotent — does
395
+ nothing if the session is already complete or missing.
396
+ """
397
+ session = self.sessions.get(session_id)
398
+ if not session or session.is_complete:
399
+ return
400
+
401
+ session.is_complete = True
402
+ session.completed_at = datetime.now(UTC)
403
+
404
+ for name, sid in list(self._active_session_for_name.items()):
405
+ if sid == session_id:
406
+ del self._active_session_for_name[name]
407
+ break
408
+
409
+ if session_id in self._completion_timers:
410
+ self._completion_timers.pop(session_id).cancel()
411
+ if session_id in self._idle_timers:
412
+ self._idle_timers.pop(session_id).cancel()
413
+
414
+ logger.info(
415
+ "OTLP session complete: %s (%d spans, %d logs)",
416
+ session_id,
417
+ len(session.spans),
418
+ len(session.logs),
419
+ )
420
+
421
+ invocations_data = await self._extract_invocations(session)
422
+ session.invocations = invocations_data
423
+
424
+ await self.broadcast_to_ui(
425
+ WSSessionCompleteEvent(
426
+ session_id=session_id,
427
+ invocations=invocations_data,
428
+ ).model_dump(by_alias=True)
429
+ )
430
+
431
+ if session_id in self.incremental_extractors:
432
+ del self.incremental_extractors[session_id]
433
+
434
+ async def handle_connection(self, websocket: WebSocket) -> None:
435
+ """Handle WebSocket connection from an agent.
436
+
437
+ Manages the lifecycle of a WebSocket connection, receiving span events
438
+ and broadcasting updates to connected UI clients.
439
+
440
+ Args:
441
+ websocket: The WebSocket connection to handle
442
+ """
443
+ await websocket.accept()
444
+ session_id = None
445
+
446
+ try:
447
+ async for message in websocket.iter_text():
448
+ event = json.loads(message)
449
+
450
+ if event["type"] == "session_start":
451
+ session_id = event["session_id"]
452
+ logger.info("Received session_start event: %s", session_id)
453
+
454
+ session = TraceSession(
455
+ session_id=session_id,
456
+ trace_id=event["trace_id"],
457
+ eval_set_id=event.get("eval_set_id"),
458
+ metadata=event.get("metadata", {}),
459
+ )
460
+ self.sessions[session_id] = session
461
+ self.incremental_extractors[session_id] = IncrementalInvocationExtractor()
462
+
463
+ broadcast_event = WSSessionStartedEvent(
464
+ session=SessionInfo(
465
+ session_id=session_id,
466
+ trace_id=event["trace_id"],
467
+ eval_set_id=event.get("eval_set_id"),
468
+ span_count=0,
469
+ is_complete=False,
470
+ started_at=session.started_at.isoformat(),
471
+ metadata=event.get("metadata", {}),
472
+ ),
473
+ ).model_dump(by_alias=True)
474
+ logger.info("Broadcasting session_started to %d SSE clients", len(self.sse_queues))
475
+ await self.broadcast_to_ui(broadcast_event)
476
+
477
+ logger.info("Session started: %s", session_id)
478
+
479
+ elif event["type"] == "span":
480
+ sid = event["session_id"]
481
+
482
+ if sid not in self.sessions:
483
+ logger.warning("Span for unknown session: %s", sid)
484
+ continue
485
+
486
+ session = self.sessions[sid]
487
+
488
+ if not session.can_accept_span():
489
+ logger.warning(
490
+ "Session %s has reached max span limit (%d), rejecting new span", sid, len(session.spans)
491
+ )
492
+ await websocket.send_json(
493
+ {
494
+ "type": "error",
495
+ "message": f"Session has reached maximum span limit ({len(session.spans)})",
496
+ }
497
+ )
498
+ continue
499
+
500
+ session.spans.append(event["span"])
501
+
502
+ extractor = self.incremental_extractors.get(sid)
503
+ if extractor:
504
+ updates = extractor.process_span(event["span"])
505
+ for update in updates:
506
+ update["sessionId"] = sid
507
+ await self.broadcast_to_ui(update)
508
+
509
+ await self.broadcast_to_ui(
510
+ WSSpanReceivedEvent(
511
+ session_id=sid,
512
+ span=event["span"],
513
+ ).model_dump(by_alias=True)
514
+ )
515
+
516
+ elif event["type"] == "log":
517
+ sid = event["session_id"]
518
+ log_event = event["log"]
519
+
520
+ if sid not in self.sessions:
521
+ logger.warning("Log for unknown session: %s", sid)
522
+ continue
523
+
524
+ session = self.sessions[sid]
525
+
526
+ if not session.can_accept_log():
527
+ logger.warning(
528
+ "Session %s has reached max log limit (%d), rejecting new log", sid, len(session.logs)
529
+ )
530
+ await websocket.send_json(
531
+ {"type": "error", "message": f"Session has reached maximum log limit ({len(session.logs)})"}
532
+ )
533
+ continue
534
+
535
+ session.logs.append(log_event)
536
+
537
+ extractor = self.incremental_extractors.get(sid)
538
+ if extractor:
539
+ updates = extractor.process_log(log_event)
540
+ for update in updates:
541
+ update["sessionId"] = sid
542
+ await self.broadcast_to_ui(update)
543
+ else:
544
+ logger.warning(f"No extractor found for session {sid}")
545
+
546
+ elif event["type"] == "session_end":
547
+ sid = event["session_id"]
548
+
549
+ if sid not in self.sessions:
550
+ logger.warning("End for unknown session: %s", sid)
551
+ continue
552
+
553
+ session = self.sessions[sid]
554
+ session.is_complete = True
555
+
556
+ logger.info("Session ended: %s (%d spans, %d logs)", sid, len(session.spans), len(session.logs))
557
+
558
+ invocations_data = await self._extract_invocations(session)
559
+ session.invocations = invocations_data
560
+
561
+ complete_event = WSSessionCompleteEvent(
562
+ session_id=sid,
563
+ invocations=invocations_data,
564
+ ).model_dump(by_alias=True)
565
+ logger.info("Broadcasting session_complete to %d SSE clients", len(self.sse_queues))
566
+ await self.broadcast_to_ui(complete_event)
567
+
568
+ if sid in self.incremental_extractors:
569
+ del self.incremental_extractors[sid]
570
+
571
+ await websocket.send_json({"type": "session_complete", "invocations": invocations_data})
572
+
573
+ except WebSocketDisconnect:
574
+ if session_id and session_id in self.sessions:
575
+ if not self.sessions[session_id].is_complete:
576
+ logger.warning("Client disconnected without ending session: %s", session_id)
577
+ else:
578
+ logger.info("Client disconnected after session end: %s", session_id)
579
+
580
async def _save_spans_to_temp_file(self, session: TraceSession) -> Path:
    """Save spans to a temporary OTLP JSONL file.

    The spans are first enriched with the session's logs. Every span is
    written on its own line with ``traceId`` overwritten by the session's
    trace_id. The file lives in the system temp directory and is named
    after the session id; characters other than letters, digits, ``.``,
    ``_`` and ``-`` are replaced by ``_`` because session ids are supplied
    by clients and must not be able to steer the path (e.g. ``../``).

    Args:
        session: The trace session containing spans to save

    Returns:
        Path to the temporary JSONL file containing the spans
    """
    import re  # local import: not needed elsewhere in this module

    safe_id = re.sub(r"[^A-Za-z0-9._-]", "_", str(session.session_id))
    temp_file = Path(tempfile.gettempdir()) / f"agentevals_{safe_id}.jsonl"

    enriched_spans = enrich_spans_with_logs(session.spans, session.logs, session.session_id)

    with open(temp_file, "w", encoding="utf-8") as f:  # noqa: ASYNC230
        for span in enriched_spans:
            span_copy = span.copy()
            span_copy["traceId"] = session.trace_id
            f.write(json.dumps(span_copy) + "\n")

    return temp_file
600
+
601
async def _extract_invocations(self, session: TraceSession) -> list[dict]:
    """Extract invocations from session spans for UI display.

    The spans are enriched with the session's logs, written to a temporary
    JSONL file, loaded through ``OtlpJsonLoader`` and converted with
    ``convert_traces``. The temporary file is removed before returning.
    Any failure is logged and results in an empty list.

    Args:
        session: The trace session containing spans to extract invocations from

    Returns:
        List of invocation dictionaries with the following structure:
        - invocationId: Unique identifier for the invocation
        - userText: User's input text
        - agentText: Agent's response text
        - toolCalls: List of tool calls with name, args and id
        - toolResponses: List of tool responses with name, response and id
        - modelInfo: Model metadata (model names, token counts)
    """
    temp_name: str | None = None
    try:
        genai_keys = (OTEL_GENAI_REQUEST_MODEL, OTEL_GENAI_INPUT_MESSAGES)
        has_genai_spans = any(
            any(attr.get("key") in genai_keys for attr in span.get("attributes") or [])
            for span in session.spans
        )

        if has_genai_spans and not session.logs:
            logger.warning(
                "Session %s has GenAI spans but no logs. "
                "Message content will be missing unless spans already enriched.",
                session.session_id,
            )

        enriched_spans = enrich_spans_with_logs(session.spans, session.logs, session.session_id)

        # delete=False because the loader reopens the file by name; the
        # ``finally`` clause below removes it.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as temp_file:
            temp_name = temp_file.name
            for span in enriched_spans:
                span_copy = span.copy()
                # Every span is filed under the session's trace_id.
                span_copy["traceId"] = session.trace_id
                temp_file.write(json.dumps(span_copy) + "\n")

        logger.debug("Saved %d enriched spans to %s", len(enriched_spans), temp_name)

        loader = OtlpJsonLoader()
        traces = loader.load(temp_name)

        if not traces:
            logger.warning("No traces loaded from session %s", session.session_id)
            return []

        logger.debug("Loaded %d traces", len(traces))

        conversion_results = convert_traces(traces)

        if not conversion_results:
            logger.warning("No conversion results")
            return []

        invocations_data: list[dict] = []

        for trace_idx, conv_result in enumerate(conversion_results):
            if conv_result.warnings:
                logger.warning("Conversion warnings: %s", conv_result.warnings)

            # Conversion results are matched to traces by position.
            trace = traces[trace_idx] if trace_idx < len(traces) else None

            for inv_idx, inv in enumerate(conv_result.invocations):
                model_info = self._extract_model_info_from_trace(trace, inv_idx) if trace else {}
                invocations_data.append(self._invocation_to_ui_dict(inv, model_info))

        logger.debug("Extracted %d invocations from %d traces", len(invocations_data), len(conversion_results))

        self._augment_tool_responses_from_logs(invocations_data, session)

        return invocations_data

    except Exception:
        logger.exception("Failed to extract invocations")
        return []

    finally:
        if temp_name is not None:
            try:
                Path(temp_name).unlink(missing_ok=True)
            except OSError:
                logger.debug("Could not remove temporary file %s", temp_name, exc_info=True)

@staticmethod
def _invocation_to_ui_dict(inv: Any, model_info: dict) -> dict:
    """Flatten one converted invocation into the dict shape sent to the UI."""
    user_text = ""
    if inv.user_content and inv.user_content.parts:
        user_text = " ".join(p.text for p in inv.user_content.parts if p.text)

    agent_text = ""
    if inv.final_response and inv.final_response.parts:
        agent_text = "".join(p.text for p in inv.final_response.parts if p.text)

    tool_calls: list[dict] = []
    tool_responses: list[dict] = []
    intermediate = inv.intermediate_data
    if intermediate and intermediate.tool_uses:
        tool_calls = [
            {
                "name": tool_use.name,
                "args": getattr(tool_use, "args", {}),
                "id": getattr(tool_use, "id", None),
            }
            for tool_use in intermediate.tool_uses
        ]
    if intermediate and intermediate.tool_responses:
        tool_responses = [
            {
                "name": tr.name,
                "response": getattr(tr, "response", {}),
                "id": getattr(tr, "id", None),
            }
            for tr in intermediate.tool_responses
        ]

    return {
        "invocationId": inv.invocation_id,
        "userText": user_text,
        "agentText": agent_text,
        "toolCalls": tool_calls,
        "toolResponses": tool_responses,
        "modelInfo": model_info,
    }
727
+
728
+ def _extract_model_info_from_trace(self, trace: Trace, invocation_idx: int) -> dict:
729
+ """Extract model information from LLM spans in the trace."""
730
+ model_info: dict[str, Any] = {}
731
+ models_used: set[str] = set()
732
+ total_input_tokens = 0
733
+ total_output_tokens = 0
734
+
735
+ llm_spans = [s for s in trace.all_spans if is_llm_span(s) or "call_llm" in s.operation_name]
736
+
737
+ for span in llm_spans:
738
+ in_toks, out_toks, model = extract_token_usage_from_attrs(span.tags)
739
+ if model and model != "unknown":
740
+ models_used.add(model)
741
+ else:
742
+ genai_model = span.get_tag(OTEL_GENAI_REQUEST_MODEL)
743
+ if genai_model:
744
+ models_used.add(genai_model)
745
+ total_input_tokens += in_toks
746
+ total_output_tokens += out_toks
747
+
748
+ if models_used:
749
+ model_info["models"] = list(models_used)
750
+ if total_input_tokens > 0:
751
+ model_info["inputTokens"] = total_input_tokens
752
+ if total_output_tokens > 0:
753
+ model_info["outputTokens"] = total_output_tokens
754
+
755
+ return model_info
756
+
757
+ @staticmethod
758
+ def _augment_tool_responses_from_logs(invocations_data: list[dict], session: TraceSession) -> None:
759
+ """Fill in missing tool responses from session logs (e.g. LangChain gen_ai.tool.message)."""
760
+ if not session.logs:
761
+ return
762
+
763
+ needs_responses = any(inv.get("toolCalls") and not inv.get("toolResponses") for inv in invocations_data)
764
+ if not needs_responses:
765
+ return
766
+
767
+ tool_names: dict[str, str] = {}
768
+ for inv in invocations_data:
769
+ for tc in inv.get("toolCalls", []):
770
+ tc_id = tc.get("id")
771
+ if tc_id:
772
+ tool_names[tc_id] = tc["name"]
773
+
774
+ tool_results_by_span: dict[str, list[dict]] = {}
775
+ for log_event in session.logs:
776
+ if log_event.get("event_name") != "gen_ai.tool.message":
777
+ continue
778
+ body = log_event.get("body", {})
779
+ if not isinstance(body, dict):
780
+ continue
781
+ span_id = log_event.get("span_id", "")
782
+ tool_id = body.get("id", "")
783
+ content = body.get("content")
784
+ if content is None:
785
+ continue
786
+
787
+ response = parse_tool_response_content(content)
788
+ tool_results_by_span.setdefault(span_id, []).append(
789
+ {
790
+ "name": body.get("name") or tool_names.get(tool_id, "unknown"),
791
+ "response": response,
792
+ "id": tool_id,
793
+ }
794
+ )
795
+
796
+ if not tool_results_by_span:
797
+ return
798
+
799
+ for inv in invocations_data:
800
+ if inv.get("toolResponses"):
801
+ continue
802
+ inv_id = inv.get("invocationId", "")
803
+ bare_span_id = inv_id.removeprefix("genai-")
804
+ responses = tool_results_by_span.get(bare_span_id, [])
805
+ if responses:
806
+ inv["toolResponses"] = responses