agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,337 @@
1
+ """Incremental span processor for extracting conversation elements in real-time.
2
+
3
+ Processes OTLP spans as they arrive and extracts:
4
+ - User input from call_llm spans
5
+ - Tool calls from execute_tool spans
6
+ - Agent responses from call_llm spans
7
+ - Token usage information
8
+
9
+ This enables real-time display of agent execution progress without waiting for
10
+ session completion.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+
17
+ from ..extraction import (
18
+ extract_agent_response_from_attrs,
19
+ extract_token_usage_from_attrs,
20
+ extract_tool_call_from_attrs,
21
+ extract_tool_result_from_attrs,
22
+ extract_user_text_from_attrs,
23
+ flatten_otlp_attributes,
24
+ parse_tool_response_content,
25
+ )
26
+ from ..trace_attrs import (
27
+ ADK_INVOCATION_ID,
28
+ ADK_SCOPE_VALUE,
29
+ OTEL_GENAI_REQUEST_MODEL,
30
+ OTEL_GENAI_TOOL_NAME,
31
+ OTEL_SCOPE,
32
+ )
33
+ from ..utils.genai_messages import parse_json_attr
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ def _normalize_ts(raw_ts) -> float:
39
+ """Normalize a nanosecond timestamp (string or int) to seconds."""
40
+ try:
41
+ ns = int(raw_ts)
42
+ except (TypeError, ValueError):
43
+ return 0.0
44
+ if ns > 1e15:
45
+ return ns / 1e9
46
+ return float(ns)
47
+
48
+
49
class IncrementalInvocationExtractor:
    """Extracts conversation elements from spans and logs as they arrive.

    The extractor is stateful. It records which invocations, tool calls and
    message contents have already produced an update, so an element that is
    seen more than once (for example in both a span and a log event) is only
    reported the first time.
    """

    def __init__(self):
        # Invocation ids for which a user_input update was emitted.
        self.seen_user_input = set()
        # invocation id -> set of tool call ids already reported.
        self.seen_tool_calls = {}
        # Invocation ids for which an agent_response update was emitted.
        self.seen_agent_response = set()
        # invocation id -> list of raw LLM spans received so far.
        self.llm_spans_by_invocation = {}
        # invocation id -> running {"inputTokens", "outputTokens", "model"}.
        self.token_totals = {}
        # Most recent invocation id seen on a span; fallback for log events.
        self.current_invocation_id = None
        # Keys such as "user:<text>", "agent:<text>", "tool:<id>",
        # "tool_result:<id>" used to avoid emitting duplicates.
        self.seen_message_contents = set()
        self.tool_names_by_id: dict[str, str] = {}  # tool_call_id -> tool_name

    @staticmethod
    def _span_time_seconds(span: dict, key: str) -> float:
        """Return the nanosecond timestamp stored under *key* as seconds.

        Missing, ``None`` or non-numeric values (e.g. the string "None")
        yield 0.0 instead of raising.
        """
        try:
            return int(span.get(key, 0)) / 1e9
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _tool_call_from_log_entry(tc: dict) -> dict:
        """Build an ``{"id", "name", "args"}`` dict from a logged tool call.

        The name is read from ``tc["function"]["name"]`` when ``function`` is
        a dict, otherwise from ``tc["name"]``. Arguments are taken from
        ``function["arguments"]`` only when they parse to a dict.
        """
        function = tc.get("function")
        if isinstance(function, dict):
            name = function.get("name", "unknown")
        else:
            # "function" missing, None, or malformed: fall back to flat form.
            function = None
            name = tc.get("name", "unknown")

        tool_call = {
            "id": tc.get("id", "unknown"),
            "name": name,
            "args": {},
        }

        if function is not None and "arguments" in function:
            parsed = parse_json_attr(function["arguments"], "tool_call.function.arguments")
            if isinstance(parsed, dict):
                tool_call["args"] = parsed

        return tool_call

    def process_span(self, span: dict) -> list[dict]:
        """Process a single OTLP span and return conversation updates to broadcast.

        Spans that are neither ADK-scoped nor carry a GenAI request model or
        tool name are ignored, as are spans with no resolvable invocation id.

        Args:
            span: OTLP JSON span dictionary

        Returns:
            List of update events to broadcast via SSE
        """
        operation_name = span.get("name", "")
        attributes = flatten_otlp_attributes(span.get("attributes", []))

        is_adk = attributes.get(OTEL_SCOPE) == ADK_SCOPE_VALUE
        is_genai_llm = bool(attributes.get(OTEL_GENAI_REQUEST_MODEL))
        is_genai_tool = bool(attributes.get(OTEL_GENAI_TOOL_NAME))
        if not (is_adk or is_genai_llm or is_genai_tool):
            return []

        invocation_id = self._get_invocation_id(span, attributes)
        if not invocation_id:
            return []

        self.current_invocation_id = invocation_id

        if operation_name.startswith("call_llm") or is_genai_llm:
            return self._llm_span_updates(span, attributes, invocation_id)
        if operation_name.startswith("execute_tool") or is_genai_tool:
            return self._tool_span_updates(span, attributes, operation_name, invocation_id)
        return []

    def _llm_span_updates(self, span: dict, attributes: dict, invocation_id: str) -> list[dict]:
        """Collect user input, agent response and token updates from an LLM span."""
        updates = []
        self.llm_spans_by_invocation.setdefault(invocation_id, []).append(span)

        if invocation_id not in self.seen_user_input:
            user_text = extract_user_text_from_attrs(attributes)
            if user_text:
                message_key = f"user:{user_text.strip()}"
                if message_key not in self.seen_message_contents:
                    logger.debug("Extracted user input for invocation %s", invocation_id)
                    updates.append(
                        {
                            "type": "user_input",
                            "invocationId": invocation_id,
                            "text": user_text,
                            "timestamp": self._span_time_seconds(span, "startTimeUnixNano"),
                        }
                    )
                    self.seen_message_contents.add(message_key)
                    self.seen_user_input.add(invocation_id)

        agent_text = extract_agent_response_from_attrs(attributes)
        if agent_text and invocation_id not in self.seen_agent_response:
            message_key = f"agent:{agent_text.strip()}"
            if message_key not in self.seen_message_contents:
                logger.debug("Extracted agent response for invocation %s", invocation_id)
                updates.append(
                    {
                        "type": "agent_response",
                        "invocationId": invocation_id,
                        "text": agent_text,
                        "timestamp": self._span_time_seconds(span, "endTimeUnixNano"),
                    }
                )
                self.seen_message_contents.add(message_key)
                self.seen_agent_response.add(invocation_id)

        in_toks, out_toks, model = extract_token_usage_from_attrs(attributes)
        if in_toks or out_toks:
            totals = self.token_totals.setdefault(
                invocation_id,
                {"inputTokens": 0, "outputTokens": 0, "model": model},
            )
            totals["inputTokens"] += in_toks
            totals["outputTokens"] += out_toks

            logger.debug("Token update for %s: +%d input, +%d output", invocation_id, in_toks, out_toks)

            # The update carries this span's delta, not the running total.
            updates.append(
                {
                    "type": "token_update",
                    "invocationId": invocation_id,
                    "inputTokens": in_toks,
                    "outputTokens": out_toks,
                    "model": model,
                }
            )

        return updates

    def _tool_span_updates(
        self, span: dict, attributes: dict, operation_name: str, invocation_id: str
    ) -> list[dict]:
        """Collect tool_call and tool_result updates from a tool execution span."""
        updates = []
        span_id = span.get("spanId", "")
        tool_call = extract_tool_call_from_attrs(attributes, operation_name, span_id=span_id)
        if not tool_call:
            return updates

        call_id = tool_call["id"]
        reported = self.seen_tool_calls.setdefault(invocation_id, set())
        self.tool_names_by_id[call_id] = tool_call["name"]

        if call_id not in reported:
            updates.append(
                {
                    "type": "tool_call",
                    "invocationId": invocation_id,
                    "toolCall": tool_call,
                    "timestamp": self._span_time_seconds(span, "startTimeUnixNano"),
                }
            )
            reported.add(call_id)

        tool_result = extract_tool_result_from_attrs(attributes)
        if tool_result:
            updates.append(
                {
                    "type": "tool_result",
                    "invocationId": invocation_id,
                    "toolCallId": call_id,
                    "toolName": tool_call["name"],
                    "response": tool_result["response"],
                    "isError": tool_result["isError"],
                    "timestamp": self._span_time_seconds(span, "endTimeUnixNano"),
                }
            )

        return updates

    def process_log(self, log_event: dict) -> list[dict]:
        """Process a GenAI log event and extract conversation updates.

        The invocation id is the event's ``span_id`` when present, otherwise
        the id of the most recently processed span. Events with neither are
        ignored.

        Args:
            log_event: Log event dict with event_name, body, attributes

        Returns:
            List of update events to broadcast via SSE
        """
        event_name = log_event.get("event_name", "")
        body = log_event.get("body", {})

        invocation_id = log_event.get("span_id") or self.current_invocation_id
        if not invocation_id:
            return []

        if event_name == "gen_ai.user.message":
            return self._user_log_updates(body, invocation_id, log_event)
        if event_name in ("gen_ai.assistant.message", "gen_ai.choice"):
            return self._assistant_log_updates(body, invocation_id, log_event)
        if event_name == "gen_ai.tool.message":
            return self._tool_result_log_updates(body, invocation_id, log_event)
        return []

    def _user_log_updates(self, body, invocation_id: str, log_event: dict) -> list[dict]:
        """Return a user_input update for a gen_ai.user.message event, if new."""
        updates = []
        if not (isinstance(body, dict) and "content" in body):
            return updates

        user_text = body["content"]
        message_key = f"user:{user_text.strip() if isinstance(user_text, str) else user_text}"
        if user_text and message_key not in self.seen_message_contents:
            logger.debug("Extracted user input from log for invocation %s", invocation_id)
            updates.append(
                {
                    "type": "user_input",
                    "invocationId": invocation_id,
                    "text": user_text,
                    "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                }
            )
            self.seen_message_contents.add(message_key)
            self.seen_user_input.add(invocation_id)
        return updates

    def _assistant_log_updates(self, body, invocation_id: str, log_event: dict) -> list[dict]:
        """Return agent_response / tool_call updates for an assistant log event."""
        updates = []
        if not isinstance(body, dict):
            return updates

        message = body.get("message")
        if not isinstance(message, dict):
            message = None

        # Direct content wins; otherwise use message.content (gen_ai.choice format).
        agent_text = None
        if "content" in body:
            agent_text = body["content"]
        elif message is not None and "content" in message:
            agent_text = message["content"]

        if agent_text:
            message_key = f"agent:{agent_text.strip() if isinstance(agent_text, str) else agent_text}"
            if message_key not in self.seen_message_contents:
                logger.debug("Extracted agent response from log for invocation %s", invocation_id)
                updates.append(
                    {
                        "type": "agent_response",
                        "invocationId": invocation_id,
                        "text": agent_text,
                        "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                    }
                )
                self.seen_message_contents.add(message_key)
                self.seen_agent_response.add(invocation_id)

        tool_calls = None
        if "tool_calls" in body:
            tool_calls = body["tool_calls"]
        elif message is not None and "tool_calls" in message:
            tool_calls = message["tool_calls"]

        if not (tool_calls and isinstance(tool_calls, list)):
            return updates

        for tc in tool_calls:
            if not isinstance(tc, dict):
                continue

            tool_call = self._tool_call_from_log_entry(tc)
            tool_id = tool_call["id"]
            tool_key = f"tool:{tool_id}"
            # Remember the name even for duplicates so later tool results
            # that omit it can still be labelled.
            self.tool_names_by_id[tool_id] = tool_call["name"]

            if tool_key in self.seen_message_contents:
                continue

            logger.debug("Extracted tool call from log for invocation %s", invocation_id)
            updates.append(
                {
                    "type": "tool_call",
                    "invocationId": invocation_id,
                    "toolCall": tool_call,
                    "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                }
            )
            self.seen_message_contents.add(tool_key)
            self.seen_tool_calls.setdefault(invocation_id, set()).add(tool_id)

        return updates

    def _tool_result_log_updates(self, body, invocation_id: str, log_event: dict) -> list[dict]:
        """Return a tool_result update for a gen_ai.tool.message event, if new."""
        updates = []
        if not isinstance(body, dict):
            return updates

        tool_id = body.get("id", "unknown")
        tool_name = body.get("name") or self.tool_names_by_id.get(tool_id, "unknown")
        content = body.get("content")
        if content is None:
            return updates

        result_key = f"tool_result:{tool_id}"
        if result_key in self.seen_message_contents:
            return updates

        response = parse_tool_response_content(content)
        # Only a dict response can carry an "isError" flag.
        is_error = bool(response.get("isError", False)) if isinstance(response, dict) else False
        updates.append(
            {
                "type": "tool_result",
                "invocationId": invocation_id,
                "toolCallId": tool_id,
                "toolName": tool_name,
                "response": response,
                "isError": is_error,
                "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
            }
        )
        self.seen_message_contents.add(result_key)
        return updates

    def _get_invocation_id(self, span: dict, attributes: dict) -> str | None:
        """Resolve the invocation id for a span.

        Preference order: the ADK invocation id attribute, then the parent
        span id, then the span's own id.
        """
        return attributes.get(ADK_INVOCATION_ID) or span.get("parentSpanId") or span.get("spanId")
@@ -0,0 +1,285 @@
1
+ """OpenTelemetry SpanProcessor and LogRecordProcessor for streaming to agentevals dev server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ from typing import Any
9
+
10
+ try:
11
+ import websockets
12
+ from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
13
+ from opentelemetry.trace import SpanKind
14
+ except ImportError:
15
+ websockets = None
16
+ ReadableSpan = None
17
+ SpanProcessor = None
18
+ SpanKind = None
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class AgentEvalsStreamingProcessor:
    """OTel span processor that streams spans to agentevals dev server via WebSocket.

    ``connect`` opens the WebSocket and captures the running event loop.
    ``on_end`` converts each finished span to an OTLP-JSON-style dict and
    schedules the send onto that loop with ``run_coroutine_threadsafe``.
    Sends that fail are kept and retried once by ``shutdown_async``.
    """

    def __init__(self, ws_url: str, session_id: str, trace_id: str):
        """Store connection parameters; no network activity happens here.

        Raises:
            ImportError: if ``websockets`` or ``opentelemetry-sdk`` is missing.
        """
        if websockets is None or SpanProcessor is None:
            raise ImportError(
                "websockets and opentelemetry-sdk required for streaming. "
                "Install with: pip install websockets opentelemetry-sdk"
            )

        self.ws_url = ws_url
        self.session_id = session_id
        self.trace_id = trace_id
        self.websocket: Any | None = None
        self.loop: asyncio.AbstractEventLoop | None = None
        self._connected = False
        self._span_buffer: list[dict] = []
        self._failed_spans: list[dict] = []
        # Holds the concurrent.futures.Future objects returned by
        # asyncio.run_coroutine_threadsafe (not asyncio.Task instances).
        self._pending_sends: set[Any] = set()

    async def connect(self, eval_set_id: str | None = None, metadata: dict | None = None):
        """Open the WebSocket and announce the session with ``session_start``.

        Failures are logged rather than raised; afterwards the processor is
        simply not connected and ``on_end`` skips spans.
        """
        try:
            self.websocket = await websockets.connect(self.ws_url)
            self.loop = asyncio.get_running_loop()

            await self.websocket.send(
                json.dumps(
                    {
                        "type": "session_start",
                        "session_id": self.session_id,
                        "trace_id": self.trace_id,
                        "eval_set_id": eval_set_id,
                        "metadata": metadata or {},
                    }
                )
            )

            self._connected = True
            logger.info("Connected to agentevals dev server: %s", self.session_id)

        except Exception as exc:
            logger.error("Failed to connect to agentevals server: %s", exc)
            self._connected = False
            # The socket may have opened before the handshake failed; close
            # it so a half-initialised connection is not leaked.
            stale, self.websocket = self.websocket, None
            if stale is not None:
                try:
                    await stale.close()
                except Exception as close_exc:
                    logger.debug("Ignoring error while closing failed connection: %s", close_exc)

    def on_start(self, span: ReadableSpan, parent_context=None) -> None:
        """No-op; spans are only streamed once they end."""
        pass

    def on_end(self, span: ReadableSpan) -> None:
        """Convert a finished span and schedule its send on the event loop."""
        if not self._connected or not self.websocket or not self.loop:
            logger.debug("Skipping span %s: not connected", span.name)
            return

        try:
            otlp_span = self._span_to_otlp(span)
            self._span_buffer.append(otlp_span)

            future = asyncio.run_coroutine_threadsafe(self._send_span(otlp_span), self.loop)
            self._pending_sends.add(future)

            def handle_send_complete(fut):
                self._pending_sends.discard(fut)
                try:
                    fut.result()
                    logger.debug("Sent span: %s", span.name)
                except Exception as exc:
                    logger.error("Failed to send span %s: %s", span.name, exc)
                    # Kept for one retry attempt at shutdown.
                    self._failed_spans.append(otlp_span)

            future.add_done_callback(handle_send_complete)

        except Exception as exc:
            logger.warning("Failed to convert span: %s", exc)

    def shutdown(self) -> None:
        """No-op; use ``shutdown_async`` to flush and close the connection."""
        pass

    async def shutdown_async(self) -> None:
        """Flush pending sends, send ``session_end`` and close the socket."""
        if self.websocket and self._connected:
            try:
                await self._send_session_end()
            except Exception as exc:
                logger.warning("Failed to shutdown cleanly: %s", exc)

    async def _send_span(self, otlp_span: dict) -> None:
        """Send one span message.

        Raises:
            ConnectionError: if there is no WebSocket.
        """
        if not self.websocket:
            raise ConnectionError("WebSocket not connected")

        message = {
            "type": "span",
            "session_id": self.session_id,
            "span": otlp_span,
        }
        await self.websocket.send(json.dumps(message))
        logger.debug("Sent span: %s", otlp_span.get("name"))

    async def _send_session_end(self) -> None:
        """Wait for in-flight sends, retry failures, then end the session."""
        try:
            if not self.websocket:
                return

            if self._pending_sends:
                logger.info("Waiting for %d pending span sends to complete...", len(self._pending_sends))
                # Iterate a snapshot: done-callbacks discard from the set.
                for future in list(self._pending_sends):
                    try:
                        await asyncio.wrap_future(future)
                    except Exception as exc:
                        logger.warning("Pending send failed during shutdown: %s", exc)
                logger.info("All pending sends completed")

            if self._failed_spans:
                logger.info("Retrying %d failed spans at shutdown", len(self._failed_spans))
                # Snapshot so callbacks appending meanwhile cannot disturb the loop.
                for otlp_span in list(self._failed_spans):
                    try:
                        await self._send_span(otlp_span)
                    except Exception as exc:
                        logger.error("Failed to send span even at shutdown: %s", exc)

            self._failed_spans.clear()
            self._span_buffer.clear()
            self._pending_sends.clear()

            await self.websocket.send(json.dumps({"type": "session_end", "session_id": self.session_id}))

            await self.websocket.close()
            self._connected = False
        except Exception as exc:
            logger.error("Failed to send session_end: %s", exc)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Always report success; sends are scheduled as spans end."""
        return True

    def _span_to_otlp(self, span: ReadableSpan) -> dict:
        """Convert an SDK span into an OTLP-JSON-style dict.

        The instrumentation scope name and version are added as
        ``otel.scope.name`` / ``otel.scope.version`` attributes, ahead of the
        span's own attributes.
        """
        scope = span.instrumentation_scope
        scope_name = scope.name if scope else ""
        scope_version = scope.version if scope else ""

        attributes = []
        if scope_name:
            attributes.append({"key": "otel.scope.name", "value": {"stringValue": scope_name}})
        if scope_version:
            attributes.append({"key": "otel.scope.version", "value": {"stringValue": scope_version}})

        if span.attributes:
            for key, value in span.attributes.items():
                attributes.append(self._to_otlp_attribute(key, value))

        self._promote_genai_event_attributes(span, attributes)

        parent_span_id = None
        if span.parent and hasattr(span.parent, "span_id"):
            parent_span_id = format(span.parent.span_id, "016x")

        # The OTel Python SpanKind enum is 0-based (INTERNAL=0, SERVER=1, ...)
        # while OTLP reserves 0 for UNSPECIFIED (INTERNAL=1, SERVER=2, ...),
        # so shift by one. A missing kind falls back to INTERNAL (1).
        kind = span.kind.value + 1 if span.kind is not None else 1

        return {
            "traceId": format(span.context.trace_id, "032x"),
            "spanId": format(span.context.span_id, "016x"),
            "parentSpanId": parent_span_id,
            "name": span.name,
            "kind": kind,
            # "or 0" avoids emitting the literal string "None" for unset times.
            "startTimeUnixNano": str(span.start_time or 0),
            "endTimeUnixNano": str(span.end_time or 0),
            "attributes": attributes,
            "status": {"code": span.status.status_code.value} if span.status else {},
        }

    def _promote_genai_event_attributes(self, span: ReadableSpan, attributes: list[dict]) -> None:
        """Promote OTel GenAI message attributes from span events to span attributes.

        OTel GenAI semantic convention frameworks may store message content
        (gen_ai.input.messages, gen_ai.output.messages) in span events rather
        than span attributes. This promotes those event attributes so downstream
        processors can access them uniformly from span attributes alone.

        ``attributes`` is extended in place. A key already present is never
        overwritten, so the first occurrence wins.
        """
        if not hasattr(span, "events") or not span.events:
            return

        from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES

        _genai_event_keys = {OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES}
        existing_keys = {a["key"] for a in attributes}

        for event in span.events:
            if not event.attributes:
                continue
            for key, value in event.attributes.items():
                if key in _genai_event_keys and key not in existing_keys:
                    attributes.append(self._to_otlp_attribute(key, value))
                    existing_keys.add(key)

    def _to_otlp_attribute(self, key: str, value: Any) -> dict:
        """Wrap a Python value as an OTLP key/value attribute dict.

        ``bool`` is tested before ``int`` because ``bool`` is an ``int``
        subclass. Anything that is not bool/int/float is stringified.
        """
        if isinstance(value, bool):
            return {"key": key, "value": {"boolValue": value}}
        if isinstance(value, int):
            return {"key": key, "value": {"intValue": value}}
        if isinstance(value, float):
            return {"key": key, "value": {"doubleValue": value}}
        return {"key": key, "value": {"stringValue": str(value)}}
219
+
220
+
221
class AgentEvalsLogStreamingProcessor:
    """OTel log processor that forwards GenAI log records to the agentevals dev server.

    It owns no connection of its own: the WebSocket, event loop and session id
    are all read from the AgentEvalsStreamingProcessor passed to the constructor.
    """

    def __init__(self, span_processor: AgentEvalsStreamingProcessor):
        """Keep a reference to the span processor whose connection is reused."""
        self.span_processor = span_processor

    def on_emit(self, log_data):
        """Forward a log record if its event name starts with ``gen_ai.``.

        Other records are ignored. Nothing is sent while the span processor
        is not connected.
        """
        record = log_data.log_record

        # Only GenAI message logs are of interest.
        if not (record.event_name and record.event_name.startswith("gen_ai.")):
            return

        logger.info(f"Log emitted: event={record.event_name}")

        transport = self.span_processor
        if not (transport._connected and transport.websocket and transport.loop):
            return

        try:
            if record.timestamp:
                stamp = str(record.timestamp)
            else:
                stamp = None

            payload = {
                "event_name": record.event_name,
                "timestamp": stamp,
                "body": record.body,
                "attributes": {},
            }

            if record.attributes:
                payload["attributes"].update(record.attributes.items())

            pending = asyncio.run_coroutine_threadsafe(self._send_log(payload), transport.loop)

            def _report_failure(done):
                # Surface send errors in the log; successes are silent.
                try:
                    done.result()
                except Exception as exc:
                    logger.error(f"Failed to send log: {exc}")

            pending.add_done_callback(_report_failure)

        except Exception as exc:
            logger.warning("Failed to process log: %s", exc)

    async def _send_log(self, log_json: dict) -> None:
        """Send one ``log`` message over the span processor's WebSocket.

        Raises:
            ConnectionError: if the span processor has no WebSocket.
        """
        transport = self.span_processor
        if not transport.websocket:
            raise ConnectionError("WebSocket not connected")

        envelope = {
            "type": "log",
            "session_id": transport.session_id,
            "log": log_json,
        }
        await transport.websocket.send(json.dumps(envelope))

    def shutdown(self):
        """No-op; the connection belongs to the span processor."""
        pass

    def force_flush(self, timeout_millis: int = 30000):
        """Always report success; nothing is buffered here."""
        return True
@@ -0,0 +1,36 @@
1
+ """Trace session tracking for live streaming."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import UTC, datetime
7
+
8
+ MAX_SPANS_PER_SESSION = 10000
9
+ MAX_LOGS_PER_SESSION = 5000
10
+
11
+
12
+ @dataclass
13
+ class TraceSession:
14
+ """Represents an active trace session from a streaming agent."""
15
+
16
+ session_id: str
17
+ trace_id: str
18
+ eval_set_id: str | None
19
+ spans: list[dict] = field(default_factory=list)
20
+ logs: list[dict] = field(default_factory=list)
21
+ started_at: datetime = field(default_factory=lambda: datetime.now(UTC))
22
+ is_complete: bool = False
23
+ completed_at: datetime | None = None
24
+ metadata: dict = field(default_factory=dict)
25
+ source: str = "websocket"
26
+ has_root_span: bool = False
27
+ trace_ids: set[str] = field(default_factory=set)
28
+ invocations: list[dict] = field(default_factory=list)
29
+
30
+ def can_accept_span(self) -> bool:
31
+ """Check if session can accept another span without exceeding limits."""
32
+ return len(self.spans) < MAX_SPANS_PER_SESSION
33
+
34
+ def can_accept_log(self) -> bool:
35
+ """Check if session can accept another log without exceeding limits."""
36
+ return len(self.logs) < MAX_LOGS_PER_SESSION