agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
"""Incremental span processor for extracting conversation elements in real-time.
|
|
2
|
+
|
|
3
|
+
Processes OTLP spans as they arrive and extracts:
|
|
4
|
+
- User input from call_llm spans
|
|
5
|
+
- Tool calls from execute_tool spans
|
|
6
|
+
- Agent responses from call_llm spans
|
|
7
|
+
- Token usage information
|
|
8
|
+
|
|
9
|
+
This enables real-time display of agent execution progress without waiting for
|
|
10
|
+
session completion.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from ..extraction import (
|
|
18
|
+
extract_agent_response_from_attrs,
|
|
19
|
+
extract_token_usage_from_attrs,
|
|
20
|
+
extract_tool_call_from_attrs,
|
|
21
|
+
extract_tool_result_from_attrs,
|
|
22
|
+
extract_user_text_from_attrs,
|
|
23
|
+
flatten_otlp_attributes,
|
|
24
|
+
parse_tool_response_content,
|
|
25
|
+
)
|
|
26
|
+
from ..trace_attrs import (
|
|
27
|
+
ADK_INVOCATION_ID,
|
|
28
|
+
ADK_SCOPE_VALUE,
|
|
29
|
+
OTEL_GENAI_REQUEST_MODEL,
|
|
30
|
+
OTEL_GENAI_TOOL_NAME,
|
|
31
|
+
OTEL_SCOPE,
|
|
32
|
+
)
|
|
33
|
+
from ..utils.genai_messages import parse_json_attr
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _normalize_ts(raw_ts) -> float:
|
|
39
|
+
"""Normalize a nanosecond timestamp (string or int) to seconds."""
|
|
40
|
+
try:
|
|
41
|
+
ns = int(raw_ts)
|
|
42
|
+
except (TypeError, ValueError):
|
|
43
|
+
return 0.0
|
|
44
|
+
if ns > 1e15:
|
|
45
|
+
return ns / 1e9
|
|
46
|
+
return float(ns)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class IncrementalInvocationExtractor:
    """Extracts conversation elements from spans and logs as they arrive.

    The extractor is stateful: it remembers which user inputs, agent
    responses, tool calls and tool results have already been emitted so that
    repeated spans/logs do not produce duplicate updates.
    """

    def __init__(self):
        # Invocation ids for which a user_input update has been emitted.
        self.seen_user_input = set()
        # invocation_id -> set of tool call ids already emitted.
        self.seen_tool_calls = {}
        # Invocation ids for which an agent_response update has been emitted.
        self.seen_agent_response = set()
        # invocation_id -> list of LLM spans received for that invocation.
        self.llm_spans_by_invocation = {}
        # invocation_id -> {"inputTokens", "outputTokens", "model"} running totals.
        self.token_totals = {}
        # Most recent invocation id seen by process_span; used as a fallback
        # by process_log when a log carries no span_id.
        self.current_invocation_id = None
        self.seen_message_contents = set()  # Track message contents to avoid duplicates
        self.tool_names_by_id: dict[str, str] = {}  # tool_call_id -> tool_name

    @staticmethod
    def _span_seconds(span: dict, key: str) -> float:
        """Return the nanosecond timestamp stored under ``key`` as seconds.

        A missing key yields ``0.0``. A ``None`` or non-numeric value also
        yields ``0.0`` instead of raising, so one malformed timestamp cannot
        abort processing of the whole span.
        """
        try:
            return int(span.get(key, 0)) / 1e9
        except (TypeError, ValueError, OverflowError):
            return 0.0

    def process_span(self, span: dict) -> list[dict]:
        """Process a single OTLP span and return conversation updates to broadcast.

        Args:
            span: OTLP JSON span dictionary

        Returns:
            List of update events to broadcast via SSE
        """
        updates = []
        operation_name = span.get("name", "")

        attributes = flatten_otlp_attributes(span.get("attributes", []))

        is_adk = attributes.get(OTEL_SCOPE) == ADK_SCOPE_VALUE
        is_genai_llm = bool(attributes.get(OTEL_GENAI_REQUEST_MODEL))
        is_genai_tool = bool(attributes.get(OTEL_GENAI_TOOL_NAME))

        # Ignore spans that are neither ADK spans nor GenAI LLM/tool spans.
        if not (is_adk or is_genai_llm or is_genai_tool):
            return updates

        invocation_id = self._get_invocation_id(span, attributes)
        if not invocation_id:
            return updates

        self.current_invocation_id = invocation_id

        is_llm = operation_name.startswith("call_llm") or is_genai_llm
        if is_llm:
            self.llm_spans_by_invocation.setdefault(invocation_id, []).append(span)

            if invocation_id not in self.seen_user_input:
                user_text = extract_user_text_from_attrs(attributes)
                if user_text:
                    message_key = f"user:{user_text.strip()}"
                    if message_key not in self.seen_message_contents:
                        logger.debug("Extracted user input for invocation %s", invocation_id)
                        updates.append(
                            {
                                "type": "user_input",
                                "invocationId": invocation_id,
                                "text": user_text,
                                "timestamp": self._span_seconds(span, "startTimeUnixNano"),
                            }
                        )
                        self.seen_message_contents.add(message_key)
                        self.seen_user_input.add(invocation_id)

            agent_text = extract_agent_response_from_attrs(attributes)
            if agent_text and invocation_id not in self.seen_agent_response:
                message_key = f"agent:{agent_text.strip()}"
                if message_key not in self.seen_message_contents:
                    logger.debug("Extracted agent response for invocation %s", invocation_id)
                    updates.append(
                        {
                            "type": "agent_response",
                            "invocationId": invocation_id,
                            "text": agent_text,
                            "timestamp": self._span_seconds(span, "endTimeUnixNano"),
                        }
                    )
                    self.seen_message_contents.add(message_key)
                    self.seen_agent_response.add(invocation_id)

            in_toks, out_toks, model = extract_token_usage_from_attrs(attributes)
            if in_toks or out_toks:
                # The model recorded in the totals is the one reported by the
                # first span that carried token usage for this invocation.
                totals = self.token_totals.setdefault(
                    invocation_id,
                    {
                        "inputTokens": 0,
                        "outputTokens": 0,
                        "model": model,
                    },
                )
                totals["inputTokens"] += in_toks
                totals["outputTokens"] += out_toks

                logger.debug("Token update for %s: +%d input, +%d output", invocation_id, in_toks, out_toks)

                # The update carries this span's delta, not the running total.
                updates.append(
                    {
                        "type": "token_update",
                        "invocationId": invocation_id,
                        "inputTokens": in_toks,
                        "outputTokens": out_toks,
                        "model": model,
                    }
                )

        elif operation_name.startswith("execute_tool") or is_genai_tool:
            span_id = span.get("spanId", "")
            tool_call = extract_tool_call_from_attrs(attributes, operation_name, span_id=span_id)
            if tool_call:
                call_id = tool_call["id"]
                seen_calls = self.seen_tool_calls.setdefault(invocation_id, set())

                self.tool_names_by_id[call_id] = tool_call["name"]

                if call_id not in seen_calls:
                    updates.append(
                        {
                            "type": "tool_call",
                            "invocationId": invocation_id,
                            "toolCall": tool_call,
                            "timestamp": self._span_seconds(span, "startTimeUnixNano"),
                        }
                    )
                    seen_calls.add(call_id)

                tool_result = extract_tool_result_from_attrs(attributes)
                if tool_result:
                    updates.append(
                        {
                            "type": "tool_result",
                            "invocationId": invocation_id,
                            "toolCallId": call_id,
                            "toolName": tool_call["name"],
                            "response": tool_result["response"],
                            "isError": tool_result["isError"],
                            "timestamp": self._span_seconds(span, "endTimeUnixNano"),
                        }
                    )

        return updates

    def process_log(self, log_event: dict) -> list[dict]:
        """Process a GenAI log event and extract conversation updates.

        Args:
            log_event: Log event dict with event_name, body, attributes

        Returns:
            List of update events to broadcast via SSE
        """
        updates = []
        event_name = log_event.get("event_name", "")
        body = log_event.get("body", {})

        # Prefer the log's own span_id; otherwise attribute the log to the
        # invocation of the most recently processed span.
        invocation_id = log_event.get("span_id") or self.current_invocation_id
        if not invocation_id:
            return updates

        # Extract user messages (gen_ai.user.message)
        if event_name == "gen_ai.user.message":
            if isinstance(body, dict) and "content" in body:
                user_text = body["content"]
                message_key = f"user:{user_text.strip() if isinstance(user_text, str) else user_text}"
                if user_text and message_key not in self.seen_message_contents:
                    logger.debug("Extracted user input from log for invocation %s", invocation_id)
                    updates.append(
                        {
                            "type": "user_input",
                            "invocationId": invocation_id,
                            "text": user_text,
                            "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                        }
                    )
                    self.seen_message_contents.add(message_key)
                    self.seen_user_input.add(invocation_id)

        # Extract assistant messages (gen_ai.assistant.message or gen_ai.choice)
        elif event_name in ("gen_ai.assistant.message", "gen_ai.choice"):
            agent_text = None

            if isinstance(body, dict):
                # Check for direct content
                if "content" in body:
                    agent_text = body["content"]
                # Check for message.content (gen_ai.choice format)
                elif "message" in body and isinstance(body["message"], dict):
                    if "content" in body["message"]:
                        agent_text = body["message"]["content"]

            if agent_text:
                message_key = f"agent:{agent_text.strip() if isinstance(agent_text, str) else agent_text}"
                if message_key not in self.seen_message_contents:
                    logger.debug("Extracted agent response from log for invocation %s", invocation_id)
                    updates.append(
                        {
                            "type": "agent_response",
                            "invocationId": invocation_id,
                            "text": agent_text,
                            "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                        }
                    )
                    self.seen_message_contents.add(message_key)
                    self.seen_agent_response.add(invocation_id)

            # Extract tool calls from assistant message
            if isinstance(body, dict):
                tool_calls = None
                if "tool_calls" in body:
                    tool_calls = body["tool_calls"]
                elif "message" in body and isinstance(body["message"], dict) and "tool_calls" in body["message"]:
                    tool_calls = body["message"]["tool_calls"]

                if tool_calls and isinstance(tool_calls, list):
                    for tc in tool_calls:
                        if not isinstance(tc, dict):
                            continue

                        tool_id = tc.get("id", "unknown")
                        tool_key = f"tool:{tool_id}"

                        # "function" may be present but not a dict (e.g. None);
                        # in that case fall back to the flat "name" field
                        # instead of raising.
                        function = tc.get("function")
                        if isinstance(function, dict):
                            tc_name = function.get("name", "unknown")
                        else:
                            tc_name = tc.get("name", "unknown")
                        self.tool_names_by_id[tool_id] = tc_name

                        if tool_key not in self.seen_message_contents:
                            tool_call = {
                                "id": tool_id,
                                "name": tc_name,
                                "args": {},
                            }

                            if isinstance(function, dict) and "arguments" in function:
                                parsed = parse_json_attr(function["arguments"], "tool_call.function.arguments")
                                # Only dict-shaped arguments are kept; anything
                                # else leaves "args" as an empty dict.
                                if isinstance(parsed, dict):
                                    tool_call["args"] = parsed

                            logger.debug("Extracted tool call from log for invocation %s", invocation_id)
                            updates.append(
                                {
                                    "type": "tool_call",
                                    "invocationId": invocation_id,
                                    "toolCall": tool_call,
                                    "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                                }
                            )
                            self.seen_message_contents.add(tool_key)

                            self.seen_tool_calls.setdefault(invocation_id, set()).add(tool_id)

        # Extract tool results from gen_ai.tool.message logs
        elif event_name == "gen_ai.tool.message":
            if isinstance(body, dict):
                tool_id = body.get("id", "unknown")
                # Prefer the name in the log; otherwise use the name recorded
                # when the matching tool call was seen.
                tool_name = body.get("name") or self.tool_names_by_id.get(tool_id, "unknown")
                content = body.get("content")
                if content is not None:
                    response = parse_tool_response_content(content)
                    result_key = f"tool_result:{tool_id}"
                    if result_key not in self.seen_message_contents:
                        is_error = bool(response.get("isError", False))
                        updates.append(
                            {
                                "type": "tool_result",
                                "invocationId": invocation_id,
                                "toolCallId": tool_id,
                                "toolName": tool_name,
                                "response": response,
                                "isError": is_error,
                                "timestamp": _normalize_ts(log_event.get("timestamp", 0)),
                            }
                        )
                        self.seen_message_contents.add(result_key)

        return updates

    def _get_invocation_id(self, span: dict, attributes: dict) -> str | None:
        """Pick the id used to group a span's updates.

        Preference order: the ADK invocation id attribute, then the span's
        ``parentSpanId``, then the span's own ``spanId``.
        """
        invocation_id = attributes.get(ADK_INVOCATION_ID)
        if invocation_id:
            return invocation_id
        parent_span_id = span.get("parentSpanId")
        if parent_span_id:
            return parent_span_id
        return span.get("spanId")
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""OpenTelemetry SpanProcessor and LogRecordProcessor for streaming to agentevals dev server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import websockets
|
|
12
|
+
from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
|
|
13
|
+
from opentelemetry.trace import SpanKind
|
|
14
|
+
except ImportError:
|
|
15
|
+
websockets = None
|
|
16
|
+
ReadableSpan = None
|
|
17
|
+
SpanProcessor = None
|
|
18
|
+
SpanKind = None
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AgentEvalsStreamingProcessor:
    """OTel span processor that streams spans to agentevals dev server via WebSocket.

    ``connect`` must be awaited on the event loop that will own the
    WebSocket; ``on_end`` then schedules each send onto that loop with
    ``asyncio.run_coroutine_threadsafe``.
    """

    def __init__(self, ws_url: str, session_id: str, trace_id: str):
        """Store connection parameters; no network activity happens here.

        Raises:
            ImportError: If ``websockets`` or ``opentelemetry-sdk`` could not
                be imported.
        """
        if websockets is None or SpanProcessor is None:
            raise ImportError(
                "websockets and opentelemetry-sdk required for streaming. "
                "Install with: pip install websockets opentelemetry-sdk"
            )

        self.ws_url = ws_url
        self.session_id = session_id
        self.trace_id = trace_id
        self.websocket: Any | None = None
        self.loop: asyncio.AbstractEventLoop | None = None
        self._connected = False
        # Every converted span; cleared when the session ends.
        self._span_buffer: list[dict] = []
        # Spans whose send failed; retried once when the session ends.
        self._failed_spans: list[dict] = []
        # In-flight sends. These are the concurrent.futures.Future objects
        # returned by asyncio.run_coroutine_threadsafe, not asyncio Tasks.
        self._pending_sends: set[Any] = set()

    async def connect(self, eval_set_id: str | None = None, metadata: dict | None = None):
        """Open the WebSocket and announce the session with a ``session_start`` message.

        Failures are logged rather than raised; afterwards the processor stays
        disconnected and ``on_end`` skips spans.
        """
        websocket = None
        try:
            websocket = await websockets.connect(self.ws_url)
            self.websocket = websocket
            self.loop = asyncio.get_running_loop()

            await websocket.send(
                json.dumps(
                    {
                        "type": "session_start",
                        "session_id": self.session_id,
                        "trace_id": self.trace_id,
                        "eval_set_id": eval_set_id,
                        "metadata": metadata or {},
                    }
                )
            )

            self._connected = True
            logger.info("Connected to agentevals dev server: %s", self.session_id)

        except Exception as exc:
            logger.error("Failed to connect to agentevals server: %s", exc)
            self._connected = False
            # If the socket opened but the handshake message failed, close it
            # so the connection is not leaked.
            if websocket is not None:
                try:
                    await websocket.close()
                except Exception as close_exc:
                    logger.debug("Failed to close WebSocket after connect error: %s", close_exc)
                self.websocket = None

    def on_start(self, span: ReadableSpan, parent_context=None) -> None:
        """No-op; spans are only streamed once they end."""
        pass

    def on_end(self, span: ReadableSpan) -> None:
        """Convert a finished span to OTLP JSON and schedule it for sending."""
        if not self._connected or not self.websocket or not self.loop:
            logger.debug(f"Skipping span {span.name}: not connected")
            return

        try:
            otlp_span = self._span_to_otlp(span)
            self._span_buffer.append(otlp_span)

            future = asyncio.run_coroutine_threadsafe(self._send_span(otlp_span), self.loop)
            self._pending_sends.add(future)

            def handle_send_complete(fut):
                self._pending_sends.discard(fut)
                try:
                    fut.result()
                    logger.debug(f"Sent span: {span.name}")
                except Exception as exc:
                    logger.error(f"Failed to send span {span.name}: {exc}")
                    # Keep the span so it can be retried at session end.
                    self._failed_spans.append(otlp_span)

            future.add_done_callback(handle_send_complete)

        except Exception as exc:
            logger.warning("Failed to convert span: %s", exc)

    def shutdown(self) -> None:
        """No-op; use ``shutdown_async`` to end the session and close the socket."""
        pass

    async def shutdown_async(self) -> None:
        """End the session if connected: flush sends, send ``session_end``, close."""
        if self.websocket and self._connected:
            try:
                await self._send_session_end()
            except Exception as exc:
                logger.warning("Failed to shutdown cleanly: %s", exc)

    async def _send_span(self, otlp_span: dict) -> None:
        """Send one span message over the WebSocket.

        Raises:
            ConnectionError: If no WebSocket has been opened.
        """
        if not self.websocket:
            raise ConnectionError("WebSocket not connected")

        message = {
            "type": "span",
            "session_id": self.session_id,
            "span": otlp_span,
        }
        await self.websocket.send(json.dumps(message))
        logger.debug(f"Sent span: {otlp_span.get('name')}")

    async def _send_session_end(self) -> None:
        """Wait for in-flight sends, retry failed spans, send ``session_end`` and close.

        The WebSocket is closed and the processor marked disconnected even if
        an earlier step fails, so a failed ``session_end`` does not leak the
        connection.
        """
        websocket = self.websocket
        if not websocket:
            return

        try:
            if self._pending_sends:
                logger.info("Waiting for %d pending span sends to complete...", len(self._pending_sends))
                # Iterate over a copy: done-callbacks remove futures from the set.
                for future in list(self._pending_sends):
                    try:
                        await asyncio.wrap_future(future)
                    except Exception as exc:
                        logger.warning("Pending send failed during shutdown: %s", exc)
                logger.info("All pending sends completed")

            if self._failed_spans:
                logger.info("Retrying %d failed spans at shutdown", len(self._failed_spans))
                for otlp_span in self._failed_spans:
                    try:
                        await self._send_span(otlp_span)
                    except Exception as exc:
                        logger.error("Failed to send span even at shutdown: %s", exc)

            self._failed_spans.clear()
            self._span_buffer.clear()
            self._pending_sends.clear()

            await websocket.send(json.dumps({"type": "session_end", "session_id": self.session_id}))
        except Exception as exc:
            logger.error("Failed to send session_end: %s", exc)
        finally:
            # Mark disconnected first so on_end stops scheduling sends onto a
            # socket that is being closed.
            self._connected = False
            try:
                await websocket.close()
            except Exception as exc:
                logger.warning("Failed to close WebSocket: %s", exc)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report success without flushing; ``timeout_millis`` is unused."""
        return True

    def _span_to_otlp(self, span: ReadableSpan) -> dict:
        """Convert an SDK span into an OTLP-JSON-style dictionary.

        The instrumentation scope name/version are emitted as
        ``otel.scope.name`` / ``otel.scope.version`` attributes ahead of the
        span's own attributes.
        """
        scope_name = span.instrumentation_scope.name if span.instrumentation_scope else ""
        scope_version = span.instrumentation_scope.version if span.instrumentation_scope else ""

        attributes = []
        if scope_name:
            attributes.append({"key": "otel.scope.name", "value": {"stringValue": scope_name}})
        if scope_version:
            attributes.append({"key": "otel.scope.version", "value": {"stringValue": scope_version}})

        if span.attributes:
            for key, value in span.attributes.items():
                attributes.append(self._to_otlp_attribute(key, value))

        self._promote_genai_event_attributes(span, attributes)

        parent_span_id = None
        if span.parent and hasattr(span.parent, "span_id"):
            parent_span_id = format(span.parent.span_id, "016x")

        return {
            "traceId": format(span.context.trace_id, "032x"),
            "spanId": format(span.context.span_id, "016x"),
            "parentSpanId": parent_span_id,
            "name": span.name,
            # NOTE(review): this sends the Python SpanKind enum's raw value;
            # the OTLP SpanKind enum appears to be offset by one
            # (UNSPECIFIED=0, INTERNAL=1) - confirm what consumers expect.
            "kind": span.kind.value if span.kind else 1,
            "startTimeUnixNano": str(span.start_time),
            "endTimeUnixNano": str(span.end_time),
            "attributes": attributes,
            "status": {"code": span.status.status_code.value} if span.status else {},
        }

    def _promote_genai_event_attributes(self, span: ReadableSpan, attributes: list[dict]) -> None:
        """Promote OTel GenAI message attributes from span events to span attributes.

        OTel GenAI semantic convention frameworks may store message content
        (gen_ai.input.messages, gen_ai.output.messages) in span events rather
        than span attributes. This promotes those event attributes so downstream
        processors can access them uniformly from span attributes alone.

        ``attributes`` is modified in place. A key already present is never
        overwritten, so only the first occurrence of each key is promoted.
        """
        if not hasattr(span, "events") or not span.events:
            return

        from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES

        _genai_event_keys = {OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES}
        existing_keys = {a["key"] for a in attributes}

        for event in span.events:
            if not event.attributes:
                continue
            for key, value in event.attributes.items():
                if key in _genai_event_keys and key not in existing_keys:
                    attributes.append(self._to_otlp_attribute(key, value))
                    existing_keys.add(key)

    def _to_otlp_attribute(self, key: str, value: Any) -> dict:
        """Wrap a Python value as an OTLP ``{"key", "value"}`` attribute dict.

        ``bool`` is tested before ``int`` because ``bool`` is a subclass of
        ``int``. Any type other than bool/int/float is stringified with ``str()``.
        """
        if isinstance(value, bool):
            return {"key": key, "value": {"boolValue": value}}
        elif isinstance(value, int):
            return {"key": key, "value": {"intValue": value}}
        elif isinstance(value, float):
            return {"key": key, "value": {"doubleValue": value}}
        else:
            return {"key": key, "value": {"stringValue": str(value)}}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class AgentEvalsLogStreamingProcessor:
    """OTel log processor that streams GenAI logs to agentevals dev server via WebSocket.

    This processor shares the same WebSocket connection and session as AgentEvalsStreamingProcessor.
    It forwards log records whose event name starts with ``gen_ai.`` and ignores all others.
    """

    def __init__(self, span_processor: AgentEvalsStreamingProcessor):
        """Initialize with a reference to the span processor for shared connection."""
        self.span_processor = span_processor

    def on_emit(self, log_data):
        """Called when a log record is emitted.

        Non-GenAI records, and records emitted while the shared span
        processor is not connected, are dropped.
        """
        log_record = log_data.log_record

        # Only process GenAI message logs. getattr keeps a record without an
        # ``event_name`` attribute from raising into the code that emitted it.
        event_name = getattr(log_record, "event_name", None)
        if not isinstance(event_name, str) or not event_name.startswith("gen_ai."):
            return

        logger.info("Log emitted: event=%s", event_name)

        processor = self.span_processor
        if not processor._connected or not processor.websocket or not processor.loop:
            return

        try:
            log_json = {
                "event_name": event_name,
                "timestamp": str(log_record.timestamp) if log_record.timestamp else None,
                "body": log_record.body,
                "attributes": {},
            }

            if log_record.attributes:
                for key, value in log_record.attributes.items():
                    log_json["attributes"][key] = value

            # Schedule the send on the loop captured by the span processor.
            future = asyncio.run_coroutine_threadsafe(self._send_log(log_json), processor.loop)

            def handle_send_complete(fut):
                try:
                    fut.result()
                except Exception as exc:
                    logger.error(f"Failed to send log: {exc}")

            future.add_done_callback(handle_send_complete)

        except Exception as exc:
            logger.warning("Failed to process log: %s", exc)

    async def _send_log(self, log_json: dict) -> None:
        """Send one log message over the span processor's WebSocket.

        Raises:
            ConnectionError: If the span processor has no WebSocket.
        """
        if not self.span_processor.websocket:
            raise ConnectionError("WebSocket not connected")

        message = {
            "type": "log",
            "session_id": self.span_processor.session_id,
            "log": log_json,
        }
        await self.span_processor.websocket.send(json.dumps(message))

    def shutdown(self):
        """No-op; this processor does not close the shared connection."""
        pass

    def force_flush(self, timeout_millis: int = 30000):
        """Report success without flushing; ``timeout_millis`` is unused."""
        return True
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Trace session tracking for live streaming."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
|
|
8
|
+
MAX_SPANS_PER_SESSION = 10000
|
|
9
|
+
MAX_LOGS_PER_SESSION = 5000
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
|
|
13
|
+
class TraceSession:
|
|
14
|
+
"""Represents an active trace session from a streaming agent."""
|
|
15
|
+
|
|
16
|
+
session_id: str
|
|
17
|
+
trace_id: str
|
|
18
|
+
eval_set_id: str | None
|
|
19
|
+
spans: list[dict] = field(default_factory=list)
|
|
20
|
+
logs: list[dict] = field(default_factory=list)
|
|
21
|
+
started_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
22
|
+
is_complete: bool = False
|
|
23
|
+
completed_at: datetime | None = None
|
|
24
|
+
metadata: dict = field(default_factory=dict)
|
|
25
|
+
source: str = "websocket"
|
|
26
|
+
has_root_span: bool = False
|
|
27
|
+
trace_ids: set[str] = field(default_factory=set)
|
|
28
|
+
invocations: list[dict] = field(default_factory=list)
|
|
29
|
+
|
|
30
|
+
def can_accept_span(self) -> bool:
|
|
31
|
+
"""Check if session can accept another span without exceeding limits."""
|
|
32
|
+
return len(self.spans) < MAX_SPANS_PER_SESSION
|
|
33
|
+
|
|
34
|
+
def can_accept_log(self) -> bool:
|
|
35
|
+
"""Check if session can accept another log without exceeding limits."""
|
|
36
|
+
return len(self.logs) < MAX_LOGS_PER_SESSION
|