agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,444 @@
1
+ """Shared extraction functions and format-aware extractor strategy.
2
+
3
+ Provides:
4
+ - Pure functions that extract user text, agent responses, token usage, and tool
5
+ calls from flat attribute dictionaries (usable by both Span-based converters
6
+ and the raw-OTLP-dict incremental processor).
7
+ - Span classification predicates (is_llm_span, is_tool_span, etc.).
8
+ - A lightweight TraceFormatExtractor protocol with ADK and GenAI implementations,
9
+ plus a get_extractor() dispatcher for trace-level format selection.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from typing import Any, Protocol
17
+
18
+ from .loader.base import Span, Trace
19
+ from .trace_attrs import (
20
+ ADK_LLM_REQUEST,
21
+ ADK_LLM_RESPONSE,
22
+ ADK_SCOPE_VALUE,
23
+ ADK_TOOL_CALL_ARGS,
24
+ ADK_TOOL_RESPONSE,
25
+ OTEL_GENAI_INPUT_MESSAGES,
26
+ OTEL_GENAI_OP,
27
+ OTEL_GENAI_OUTPUT_MESSAGES,
28
+ OTEL_GENAI_REQUEST_MODEL,
29
+ OTEL_GENAI_TOOL_CALL_ARGUMENTS,
30
+ OTEL_GENAI_TOOL_CALL_ID,
31
+ OTEL_GENAI_TOOL_CALL_RESULT,
32
+ OTEL_GENAI_TOOL_NAME,
33
+ OTEL_GENAI_USAGE_INPUT_TOKENS,
34
+ OTEL_GENAI_USAGE_OUTPUT_TOKENS,
35
+ OTEL_SCOPE,
36
+ )
37
+ from .utils.genai_messages import (
38
+ ASSISTANT_ROLES,
39
+ USER_ROLES,
40
+ extract_text_from_message,
41
+ extract_tool_call_args_from_messages,
42
+ parse_json_attr,
43
+ )
44
+
45
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Size of the leading slice of ``trace.all_spans`` that the extractors' detect()
# methods examine first; spans past this index are examined in a second pass.
FORMAT_DETECTION_SPAN_LIMIT = 10
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Pure extraction functions (operate on flat attribute dicts)
51
+ # ---------------------------------------------------------------------------
52
+
53
+
54
def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None:
    """Return the user's input text found in a span's attributes.

    The ADK LLM-request attribute is consulted first; the GenAI input-messages
    attribute is only used when the ADK path returns nothing.

    Args:
        attrs: Flat mapping of span attribute names to values.

    Returns:
        For ADK: the ``text`` parts of the last ``user`` content that has any,
        joined with single spaces. For GenAI: the first non-empty result of
        ``extract_text_from_message`` over the messages (newest first) whose
        role is in ``USER_ROLES``. ``None`` when neither source yields text.
    """
    adk_request_raw = attrs.get(ADK_LLM_REQUEST)
    adk_request = parse_json(adk_request_raw) if adk_request_raw else None
    if isinstance(adk_request, dict):
        # Newest-first scan for a user content that carries at least one text part.
        for entry in reversed(adk_request.get("contents", [])):
            if entry.get("role") == "user":
                with_text = [part for part in entry.get("parts", []) if "text" in part]
                if with_text:
                    return " ".join(part["text"] for part in with_text)

        # Oldest-first second pass over user contents that have any parts.
        # NOTE: the scan above already returned for every user content holding
        # a text part, so the join below can only produce an empty string.
        for entry in adk_request.get("contents", []):
            if entry.get("role") != "user":
                continue
            entry_parts = entry.get("parts", [])
            if entry_parts:
                return " ".join(part.get("text", "") for part in entry_parts if "text" in part)

    genai_raw = attrs.get(OTEL_GENAI_INPUT_MESSAGES)
    if not genai_raw:
        return None

    genai_messages = parse_json_attr(genai_raw, "gen_ai.input.messages")
    if not isinstance(genai_messages, list):
        return None

    for message in reversed(genai_messages):
        if isinstance(message, dict) and message.get("role") in USER_ROLES:
            found = extract_text_from_message(message)
            if found:
                return found

    return None
84
+
85
+
86
def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
    """Return the agent's response text found in a span's attributes.

    The ADK LLM-response attribute is consulted first; the GenAI output-messages
    attribute is the fallback.

    Args:
        attrs: Flat mapping of span attribute names to values.

    Returns:
        For ADK: the ``text`` parts of the response ``content`` joined with
        single spaces. For GenAI: the first non-empty result of
        ``extract_text_from_message`` over the messages (in order) whose role is
        in ``ASSISTANT_ROLES``. ``None`` when neither source yields text.
    """
    adk_raw = attrs.get(ADK_LLM_RESPONSE)
    adk_response = parse_json(adk_raw) if adk_raw else None
    if isinstance(adk_response, dict):
        content = adk_response.get("content", {})
        if content:
            with_text = [part for part in content.get("parts", []) if "text" in part]
            if with_text:
                return " ".join(part["text"] for part in with_text)

    genai_raw = attrs.get(OTEL_GENAI_OUTPUT_MESSAGES)
    if not genai_raw:
        return None

    genai_messages = parse_json_attr(genai_raw, "gen_ai.output.messages")
    if not isinstance(genai_messages, list):
        return None

    # Unlike the user-text lookup, output messages are scanned in original order.
    for message in genai_messages:
        if isinstance(message, dict) and message.get("role") in ASSISTANT_ROLES:
            found = extract_text_from_message(message)
            if found:
                return found

    return None
110
+
111
+
112
def extract_token_usage_from_attrs(
    attrs: dict[str, Any],
) -> tuple[int, int, str]:
    """Extract (input_tokens, output_tokens, model) from attributes, ADK-first.

    Lookup order:
      1. ``usage_metadata`` inside the ADK LLM-response attribute
         (``prompt_token_count`` / ``candidates_token_count``). When this path
         yields a non-zero count, the model name is taken from the ADK
         LLM-request attribute if it carries a non-empty ``model``.
      2. The GenAI usage token attributes, used only when both are numeric.

    Args:
        attrs: Flat mapping of span attribute names to values.

    Returns:
        ``(input_tokens, output_tokens, model)``. Counts default to ``0``; the
        model defaults to the GenAI request-model attribute, or ``"unknown"``
        when that attribute is absent.
    """
    model = attrs.get(OTEL_GENAI_REQUEST_MODEL, "unknown")

    llm_response_raw = attrs.get(ADK_LLM_RESPONSE)
    if llm_response_raw:
        llm_response = parse_json(llm_response_raw)
        if isinstance(llm_response, dict):
            # ``usage_metadata`` may be serialized as JSON null (or be some other
            # non-object); treat that as "no usage" instead of raising.
            usage = llm_response.get("usage_metadata")
            if not isinstance(usage, dict):
                usage = {}
            # A null count is treated as 0 so that int() below cannot fail when
            # only one of the two counts is populated.
            input_toks = usage.get("prompt_token_count") or 0
            output_toks = usage.get("candidates_token_count") or 0
            if input_toks or output_toks:
                llm_request_raw = attrs.get(ADK_LLM_REQUEST)
                if llm_request_raw:
                    llm_request = parse_json(llm_request_raw)
                    # Only override with a real value; a null/empty model would
                    # otherwise leak a non-string into the returned tuple.
                    if isinstance(llm_request, dict) and llm_request.get("model"):
                        model = llm_request["model"]
                return int(input_toks), int(output_toks), model

    input_toks = attrs.get(OTEL_GENAI_USAGE_INPUT_TOKENS, 0)
    output_toks = attrs.get(OTEL_GENAI_USAGE_OUTPUT_TOKENS, 0)
    if isinstance(input_toks, (int, float)) and isinstance(output_toks, (int, float)):
        if input_toks or output_toks:
            return int(input_toks), int(output_toks), model

    return 0, 0, model
140
+
141
+
142
def extract_tool_call_from_attrs(
    attrs: dict[str, Any], operation_name: str = "", span_id: str = ""
) -> dict[str, Any] | None:
    """Build a ``{id, name, args}`` description of a tool call from span attributes.

    Args:
        attrs: Flat mapping of span attribute names to values.
        operation_name: Span operation name; when the tool-name attribute is
            missing, a leading ``"execute_tool "`` is stripped to obtain the name.
        span_id: Used as the call id when no tool-call-id attribute is present.

    Returns:
        A dict with keys ``id``, ``name`` and ``args``, or ``None`` when no tool
        name can be determined.
    """
    prefix = "execute_tool "

    name = attrs.get(OTEL_GENAI_TOOL_NAME)
    if not name:
        if not operation_name.startswith(prefix):
            return None
        name = operation_name[len(prefix) :]

    call_id = attrs.get(OTEL_GENAI_TOOL_CALL_ID) or span_id or "unknown"

    # GenAI arguments attribute first, ADK arguments attribute second.
    raw_args = attrs.get(OTEL_GENAI_TOOL_CALL_ARGUMENTS) or attrs.get(ADK_TOOL_CALL_ARGS)

    call_args: dict = {}
    if raw_args:
        decoded = parse_json_attr(raw_args, "tool.call.arguments")
        if isinstance(decoded, dict):
            call_args = decoded

    if not call_args:
        # Last resort: look the call up in the input messages.
        input_messages = attrs.get(OTEL_GENAI_INPUT_MESSAGES)
        if input_messages:
            msg_args, msg_id = extract_tool_call_args_from_messages(input_messages, name)
            if msg_args:
                call_args = msg_args
            if msg_id:
                call_id = msg_id

    return {"id": call_id, "name": name, "args": call_args}
175
+
176
+
177
def parse_tool_response_content(content: Any) -> dict:
    """Normalize raw tool-response content into a dict.

    Args:
        content: The raw response; may be a dict, a string, or anything else.

    Returns:
        * a dict is returned unchanged;
        * a string is JSON-decoded: a decoded object is returned as-is, any
          other decoded value becomes ``{"result": str(value)}``, and text that
          is not valid JSON becomes ``{"result": content}``;
        * any other type becomes ``{"result": str(content)}``.
    """
    if isinstance(content, dict):
        return content
    if not isinstance(content, str):
        return {"result": str(content)}

    try:
        decoded = json.loads(content)
    except (json.JSONDecodeError, TypeError):
        # Not JSON: keep the raw text.
        return {"result": content}

    if isinstance(decoded, dict):
        return decoded
    return {"result": str(decoded)}
192
+
193
+
194
def extract_tool_result_from_attrs(attrs: dict[str, Any]) -> dict[str, Any] | None:
    """Extract a tool's result from span attributes, ADK-first.

    Sources, in order:
      1. the ADK tool-response attribute;
      2. the GenAI tool-call-result attribute;
      3. ``tool_call_response`` parts inside the GenAI output messages.

    Args:
        attrs: Flat mapping of span attribute names to values.

    Returns:
        ``{"response": <dict>, "isError": <bool>}`` where ``isError`` mirrors the
        truthiness of the response's own ``isError`` entry, or ``None`` when no
        result is present.
    """
    direct = attrs.get(ADK_TOOL_RESPONSE) or attrs.get(OTEL_GENAI_TOOL_CALL_RESULT)
    if direct:
        payload = parse_tool_response_content(direct)
        # An empty parsed dict falls through to the output-messages lookup.
        if payload:
            failed = bool(payload.get("isError", False))
            return {"response": payload, "isError": failed}

    raw_messages = attrs.get(OTEL_GENAI_OUTPUT_MESSAGES)
    if not raw_messages:
        return None

    decoded = parse_json_attr(raw_messages, "gen_ai.output.messages")
    if not isinstance(decoded, list):
        return None

    for message in decoded:
        if not isinstance(message, dict):
            continue
        for part in message.get("parts", []):
            if not isinstance(part, dict):
                continue
            if part.get("type") != "tool_call_response" or "response" not in part:
                continue

            body = part["response"]
            if isinstance(body, dict):
                payload = body
            elif isinstance(body, list):
                # A list response contributes the text of its dict items.
                snippets = [item.get("text", "") for item in body if isinstance(item, dict) and "text" in item]
                payload = parse_tool_response_content(" ".join(snippets))
            else:
                # Any other response shape is skipped.
                continue
            return {"response": payload, "isError": bool(payload.get("isError", False))}

    return None
236
+
237
+
238
+ # ---------------------------------------------------------------------------
239
+ # Span classification helpers
240
+ # ---------------------------------------------------------------------------
241
+
242
+
243
def is_adk_scope(span: Span) -> bool:
    """Return True when the span's ``OTEL_SCOPE`` tag equals ``ADK_SCOPE_VALUE``."""
    scope = span.get_tag(OTEL_SCOPE)
    return scope == ADK_SCOPE_VALUE
245
+
246
+
247
def is_llm_span(span: Span) -> bool:
    """Return True when the span has a request-model tag or an input-messages tag.

    Presence is tested with ``is not None``, so an empty-string tag still counts.
    """
    if span.get_tag(OTEL_GENAI_REQUEST_MODEL) is not None:
        return True
    return span.get_tag(OTEL_GENAI_INPUT_MESSAGES) is not None
249
+
250
+
251
def is_tool_span(span: Span) -> bool:
    """Return True when the span carries a tool-name tag (any non-None value)."""
    tool_name = span.get_tag(OTEL_GENAI_TOOL_NAME)
    return tool_name is not None
253
+
254
+
255
def is_invocation_span(span: Span) -> bool:
    """Decide whether a span represents an agent invocation.

    A span qualifies when its ``OTEL_GENAI_OP`` tag equals ``"invoke_agent"``.
    Otherwise the lower-cased operation name is searched for any of the
    substrings ``agent``, ``chain``, ``executor`` or ``workflow``.
    """
    if span.get_tag(OTEL_GENAI_OP) == "invoke_agent":
        return True

    lowered = span.operation_name.lower()
    for keyword in ("agent", "chain", "executor", "workflow"):
        if keyword in lowered:
            return True
    return False
268
+
269
+
270
+ # ---------------------------------------------------------------------------
271
+ # OTLP attribute flattening (shared by incremental_processor and processor)
272
+ # ---------------------------------------------------------------------------
273
+
274
+
275
def flatten_otlp_attributes(attrs_list: list[dict]) -> dict[str, Any]:
    """Convert OTLP attributes array [{key, value: {stringValue|...}}] to flat dict.

    Only scalar values are flattened: ``stringValue`` is copied as-is,
    ``intValue`` is passed through ``int()``, ``doubleValue`` through
    ``float()``, and ``boolValue`` is copied as-is. Entries with any other
    value kind are omitted from the result.

    Malformed input is tolerated rather than raised on: a ``None`` attribute
    list yields an empty dict, and entries that are not dicts or whose
    ``value`` is missing, null or not a dict are skipped.

    Args:
        attrs_list: OTLP attribute entries, each a dict with ``key`` and ``value``.

    Returns:
        Mapping of attribute key to its Python scalar. An entry without a
        ``key`` is stored under the empty string.
    """
    result: dict[str, Any] = {}
    for attr in attrs_list or ():
        if not isinstance(attr, dict):
            continue
        value_obj = attr.get("value")
        # ``"value": null`` would otherwise fail the membership tests below.
        if not isinstance(value_obj, dict):
            continue
        key = attr.get("key", "")
        if "stringValue" in value_obj:
            result[key] = value_obj["stringValue"]
        elif "intValue" in value_obj:
            result[key] = int(value_obj["intValue"])
        elif "doubleValue" in value_obj:
            result[key] = float(value_obj["doubleValue"])
        elif "boolValue" in value_obj:
            result[key] = value_obj["boolValue"]
    return result
290
+
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # Format-aware extractor strategy
294
+ # ---------------------------------------------------------------------------
295
+
296
+
297
class TraceFormatExtractor(Protocol):
    """Structural interface shared by the trace-format extractors in this module."""

    def detect(self, trace: Trace) -> bool:
        """Return whether *trace* is in the format this extractor handles."""

    def format_name(self) -> str:
        """Return the short name of the format."""

    def find_invocation_spans(self, trace: Trace) -> list[Span]:
        """Return the spans of *trace* treated as agent invocations."""

    def find_llm_spans_in(self, root: Span) -> list[Span]:
        """Return the LLM spans located under *root*."""

    def find_tool_spans_in(self, root: Span) -> list[Span]:
        """Return the tool spans located under *root*."""

    def classify_span(self, span: Span) -> str | None:
        """Return 'llm', 'tool', 'invocation', or None."""
306
+
307
+
308
class AdkExtractor:
    """Extractor for traces containing spans that satisfy ``is_adk_scope``.

    Span roles are recognized by operation-name prefix: ``invoke_agent``,
    ``call_llm`` and ``execute_tool``.
    """

    def detect(self, trace: Trace) -> bool:
        """Return True if any span of *trace* is ADK-scoped.

        The first ``FORMAT_DETECTION_SPAN_LIMIT`` spans are checked before the
        rest; every span is examined if no match is found earlier.
        """
        spans = trace.all_spans
        if any(is_adk_scope(candidate) for candidate in spans[:FORMAT_DETECTION_SPAN_LIMIT]):
            return True
        return any(is_adk_scope(candidate) for candidate in spans[FORMAT_DETECTION_SPAN_LIMIT:])

    def format_name(self) -> str:
        """Return the format identifier ``"adk"``."""
        return "adk"

    def find_invocation_spans(self, trace: Trace) -> list[Span]:
        """Return ADK-scoped ``invoke_agent*`` spans of *trace*, ordered by start time."""
        return sorted(
            (
                candidate
                for candidate in trace.all_spans
                if is_adk_scope(candidate) and candidate.operation_name.startswith("invoke_agent")
            ),
            key=lambda candidate: candidate.start_time,
        )

    def find_llm_spans_in(self, root: Span) -> list[Span]:
        """Return descendants of *root* named ``call_llm*``, ordered by start time."""
        return self._descendants_with_prefix(root, "call_llm")

    def find_tool_spans_in(self, root: Span) -> list[Span]:
        """Return descendants of *root* named ``execute_tool*``, ordered by start time."""
        return self._descendants_with_prefix(root, "execute_tool")

    def classify_span(self, span: Span) -> str | None:
        """Return 'invocation', 'llm' or 'tool' for an ADK-scoped span, else None."""
        if not is_adk_scope(span):
            return None
        operation = span.operation_name
        for prefix, label in (
            ("invoke_agent", "invocation"),
            ("call_llm", "llm"),
            ("execute_tool", "tool"),
        ):
            if operation.startswith(prefix):
                return label
        return None

    def _descendants_with_prefix(self, root: Span, prefix: str) -> list[Span]:
        """Collect descendants whose operation name starts with *prefix*, sorted by start time."""
        found: list[Span] = []
        self._walk(root, lambda node: node.operation_name.startswith(prefix), found)
        return sorted(found, key=lambda node: node.start_time)

    @staticmethod
    def _walk(span: Span, predicate, acc: list[Span]) -> None:
        """Append to *acc*, in pre-order, every descendant of *span* matching *predicate*.

        *span* itself is never tested; only its children and their descendants are.
        """
        # Explicit stack; children are pushed reversed so they pop in original order.
        pending = list(span.children)[::-1]
        while pending:
            node = pending.pop()
            if predicate(node):
                acc.append(node)
            pending.extend(list(node.children)[::-1])
355
+
356
+
357
class GenAIExtractor:
    """Extractor for traces whose spans carry GenAI request-model or input-messages tags."""

    def detect(self, trace: Trace) -> bool:
        """Return True if any span has a truthy request-model or input-messages tag.

        The first ``FORMAT_DETECTION_SPAN_LIMIT`` spans are checked before the
        rest; every span is examined if no match is found earlier.
        """

        def tagged(candidate: Span) -> bool:
            return bool(
                candidate.get_tag(OTEL_GENAI_REQUEST_MODEL) or candidate.get_tag(OTEL_GENAI_INPUT_MESSAGES)
            )

        spans = trace.all_spans
        if any(tagged(candidate) for candidate in spans[:FORMAT_DETECTION_SPAN_LIMIT]):
            return True
        return any(tagged(candidate) for candidate in spans[FORMAT_DETECTION_SPAN_LIMIT:])

    def format_name(self) -> str:
        """Return the format identifier ``"genai"``."""
        return "genai"

    def find_invocation_spans(self, trace: Trace) -> list[Span]:
        """Pick the root spans to treat as invocations, ordered by start time.

        Preference order: roots satisfying ``is_invocation_span``; else roots
        with an LLM span somewhere beneath them; else roots that are themselves
        LLM spans; else every root span.
        """
        roots = trace.root_spans
        chosen = [candidate for candidate in roots if is_invocation_span(candidate)]
        if not chosen:
            chosen = [candidate for candidate in roots if self._has_llm_children(candidate)]
        if not chosen and roots:
            chosen = [candidate for candidate in roots if is_llm_span(candidate)] or list(roots)
        chosen.sort(key=lambda candidate: candidate.start_time)
        return chosen

    def find_llm_spans_in(self, root: Span) -> list[Span]:
        """Return LLM spans in the subtree at *root* (root included), ordered by start time."""
        found: list[Span] = []
        self._walk(root, is_llm_span, found)
        return sorted(found, key=lambda node: node.start_time)

    def find_tool_spans_in(self, root: Span) -> list[Span]:
        """Return tool spans in the subtree at *root* (root included), ordered by start time."""
        found: list[Span] = []
        self._walk(root, is_tool_span, found)
        return sorted(found, key=lambda node: node.start_time)

    def classify_span(self, span: Span) -> str | None:
        """Return 'invocation', 'llm' or 'tool' (tested in that order), else None."""
        for label, matches in (
            ("invocation", is_invocation_span),
            ("llm", is_llm_span),
            ("tool", is_tool_span),
        ):
            if matches(span):
                return label
        return None

    @staticmethod
    def _has_llm_children(span: Span) -> bool:
        """Return True if any descendant of *span* (at any depth) is an LLM span."""
        pending = list(span.children)[::-1]
        while pending:
            node = pending.pop()
            if is_llm_span(node):
                return True
            pending.extend(list(node.children)[::-1])
        return False

    @staticmethod
    def _walk(span: Span, predicate, acc: list[Span]) -> None:
        """Append to *acc*, in pre-order, *span* and every descendant matching *predicate*."""
        # Explicit stack; children are pushed reversed so they pop in original order.
        pending = [span]
        while pending:
            node = pending.pop()
            if predicate(node):
                acc.append(node)
            pending.extend(list(node.children)[::-1])
416
+
417
+
418
# Registry: ADK checked first (richer data, more specific detection).
_EXTRACTORS: list[TraceFormatExtractor] = [AdkExtractor(), GenAIExtractor()]  # type: ignore[list-item]


def get_extractor(trace: Trace) -> TraceFormatExtractor:
    """Select the extractor to use for *trace*.

    The registered extractors are asked, in registry order, whether they detect
    the trace; the first that does is returned. When none does, a warning is
    logged and the first registered extractor (ADK) is returned.
    """
    matched = next((candidate for candidate in _EXTRACTORS if candidate.detect(trace)), None)
    if matched is None:
        logger.warning("Trace %s: no format detected, defaulting to ADK", trace.trace_id)
        return _EXTRACTORS[0]

    logger.debug("Trace %s: detected format %s", trace.trace_id, matched.format_name())
    return matched
429
+
430
+
431
+ # ---------------------------------------------------------------------------
432
+ # Internal helpers
433
+ # ---------------------------------------------------------------------------
434
+
435
+
436
def parse_json(raw: str | dict | Any) -> dict | list | Any:
    """Best-effort JSON decoding of an attribute value.

    Args:
        raw: A dict or list (returned unchanged), a JSON string, or anything else.

    Returns:
        The dict/list itself, the value decoded from a JSON string (which may be
        any JSON type), or an empty dict when the string is not valid JSON or
        the input is of another type.
    """
    if isinstance(raw, (dict, list)):
        return raw
    if not isinstance(raw, str):
        return {}
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return {}