agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Centralized OTel span attribute key constants.
|
|
2
|
+
|
|
3
|
+
Single source of truth for all attribute names used across the converter,
|
|
4
|
+
extraction, streaming, and runner modules.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# --- OTel instrumentation-scope attribute keys ---
OTEL_SCOPE = "otel.scope.name"
OTEL_SCOPE_VERSION = "otel.scope.version"

# --- Scope value used by Google ADK ---
ADK_SCOPE_VALUE = "gcp.vertex.agent"

# --- Standard OTel GenAI semantic-convention keys (gen_ai.*) ---
# Operation / agent / model identification
OTEL_GENAI_OP = "gen_ai.operation.name"
OTEL_GENAI_AGENT_NAME = "gen_ai.agent.name"
OTEL_GENAI_REQUEST_MODEL = "gen_ai.request.model"
# Message payloads
OTEL_GENAI_INPUT_MESSAGES = "gen_ai.input.messages"
OTEL_GENAI_OUTPUT_MESSAGES = "gen_ai.output.messages"
# Token accounting
OTEL_GENAI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
OTEL_GENAI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
# Tool invocation details
OTEL_GENAI_TOOL_NAME = "gen_ai.tool.name"
OTEL_GENAI_TOOL_CALL_ID = "gen_ai.tool.call.id"
OTEL_GENAI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
OTEL_GENAI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"

# --- ADK-specific custom attribute keys (gcp.vertex.agent.*) ---
ADK_LLM_REQUEST = "gcp.vertex.agent.llm_request"
ADK_LLM_RESPONSE = "gcp.vertex.agent.llm_response"
ADK_TOOL_CALL_ARGS = "gcp.vertex.agent.tool_call_args"
ADK_TOOL_RESPONSE = "gcp.vertex.agent.tool_response"
ADK_INVOCATION_ID = "gcp.vertex.agent.invocation_id"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Extract performance and metadata from trace spans."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .extraction import (
|
|
8
|
+
extract_agent_response_from_attrs,
|
|
9
|
+
extract_token_usage_from_attrs,
|
|
10
|
+
extract_user_text_from_attrs,
|
|
11
|
+
get_extractor,
|
|
12
|
+
)
|
|
13
|
+
from .trace_attrs import OTEL_GENAI_AGENT_NAME, OTEL_GENAI_REQUEST_MODEL
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _truncate(text: str, max_length: int = 200) -> str:
|
|
17
|
+
if len(text) <= max_length:
|
|
18
|
+
return text
|
|
19
|
+
return text[:max_length] + "..."
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _calc_percentiles(values: list[float]) -> dict[str, float]:
|
|
23
|
+
if not values:
|
|
24
|
+
return {"p50": 0.0, "p95": 0.0, "p99": 0.0}
|
|
25
|
+
import statistics
|
|
26
|
+
|
|
27
|
+
sorted_values = sorted(values)
|
|
28
|
+
n = len(sorted_values)
|
|
29
|
+
return {
|
|
30
|
+
"p50": statistics.median(sorted_values),
|
|
31
|
+
"p95": sorted_values[int(n * 0.95)] if n > 1 else sorted_values[0],
|
|
32
|
+
"p99": sorted_values[int(n * 0.99)] if n > 1 else sorted_values[0],
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_performance_metrics(trace, extractor=None) -> dict[str, Any]:
    """Collect latency percentiles and token totals for *trace*.

    Args:
        trace: Object exposing ``root_spans`` and ``all_spans``.
        extractor: Span extractor; resolved with ``get_extractor(trace)`` when omitted.

    Returns:
        ``{"latency": {...}, "tokens": {...}}`` where latency entries are
        percentile dicts for overall, LLM-call and tool-execution durations,
        and tokens holds summed prompt/output/total counts plus per-call
        percentiles.
    """
    if extractor is None:
        extractor = get_extractor(trace)
    invocations = extractor.find_invocation_spans(trace)

    # span.duration is divided by 1000 throughout — presumably µs -> ms; confirm against the loader.
    overall_ms = []
    if not invocations and trace.root_spans:
        # No invocation spans were recognized: fall back to root-span durations.
        overall_ms.extend(root.duration / 1000.0 for root in trace.root_spans)
    overall_ms.extend(inv.duration / 1000.0 for inv in invocations)

    llm_ms = []
    tool_ms = []
    prompt_counts = []
    output_counts = []
    combined_counts = []

    for span in trace.all_spans:
        elapsed_ms = span.duration / 1000.0
        role = extractor.classify_span(span)

        if role == "tool":
            tool_ms.append(elapsed_ms)
            continue
        if role != "llm":
            continue

        llm_ms.append(elapsed_ms)
        in_toks, out_toks, _ = extract_token_usage_from_attrs(span.tags)
        if not (in_toks or out_toks):
            # Calls that report no usage are left out of the token statistics.
            continue
        prompt_counts.append(in_toks)
        output_counts.append(out_toks)
        combined_counts.append(in_toks + out_toks)

    if combined_counts:
        per_call = _calc_percentiles(combined_counts)
    else:
        per_call = {"p50": 0.0, "p95": 0.0, "p99": 0.0}

    latency = {
        "overall": _calc_percentiles(overall_ms),
        "llm_calls": _calc_percentiles(llm_ms),
        "tool_executions": _calc_percentiles(tool_ms),
    }
    tokens = {
        "total_prompt": sum(prompt_counts),
        "total_output": sum(output_counts),
        "total": sum(combined_counts),
        "per_llm_call": per_call,
    }
    return {"latency": latency, "tokens": tokens}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
    """Build a summary dict (agent name, model, start time, text previews) for *trace*.

    Values come from the first invocation span and its LLM spans when present.
    The agent name falls back to the first root span's operation name, and the
    model falls back to the first span in the trace carrying a request-model tag.
    Fields that cannot be determined stay ``None``.

    Args:
        trace: Object exposing ``root_spans`` and ``all_spans``.
        extractor: Span extractor; resolved with ``get_extractor(trace)`` when omitted.
    """
    if extractor is None:
        extractor = get_extractor(trace)

    agent_name = None
    model = None
    start_time = None
    user_preview = None
    output_preview = None

    invocations = extractor.find_invocation_spans(trace)
    if invocations:
        head = invocations[0]
        agent_name = head.get_tag(OTEL_GENAI_AGENT_NAME)
        start_time = head.start_time

        llm_spans = extractor.find_llm_spans_in(head)
        if llm_spans:
            opening, closing = llm_spans[0], llm_spans[-1]
            model = opening.get_tag(OTEL_GENAI_REQUEST_MODEL)

            # The first LLM span supplies the user prompt, the last one the final answer.
            prompt = extract_user_text_from_attrs(opening.tags)
            if prompt:
                user_preview = _truncate(prompt)

            answer = extract_agent_response_from_attrs(closing.tags)
            if answer:
                output_preview = _truncate(answer)

    if not agent_name and trace.root_spans:
        agent_name = trace.root_spans[0].operation_name

    if not model:
        # Lazily scan every span and stop at the first non-empty model tag.
        tagged = (span.get_tag(OTEL_GENAI_REQUEST_MODEL) for span in trace.all_spans)
        fallback = next((value for value in tagged if value), None)
        if fallback is not None:
            model = fallback

    return {
        "agent_name": agent_name,
        "model": model,
        "start_time": start_time,
        "user_input_preview": user_preview,
        "final_output_preview": output_preview,
    }
|
|
File without changes
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Utilities for parsing OTel GenAI semantic convention message formats.
|
|
2
|
+
|
|
3
|
+
Supports two message formats:
|
|
4
|
+
- Content-based (e.g. opentelemetry-instrumentation-openai-v2):
|
|
5
|
+
{"role": "user", "content": "Hello"}
|
|
6
|
+
{"role": "assistant", "content": "...", "tool_calls": [{"type": "function", ...}]}
|
|
7
|
+
|
|
8
|
+
- Parts-based (OTel GenAI semconv v1.36.0+):
|
|
9
|
+
{"role": "user", "parts": [{"type": "text", "content": "Hello"}]}
|
|
10
|
+
{"role": "assistant", "parts": [{"type": "tool_call", "name": "...", "arguments": {...}}]}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Role-name groups exported for callers; the functions in this module do not reference them.
USER_ROLES = ("user", "human")
ASSISTANT_ROLES = ("assistant", "model", "ai")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_json_attr(raw: str | dict | list | Any, tag_name: str = "") -> dict | list | Any:
|
|
26
|
+
"""Parse a JSON string from an OTel span attribute value.
|
|
27
|
+
|
|
28
|
+
If *raw* is already a dict or list it is returned as-is.
|
|
29
|
+
Returns ``{}`` on parse failure.
|
|
30
|
+
"""
|
|
31
|
+
if isinstance(raw, (dict, list)):
|
|
32
|
+
return raw
|
|
33
|
+
if isinstance(raw, str):
|
|
34
|
+
try:
|
|
35
|
+
return json.loads(raw)
|
|
36
|
+
except json.JSONDecodeError:
|
|
37
|
+
logger.warning("Failed to parse JSON in %s: %s", tag_name, raw[:200])
|
|
38
|
+
return {}
|
|
39
|
+
return {}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def extract_text_from_message(msg: dict) -> str:
    """Extract text content from a GenAI message in any supported format.

    Lookup order:
      1. ``msg["content"]`` as a non-empty string.
      2. ``msg["content"]`` as a list of dicts carrying a ``"text"`` entry.
      3. ``msg["parts"]`` entries of ``type == "text"``, reading ``"content"``
         and falling back to ``"text"``.

    Multiple fragments are joined with a single space. Values that are not
    strings (e.g. ``None`` or nested objects) are skipped rather than passed
    to ``str.join``, which would raise ``TypeError``.

    Args:
        msg: Message dictionary.

    Returns:
        The extracted text, or ``""`` when none is found.
    """
    content = msg.get("content")
    if isinstance(content, str) and content:
        return content
    if isinstance(content, list):
        fragments = [
            item["text"]
            for item in content
            if isinstance(item, dict) and isinstance(item.get("text"), str)
        ]
        if fragments:
            return " ".join(fragments)

    parts = msg.get("parts")
    if isinstance(parts, list):
        text_parts = []
        for part in parts:
            if not isinstance(part, dict) or part.get("type") != "text":
                continue
            text = part.get("content") or part.get("text", "")
            # Only non-empty strings are joinable; ignore anything else.
            if isinstance(text, str) and text:
                text_parts.append(text)
        if text_parts:
            return " ".join(text_parts)

    return ""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def extract_tool_calls_from_message(msg: dict) -> list[dict[str, Any]]:
    """Extract tool calls from a GenAI message in any supported format.

    The content-based ``tool_calls`` list is read first; the parts-based
    ``parts`` list is consulted only when that produced nothing. Malformed
    entries (non-dict items, or a ``"function"`` payload that is not a dict)
    are skipped instead of raising.

    Args:
        msg: Message dictionary.

    Returns:
        A normalized list of
            {"name": str, "id": str | None, "arguments": dict}
    """
    result: list[dict[str, Any]] = []

    tool_calls = msg.get("tool_calls")
    if isinstance(tool_calls, list):
        for tc in tool_calls:
            if not isinstance(tc, dict) or tc.get("type") != "function":
                continue
            func = tc.get("function")
            if not isinstance(func, dict):
                # Missing or malformed function payload: nothing to normalize.
                continue
            args = _parse_args(func.get("arguments", {}))
            result.append(
                {
                    "name": func.get("name", ""),
                    "id": tc.get("id"),
                    "arguments": args,
                }
            )

    if not result:
        parts = msg.get("parts")
        if isinstance(parts, list):
            for part in parts:
                if not isinstance(part, dict) or part.get("type") != "tool_call":
                    continue
                args = _parse_args(part.get("arguments", {}))
                result.append(
                    {
                        "name": part.get("name", ""),
                        "id": part.get("id"),
                        "arguments": args,
                    }
                )

    return result
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def extract_tool_call_args_from_messages(
    messages_raw: str | list | Any,
    tool_name: str,
) -> tuple[dict, str | None]:
    """Fallback lookup of a tool call's arguments and ID inside a messages attribute.

    Intended for tool spans that lack ``gen_ai.tool.call.arguments`` directly
    (e.g. Strands embeds the triggering tool_call in ``gen_ai.input.messages``).
    The first tool call named *tool_name* with non-empty arguments wins.

    Returns ``(args_dict, tool_call_id_or_None)``; ``({}, None)`` when the
    attribute is not a list or nothing matches.
    """
    parsed = parse_json_attr(messages_raw, "gen_ai.input.messages")
    if not isinstance(parsed, list):
        return {}, None

    # Lazily flatten every tool call of every dict-shaped message, in order.
    candidates = (
        call
        for entry in parsed
        if isinstance(entry, dict)
        for call in extract_tool_calls_from_message(entry)
    )
    for call in candidates:
        if call["name"] == tool_name and call["arguments"]:
            return call["arguments"], call.get("id")
    return {}, None
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _parse_args(args: Any) -> dict:
|
|
133
|
+
if isinstance(args, dict):
|
|
134
|
+
return args
|
|
135
|
+
if isinstance(args, str):
|
|
136
|
+
try:
|
|
137
|
+
parsed = json.loads(args)
|
|
138
|
+
if isinstance(parsed, dict):
|
|
139
|
+
return parsed
|
|
140
|
+
except json.JSONDecodeError:
|
|
141
|
+
logger.warning("Failed to parse tool call arguments JSON: %s", args[:200])
|
|
142
|
+
return {}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections import deque
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import UTC
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class BufferedLogRecord:
|
|
9
|
+
timestamp: str
|
|
10
|
+
level: str
|
|
11
|
+
logger_name: str
|
|
12
|
+
message: str
|
|
13
|
+
exc_text: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RingBufferLogHandler(logging.Handler):
|
|
17
|
+
def __init__(self, capacity: int = 1000):
|
|
18
|
+
super().__init__()
|
|
19
|
+
self._buffer: deque[BufferedLogRecord] = deque(maxlen=capacity)
|
|
20
|
+
|
|
21
|
+
def emit(self, record: logging.LogRecord) -> None:
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
|
|
24
|
+
self._buffer.append(
|
|
25
|
+
BufferedLogRecord(
|
|
26
|
+
timestamp=datetime.fromtimestamp(record.created, tz=UTC).isoformat(),
|
|
27
|
+
level=record.levelname,
|
|
28
|
+
logger_name=record.name,
|
|
29
|
+
message=self.format(record),
|
|
30
|
+
exc_text=record.exc_text,
|
|
31
|
+
)
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
def get_text(self) -> str:
|
|
35
|
+
lines = []
|
|
36
|
+
for r in self._buffer:
|
|
37
|
+
lines.append(f"[{r.timestamp}] {r.level} {r.logger_name}: {r.message}")
|
|
38
|
+
if r.exc_text:
|
|
39
|
+
lines.append(r.exc_text)
|
|
40
|
+
return "\n".join(lines)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
log_buffer = RingBufferLogHandler(capacity=1000)
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""Utilities for enriching OTel spans with GenAI log message content."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
|
|
9
|
+
from ..trace_attrs import (
|
|
10
|
+
OTEL_GENAI_AGENT_NAME,
|
|
11
|
+
OTEL_GENAI_INPUT_MESSAGES,
|
|
12
|
+
OTEL_GENAI_OUTPUT_MESSAGES,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def enrich_spans_with_logs(spans: list[dict], logs: list[dict], session_id: str | None = None) -> list[dict]:
|
|
19
|
+
"""Enrich spans with message content from GenAI logs.
|
|
20
|
+
|
|
21
|
+
This reconstructs gen_ai.input.messages and gen_ai.output.messages attributes
|
|
22
|
+
from log events so the converter can extract message content.
|
|
23
|
+
|
|
24
|
+
When logs carry a ``span_id`` (OTLP path), each span is enriched only with
|
|
25
|
+
its own logs. When logs lack ``span_id`` (WebSocket SDK path), all messages
|
|
26
|
+
are injected into every span (legacy behavior).
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
spans: List of OTLP span dictionaries
|
|
30
|
+
logs: List of GenAI log event dictionaries
|
|
31
|
+
session_id: Optional session ID to add as agent.name attribute
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
List of enriched span dictionaries with message attributes added
|
|
35
|
+
"""
|
|
36
|
+
if not logs:
|
|
37
|
+
return spans
|
|
38
|
+
|
|
39
|
+
logger.debug("Enriching %d spans with %d logs", len(spans), len(logs))
|
|
40
|
+
|
|
41
|
+
has_span_ids = any(log.get("span_id") for log in logs)
|
|
42
|
+
|
|
43
|
+
if has_span_ids:
|
|
44
|
+
return _enrich_per_span(spans, logs, session_id)
|
|
45
|
+
return _enrich_broadcast(spans, logs, session_id)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _extract_messages_from_logs(
|
|
49
|
+
logs: list[dict],
|
|
50
|
+
) -> tuple[list[dict], list[dict]]:
|
|
51
|
+
"""Extract deduplicated input/output messages from a list of log events."""
|
|
52
|
+
input_messages = []
|
|
53
|
+
output_messages = []
|
|
54
|
+
seen_user = set()
|
|
55
|
+
seen_assistant = set()
|
|
56
|
+
|
|
57
|
+
for log in logs:
|
|
58
|
+
event_name = log.get("event_name", "")
|
|
59
|
+
body = log.get("body", {})
|
|
60
|
+
|
|
61
|
+
if not isinstance(body, dict):
|
|
62
|
+
continue
|
|
63
|
+
|
|
64
|
+
if event_name == "gen_ai.user.message":
|
|
65
|
+
user_content = body.get("content", "")
|
|
66
|
+
if user_content and user_content not in seen_user:
|
|
67
|
+
input_messages.append({"role": "user", "content": user_content})
|
|
68
|
+
seen_user.add(user_content)
|
|
69
|
+
|
|
70
|
+
elif event_name in ("gen_ai.assistant.message", "gen_ai.choice"):
|
|
71
|
+
if event_name == "gen_ai.choice":
|
|
72
|
+
nested = body.get("message", {}) if isinstance(body.get("message"), dict) else {}
|
|
73
|
+
assistant_content = body.get("content") or nested.get("content") or ""
|
|
74
|
+
tool_calls = nested.get("tool_calls", [])
|
|
75
|
+
else:
|
|
76
|
+
assistant_content = body.get("content") or ""
|
|
77
|
+
tool_calls = body.get("tool_calls", [])
|
|
78
|
+
|
|
79
|
+
message_key = f"{assistant_content}:{json.dumps(tool_calls) if tool_calls else ''}"
|
|
80
|
+
|
|
81
|
+
if (assistant_content or tool_calls) and message_key not in seen_assistant:
|
|
82
|
+
assistant_msg = {"role": "assistant", "content": assistant_content}
|
|
83
|
+
if tool_calls:
|
|
84
|
+
assistant_msg["tool_calls"] = tool_calls
|
|
85
|
+
output_messages.append(assistant_msg)
|
|
86
|
+
seen_assistant.add(message_key)
|
|
87
|
+
|
|
88
|
+
return input_messages, output_messages
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _inject_messages(
|
|
92
|
+
span: dict,
|
|
93
|
+
input_messages: list[dict],
|
|
94
|
+
output_messages: list[dict],
|
|
95
|
+
session_id: str | None,
|
|
96
|
+
) -> dict:
|
|
97
|
+
"""Create a copy of *span* with message attributes injected."""
|
|
98
|
+
span_copy = span.copy()
|
|
99
|
+
attrs = list(span_copy.get("attributes", []))
|
|
100
|
+
span_copy["attributes"] = attrs
|
|
101
|
+
|
|
102
|
+
if input_messages:
|
|
103
|
+
attrs.append(
|
|
104
|
+
{
|
|
105
|
+
"key": OTEL_GENAI_INPUT_MESSAGES,
|
|
106
|
+
"value": {"stringValue": json.dumps(input_messages)},
|
|
107
|
+
}
|
|
108
|
+
)
|
|
109
|
+
if output_messages:
|
|
110
|
+
attrs.append(
|
|
111
|
+
{
|
|
112
|
+
"key": OTEL_GENAI_OUTPUT_MESSAGES,
|
|
113
|
+
"value": {"stringValue": json.dumps(output_messages)},
|
|
114
|
+
}
|
|
115
|
+
)
|
|
116
|
+
if session_id:
|
|
117
|
+
attrs.append(
|
|
118
|
+
{
|
|
119
|
+
"key": OTEL_GENAI_AGENT_NAME,
|
|
120
|
+
"value": {"stringValue": session_id},
|
|
121
|
+
}
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
return span_copy
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _enrich_per_span(
    spans: list[dict],
    logs: list[dict],
    session_id: str | None,
) -> list[dict]:
    """Enrich each span with only the logs emitted within that span's context.

    Logs are grouped by their ``span_id`` (logs without one are ignored) and
    matched against each span's ``spanId``. Spans with matching logs get
    message attributes injected; the rest are shallow-copied and, when
    *session_id* is truthy, tagged with the agent-name attribute.

    Args:
        spans: OTLP span dictionaries.
        logs: GenAI log event dictionaries.
        session_id: Optional session ID added as the agent-name attribute.

    Returns:
        A new list of span copies in the same order as *spans*.
    """
    logs_by_span: dict[str, list[dict]] = defaultdict(list)
    for log in logs:
        sid = log.get("span_id", "")
        if sid:
            logs_by_span[sid].append(log)

    enriched = []
    for span in spans:
        span_id = span.get("spanId", "")
        # .get() rather than indexing so the defaultdict gains no empty groups.
        span_logs = logs_by_span.get(span_id, [])

        if span_logs:
            input_msgs, output_msgs = _extract_messages_from_logs(span_logs)
            enriched.append(_inject_messages(span, input_msgs, output_msgs, session_id))
        else:
            span_copy = span.copy()
            if session_id:
                attrs = list(span_copy.get("attributes", []))
                attrs.append(
                    {
                        "key": OTEL_GENAI_AGENT_NAME,
                        "value": {"stringValue": session_id},
                    }
                )
                span_copy["attributes"] = attrs
            enriched.append(span_copy)

    # The match count is diagnostic only: compute it solely when DEBUG is
    # enabled, using a set so the cost is linear instead of groups x spans.
    if logger.isEnabledFor(logging.DEBUG):
        known_span_ids = {s.get("spanId") for s in spans}
        matched = sum(1 for sid in logs_by_span if sid in known_span_ids)
        logger.debug(
            "Per-span enrichment: %d log groups, %d matched to spans",
            len(logs_by_span),
            matched,
        )
    return enriched
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _enrich_broadcast(
    spans: list[dict],
    logs: list[dict],
    session_id: str | None,
) -> list[dict]:
    """Legacy enrichment: attach the full set of extracted messages to every span.

    When no messages can be extracted a warning is logged and *spans* is
    returned unchanged (the same list object, no copies).
    """
    user_msgs, assistant_msgs = _extract_messages_from_logs(logs)

    if not user_msgs and not assistant_msgs:
        logger.warning("No messages extracted from logs")
        return spans

    logger.debug(
        "Broadcast enrichment: %d user, %d assistant messages",
        len(user_msgs),
        len(assistant_msgs),
    )

    enriched = []
    for span in spans:
        enriched.append(_inject_messages(span, user_msgs, assistant_msgs, session_id))
    return enriched
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentevals-cli
|
|
3
|
+
Version: 0.5.2
|
|
4
|
+
Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: click>=8.0
|
|
8
|
+
Requires-Dist: fastapi>=0.115.0
|
|
9
|
+
Requires-Dist: google-adk[eval]>=1.25.0
|
|
10
|
+
Requires-Dist: httpx>=0.27.0
|
|
11
|
+
Requires-Dist: opentelemetry-proto>=1.36.0
|
|
12
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
13
|
+
Requires-Dist: python-multipart>=0.0.12
|
|
14
|
+
Requires-Dist: pyyaml>=6.0
|
|
15
|
+
Requires-Dist: tabulate>=0.9.0
|
|
16
|
+
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
17
|
+
Provides-Extra: live
|
|
18
|
+
Requires-Dist: httpx>=0.27.0; extra == 'live'
|
|
19
|
+
Requires-Dist: mcp>=1.26.0; extra == 'live'
|
|
20
|
+
Provides-Extra: streaming
|
|
21
|
+
Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'streaming'
|
|
22
|
+
Requires-Dist: websockets>=12.0; extra == 'streaming'
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
agentevals/__init__.py,sha256=NU0vD23WlyU5pexvoOMAsCuFvbJ-OJ9rfoJcnRl72dc,448
|
|
2
|
+
agentevals/_protocol.py,sha256=IHqAdPBp76boK_PQU_oFIvRDjZ0hehOyv75tyHIceAI,2578
|
|
3
|
+
agentevals/builtin_metrics.py,sha256=UQF8Gwthbvg-43YrVlULhGOb98qM11SnAezW7nD1-_Q,7944
|
|
4
|
+
agentevals/cli.py,sha256=8asqKXD4BN9Tf97ounqaOcHLEwc85tzKy7kvcjUiYM0,20231
|
|
5
|
+
agentevals/config.py,sha256=G1x2xIjkS3LGHLbDtCHXihiTP2ORYnFczLX4R-X0MwI,3383
|
|
6
|
+
agentevals/converter.py,sha256=TMF8zHTwgULjtq-F7CQt78uHB28YZPOxQQp2jpmcpWg,11377
|
|
7
|
+
agentevals/custom_evaluators.py,sha256=cBTPnxVVP7YHNdBXuoN3uGtNWFky46Cf2MbAf2tc3cM,15875
|
|
8
|
+
agentevals/eval_config_loader.py,sha256=cQOxu4VoUEmrCGe4srV9zWqF0-wrHmLUr-Jo8eF9znc,4908
|
|
9
|
+
agentevals/extraction.py,sha256=whCTbP0mzxx1aIBYKCNU99-z2_tJzGHhYUMjOKs35UU,16735
|
|
10
|
+
agentevals/genai_converter.py,sha256=d3n5LpAr84eMpdve5tDNXDl-THN66I6Y8YCDM3vfQ5E,20399
|
|
11
|
+
agentevals/mcp_server.py,sha256=bZVfrGqyYdjuxoF-bN22CS4R6J0Vj_ct3gBYc_cbxe8,9395
|
|
12
|
+
agentevals/output.py,sha256=KvmNpIPIMKp1VwuBv31nwNxaXRGR679_nSNJpCn5ehs,7263
|
|
13
|
+
agentevals/runner.py,sha256=5BI9sDNgmsHhR6welq6cS9yaT2LG6p3ULXNS4cNTZQI,11050
|
|
14
|
+
agentevals/sdk.py,sha256=5sXtmiIGonMyHeTdO04GKdFPZB5yeSG1jCarKTQ86wM,15796
|
|
15
|
+
agentevals/trace_attrs.py,sha256=fJ5PjRNjhbD0r-SPHKhOwX_oEFyjUViVLjus1HOsEnw,1263
|
|
16
|
+
agentevals/trace_metrics.py,sha256=WpLgDB1Z_b6LmpkwknK0rxxcc-wmbZfocBtN97_g_uk,4266
|
|
17
|
+
agentevals/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
agentevals/api/app.py,sha256=LBFUVcMrqwFLtpimqQLp1GCkVxibxLKEZ1I3SQB8Hzg,4088
|
|
19
|
+
agentevals/api/debug_routes.py,sha256=vrSjqUAShsO6WQh0OqX2ZtcJeVAV5DPuaK-k5_sYULg,9093
|
|
20
|
+
agentevals/api/models.py,sha256=caYsK7vj7Cpb6Juhr4UbJU1nkvqG_5CbGw4NOVbjJ5s,4451
|
|
21
|
+
agentevals/api/otlp_app.py,sha256=N6qtNY0_vX5N5WLrQ7fXi8EspM6CcOgS32nFoVmT6uU,605
|
|
22
|
+
agentevals/api/otlp_routes.py,sha256=QJRZW_jt-HZ47aUax94pxUhphMa25qPGza5bJfwIiSo,14165
|
|
23
|
+
agentevals/api/routes.py,sha256=nE3bEt1Tzm37UyMoMhGl1oRyq5lR2C_7xmNWrUz6Yx0,20987
|
|
24
|
+
agentevals/api/streaming_routes.py,sha256=Uj88WXzvFbOq4xA0FYa7t2NsyWpnCORvXQiqQtCYiWI,13167
|
|
25
|
+
agentevals/evaluator/__init__.py,sha256=_4n0uxBA6LcEGOuIfn6nTbFGD32O2llfxStOxTU2Lis,574
|
|
26
|
+
agentevals/evaluator/resolver.py,sha256=AXo6kBpvLfIq2ZLXfckNI58AixrMv7KYJaffb-yyBgg,2529
|
|
27
|
+
agentevals/evaluator/sources.py,sha256=SJ2MIAwrtmgtdnGscqC-9MjUp76jKBQUwnX-1rWUkBk,9591
|
|
28
|
+
agentevals/evaluator/templates.py,sha256=1kqcdhdmNsTotJlMydbZsvM8Tmgm4JXXa1B6B7MspwY,4914
|
|
29
|
+
agentevals/loader/__init__.py,sha256=EpnM_rHm7v17hDDzBUT3JzfuAqEZdHT5jJ8_z3jnayc,202
|
|
30
|
+
agentevals/loader/base.py,sha256=OTeoEyHbvUEbwzsYzuRHeK8Sx7fuLTafd9T4UI7cGSk,1503
|
|
31
|
+
agentevals/loader/jaeger.py,sha256=DgN7CJdN2w4OGONye90HIeUtYTv-vu9gFNesCAI9ULo,3568
|
|
32
|
+
agentevals/loader/otlp.py,sha256=alShML5ukryFKgt85VC1Nc6U69hK3zH7tcL36IcwP6w,7194
|
|
33
|
+
agentevals/streaming/__init__.py,sha256=WnAeBETNIK9KcA47vAQnL7s8WPFgknQLdy4laknGm9E,3939
|
|
34
|
+
agentevals/streaming/incremental_processor.py,sha256=b2DEf1SgyHT8EHcFeAhhiNYwpE039d8GYcw49guAsP4,14550
|
|
35
|
+
agentevals/streaming/processor.py,sha256=gXwwKzEzR6vWokKR2AIhnaQYYYmcE6Jk_fcBYMLonks,10886
|
|
36
|
+
agentevals/streaming/session.py,sha256=vUWzqq1CQigpkiDEqJqDM9LTyLw0Q39DilYy1Yw0XZE,1210
|
|
37
|
+
agentevals/streaming/ws_server.py,sha256=kkdavfBrcCoo5vnVz_FHAWZ2lARU6g9mSy41y5XChrc,32872
|
|
38
|
+
agentevals/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
agentevals/utils/genai_messages.py,sha256=BURKS3eoUQzmiwMuYPznv9IDwUKyzsiCI4klUlBTPVM,4704
|
|
40
|
+
agentevals/utils/log_buffer.py,sha256=bpDCPvfwBC2ZzOXXh0bmcbq3bSc1JixTSJZf9XqvG_M,1188
|
|
41
|
+
agentevals/utils/log_enrichment.py,sha256=SmhQknMoz3Pkbg0Rz1J83B1if74Qfh2rthrP6O21bD4,6117
|
|
42
|
+
agentevals_cli-0.5.2.dist-info/METADATA,sha256=SxxDY7HJtYtn7wqxUKM01RN4memFgtQcAAy6BsL5nq0,802
|
|
43
|
+
agentevals_cli-0.5.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
44
|
+
agentevals_cli-0.5.2.dist-info/entry_points.txt,sha256=lLyvQGvs92BySjju70F7byMFSAb9LTexKmSkwF4jkks,51
|
|
45
|
+
agentevals_cli-0.5.2.dist-info/licenses/LICENSE,sha256=Ox7lseFP2kBRXBjsLweW1jLmWiCyrKjwF8ZUvCbKd70,11310
|
|
46
|
+
agentevals_cli-0.5.2.dist-info/RECORD,,
|