agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/config.py ADDED
@@ -0,0 +1,108 @@
1
+ """Configuration for agentevals runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated, Any, Literal
7
+
8
+ from pydantic import BaseModel, Field, field_validator
9
+
10
+
11
class BuiltinMetricDef(BaseModel):
    """A built-in ADK metric, optionally with threshold/judge overrides."""

    # Name of the built-in metric to run (e.g. one of the names accepted by
    # EvalRunConfig.metrics such as "tool_trajectory_avg_score").
    name: str
    # Discriminator value used to select this model in the CustomEvaluatorDef union.
    type: Literal["builtin"] = "builtin"
    # Per-metric score threshold; None presumably falls back to the run-level
    # threshold — confirm against the runner.
    threshold: float | None = None
    # Per-metric judge model; None presumably falls back to the run-level
    # judge_model — confirm against the runner.
    judge_model: str | None = None
18
+
19
+
20
class BaseEvaluatorDef(BaseModel):
    """Shared fields for all executable evaluator definitions."""

    # Display/reporting name of the evaluator.
    name: str
    # Score threshold for pass/fail of this evaluator.
    threshold: float = 0.5
    timeout: int = Field(default=30, description="Subprocess timeout in seconds.")
    # Free-form configuration dict passed to the evaluator implementation.
    config: dict[str, Any] = Field(default_factory=dict)
    executor: str = Field(default="local", description="Execution environment: 'local' or 'docker' (future).")
28
+
29
+
30
class CodeEvaluatorDef(BaseEvaluatorDef):
    """An evaluator implemented as an external code file (Python, JavaScript, etc.)."""

    # Discriminator value used to select this model in the CustomEvaluatorDef union.
    type: Literal["code"] = "code"
    path: str = Field(description="Path to the evaluator file (.py, .js, .ts, etc.).")

    @field_validator("path")
    @classmethod
    def _validate_extension(cls, v: str) -> str:
        """Reject paths whose file extension is not a supported evaluator language."""
        # Imported lazily to avoid a circular import at module load time.
        from .custom_evaluators import supported_extensions

        allowed = supported_extensions()
        suffix = Path(v).suffix.lower()
        if suffix in allowed:
            return v
        raise ValueError(f"Unsupported evaluator file extension '{suffix}'. Supported: {sorted(allowed)}")
46
+
47
+
48
class RemoteEvaluatorDef(BaseEvaluatorDef):
    """An evaluator fetched from a remote source (GitHub, registry, etc.)."""

    # Discriminator value used to select this model in the CustomEvaluatorDef union.
    type: Literal["remote"] = "remote"
    source: str = Field(default="github", description="Evaluator source (e.g. 'github').")
    ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
54
+
55
+
56
# Discriminated union of all evaluator definition shapes. Pydantic picks the
# concrete model by the "type" field: "builtin" | "code" | "remote".
CustomEvaluatorDef = Annotated[
    BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef,
    Field(discriminator="type"),
]
60
+
61
+
62
class EvalRunConfig(BaseModel):
    """Top-level configuration for a single agentevals run: which traces to
    load, which metrics/evaluators to apply, and how to emit results."""

    trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")

    eval_set_file: str | None = Field(
        default=None,
        description="Path to a golden eval set JSON file (ADK EvalSet format).",
    )

    metrics: list[str] = Field(
        default_factory=lambda: ["tool_trajectory_avg_score"],
        description="List of built-in metric names to evaluate.",
    )

    custom_evaluators: list[CustomEvaluatorDef] = Field(
        default_factory=list,
        description="Custom evaluator definitions.",
    )

    trace_format: str = Field(
        default="jaeger-json",
        description="Format of the trace files (jaeger-json or otlp-json).",
    )

    judge_model: str | None = Field(
        default=None,
        description="LLM model for judge-based metrics.",
    )

    threshold: float | None = Field(
        default=None,
        description="Score threshold for pass/fail.",
    )

    output_format: str = Field(
        default="table",
        description="Output format: 'table', 'json', or 'summary'.",
    )

    max_concurrent_traces: int = Field(
        default=10,
        description="Maximum number of traces to evaluate concurrently.",
    )

    max_concurrent_evals: int = Field(
        default=5,
        description="Maximum number of concurrent metric evaluations (LLM API calls).",
    )
@@ -0,0 +1,328 @@
1
+ """Convert trace spans into ADK Invocation objects.
2
+
3
+ Supports two trace formats:
4
+ 1. ADK format (gcp.vertex.agent scope with ADK-specific attributes)
5
+ 2. GenAI semantic conventions (standard gen_ai.* attributes from LangChain, Strands, etc.)
6
+
7
+ Automatically detects the format and routes to the appropriate converter.
8
+ Format detection checks span attributes and falls back to checking all spans if needed.
9
+ Explicit format can be specified via the format parameter to convert_trace().
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ from google.adk.evaluation.eval_case import IntermediateData, Invocation
19
+ from google.genai import types as genai_types
20
+
21
+ from .extraction import get_extractor, parse_json
22
+ from .loader.base import Span, Trace
23
+ from .trace_attrs import (
24
+ ADK_INVOCATION_ID,
25
+ ADK_LLM_REQUEST,
26
+ ADK_LLM_RESPONSE,
27
+ ADK_SCOPE_VALUE,
28
+ ADK_TOOL_CALL_ARGS,
29
+ ADK_TOOL_RESPONSE,
30
+ OTEL_GENAI_AGENT_NAME,
31
+ OTEL_GENAI_TOOL_CALL_ID,
32
+ OTEL_GENAI_TOOL_NAME,
33
+ OTEL_SCOPE,
34
+ )
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
@dataclass
class ConversionResult:
    """Outcome of converting one trace: the Invocations produced plus any
    non-fatal warnings collected along the way."""

    # ID of the source trace this result was derived from.
    trace_id: str
    # One Invocation per successfully converted invoke_agent span.
    invocations: list[Invocation] = field(default_factory=list)
    # Human-readable notes about spans that could not be converted.
    warnings: list[str] = field(default_factory=list)
44
+
45
+
46
def convert_trace(trace: Trace, format: str | None = None) -> ConversionResult:
    """Convert a trace to Invocation objects.

    Args:
        trace: The trace to convert.
        format: Optional explicit format ("adk" or "genai"). If None, auto-detects.

    Returns:
        ConversionResult with invocations and any warnings.
    """
    # Use lazy %-style logging args (not f-strings) so the message is only
    # formatted when INFO is actually enabled.
    if format is None:
        trace_format = _detect_trace_format(trace)
        logger.info("Auto-detected trace format: %s for trace %s", trace_format, trace.trace_id)
    else:
        trace_format = format
        logger.info("Using explicit trace format: %s for trace %s", trace_format, trace.trace_id)

    if trace_format == "genai":
        # Imported lazily so the GenAI converter is only loaded when needed.
        from .genai_converter import convert_genai_trace

        return convert_genai_trace(trace)
    # Anything that is not "genai" is treated as ADK format.
    return _convert_adk_trace(trace)
69
+
70
+
71
def _detect_trace_format(trace: Trace) -> str:
    """Detect trace format by delegating to the extractor registry."""
    extractor = get_extractor(trace)
    return extractor.format_name()
74
+
75
+
76
def _convert_adk_trace(trace: Trace) -> ConversionResult:
    """Convert an ADK-format trace: one Invocation per invoke_agent span.

    Conversion failures for individual spans are recorded as warnings
    rather than aborting the whole trace.
    """
    outcome = ConversionResult(trace_id=trace.trace_id)

    agent_spans = _find_adk_spans(trace, "invoke_agent")
    if not agent_spans:
        outcome.warnings.append(f"Trace {trace.trace_id}: no invoke_agent spans found")
        return outcome

    for span in agent_spans:
        try:
            outcome.invocations.append(_convert_invoke_span(span))
        except Exception as exc:
            msg = f"Trace {trace.trace_id}: failed to convert invoke_agent span {span.span_id}: {exc}"
            logger.warning(msg)
            outcome.warnings.append(msg)

    return outcome
94
+
95
+
96
def convert_traces(traces: list[Trace], format: str | None = None) -> list[ConversionResult]:
    """Convert multiple traces to Invocation objects.

    Args:
        traces: The traces to convert.
        format: Optional explicit format ("adk" or "genai") forwarded to
            convert_trace(). None (the default, matching the previous
            behavior) auto-detects per trace.

    Returns:
        One ConversionResult per input trace, in order.
    """
    return [convert_trace(t, format=format) for t in traces]
98
+
99
+
100
def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
    """Find spans with ``otel.scope.name == "gcp.vertex.agent"`` matching an operation prefix."""
    # operationName is e.g. "invoke_agent helm_agent" or "call_llm"
    adk_spans = [
        span
        for span in trace.all_spans
        if span.get_tag(OTEL_SCOPE) == ADK_SCOPE_VALUE
        and span.operation_name.startswith(operation)
    ]
    # sorted() is stable, so spans with equal start times keep trace order.
    return sorted(adk_spans, key=lambda s: s.start_time)
111
+
112
+
113
def _convert_invoke_span(invoke_span: Span) -> Invocation:
    """Build an Invocation from one invoke_agent span and its descendants.

    Raises:
        ValueError: if the span has no call_llm children to derive content from.
    """
    llm_spans = _find_children_by_op(invoke_span, "call_llm")
    if not llm_spans:
        raise ValueError(f"invoke_agent span {invoke_span.span_id} has no child call_llm spans")

    tool_spans = _find_children_by_op(invoke_span, "execute_tool")

    # First call_llm carries the user input; the last one the final answer.
    user_content = _extract_user_content(llm_spans[0])
    final_response = _extract_final_response(llm_spans[-1])
    tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans)

    # Falls back to the span ID when no ADK invocation ID tag is present.
    invocation_id = invoke_span.get_tag(ADK_INVOCATION_ID, invoke_span.span_id)

    return Invocation(
        invocation_id=invocation_id,
        user_content=user_content,
        final_response=final_response,
        intermediate_data=IntermediateData(
            tool_uses=tool_uses,
            tool_responses=tool_responses,
        ),
        # start_time is divided by 1e6 — presumably microseconds to seconds;
        # confirm against the loader's timestamp unit.
        creation_timestamp=invoke_span.start_time / 1_000_000.0,
    )
138
+
139
+
140
def _find_children_by_op(root: Span, op_prefix: str) -> list[Span]:
    """Collect all descendants of *root* whose operation name starts with
    *op_prefix*, ordered by start time (stable for equal timestamps)."""
    matched: list[Span] = []

    def visit(node: Span) -> None:
        # Depth-first pre-order; the root itself is never matched.
        for child in node.children:
            if child.operation_name.startswith(op_prefix):
                matched.append(child)
            visit(child)

    visit(root)
    return sorted(matched, key=lambda s: s.start_time)
145
+
146
+
147
+ def _walk(span: Span, op_prefix: str, acc: list[Span]) -> None:
148
+ for child in span.children:
149
+ if child.operation_name.startswith(op_prefix):
150
+ acc.append(child)
151
+ _walk(child, op_prefix, acc)
152
+
153
+
154
def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
    """Extract user input from the first call_llm span's llm_request tag.

    Raises:
        ValueError: if the request contains no user-role content at all.
    """
    request = parse_json(first_call_llm.get_tag(ADK_LLM_REQUEST, "{}"))
    contents = request.get("contents", [])

    # Prefer the most recent user entry that carries plain text parts;
    # entries holding only function_response parts are tool plumbing, not
    # an actual user message.
    for entry in reversed(contents):
        if entry.get("role") != "user":
            continue
        texts = [p for p in entry.get("parts", []) if "text" in p]
        if texts:
            return genai_types.Content(
                role="user",
                parts=[genai_types.Part(text=p["text"]) for p in texts],
            )

    # Fallback: the first user entry of any shape.
    for entry in contents:
        if entry.get("role") == "user":
            return _content_from_dict(entry)

    raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
177
+
178
+
179
def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
    """Extract final text response from the last call_llm span's llm_response tag.

    Raises:
        ValueError: if the response carries no content at all.
    """
    response = parse_json(last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}"))

    content_dict = response.get("content", {})
    if not content_dict:
        raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")

    # A proper final response has text parts, not function_call parts.
    texts = [p for p in content_dict.get("parts", []) if "text" in p]
    if texts:
        return genai_types.Content(
            role="model",
            parts=[genai_types.Part(text=p["text"]) for p in texts],
        )

    # Only function_call parts left — the agent run may have been cut
    # short. Warn, then convert whatever is there.
    logger.warning(
        "call_llm span %s: last llm_response has no text parts, may not be the actual final response",
        last_call_llm.span_id,
    )
    return _content_from_dict(content_dict)
204
+
205
+
206
def _extract_tool_trajectory(
    call_llm_spans: list[Span],
    tool_spans: list[Span],
) -> tuple[list[genai_types.FunctionCall], list[genai_types.FunctionResponse]]:
    """Extract tool calls and responses.

    Prefers execute_tool spans (which have actual execution results) over
    function_call parts in call_llm responses (which only have the LLM's
    request to call the tool, not the result). When falling back to the
    call_llm spans, no responses are available, so the second list is empty.
    """
    calls: list[genai_types.FunctionCall] = []
    responses: list[genai_types.FunctionResponse] = []

    if not tool_spans:
        for llm_span in call_llm_spans:
            calls.extend(_extract_function_calls_from_llm_response(llm_span))
        return calls, responses

    for tool_span in tool_spans:
        fc, fr = _extract_from_tool_span(tool_span)
        if fc:
            calls.append(fc)
        if fr:
            responses.append(fr)

    return calls, responses
232
+
233
+
234
def _extract_from_tool_span(
    tool_span: Span,
) -> tuple[genai_types.FunctionCall | None, genai_types.FunctionResponse | None]:
    """Build a FunctionCall — and a FunctionResponse when one was recorded —
    from a single execute_tool span.

    Returns:
        (call, response). Both are None when no tool name can be determined;
        the response alone is None when the span carries no response tag.
    """
    tool_name = tool_span.get_tag(OTEL_GENAI_TOOL_NAME)
    call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID)

    if not tool_name:
        # Fallback: parse tool name from operationName "execute_tool <name>"
        op = tool_span.operation_name
        if not op.startswith("execute_tool "):
            logger.warning("execute_tool span %s: no tool name found", tool_span.span_id)
            return None, None
        tool_name = op.removeprefix("execute_tool ")

    call_args = parse_json(tool_span.get_tag(ADK_TOOL_CALL_ARGS, "{}"))
    call = genai_types.FunctionCall(
        name=tool_name,
        args=call_args if call_args else {},
        id=call_id,
    )

    response = None
    raw_response = tool_span.get_tag(ADK_TOOL_RESPONSE)
    if raw_response:
        payload = parse_json(raw_response)
        # Response format varies: MCP uses {"content": [...], "isError": false},
        # other tools return flat dicts. We pass through as-is.
        response = genai_types.FunctionResponse(
            name=tool_name,
            response=payload if payload else {},
            id=call_id,
        )

    return call, response
271
+
272
+
273
def _extract_function_calls_from_llm_response(
    call_llm: Span,
) -> list[genai_types.FunctionCall]:
    """Collect FunctionCall objects from the function_call parts of a
    call_llm span's llm_response tag; parts of other kinds are ignored."""
    response = parse_json(call_llm.get_tag(ADK_LLM_RESPONSE, "{}"))
    parts = response.get("content", {}).get("parts", [])

    return [
        genai_types.FunctionCall(
            name=fc.get("name", ""),
            args=fc.get("args", {}),
            id=fc.get("id"),
        )
        for part in parts
        if (fc := part.get("function_call"))
    ]
294
+
295
+
296
def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
    """Build a genai Content from a raw dict. Handles text, function_call, and
    function_response parts; any other part kind is silently dropped."""
    parts: list[genai_types.Part] = []

    for raw in content_dict.get("parts", []):
        if "text" in raw:
            parts.append(genai_types.Part(text=raw["text"]))
        elif "function_call" in raw:
            call = raw["function_call"]
            fc = genai_types.FunctionCall(
                name=call.get("name", ""),
                args=call.get("args", {}),
                id=call.get("id"),
            )
            parts.append(genai_types.Part(function_call=fc))
        elif "function_response" in raw:
            resp = raw["function_response"]
            fr = genai_types.FunctionResponse(
                name=resp.get("name", ""),
                response=resp.get("response", {}),
                id=resp.get("id"),
            )
            parts.append(genai_types.Part(function_response=fr))

    # Role defaults to "user" when the source dict omits it.
    return genai_types.Content(role=content_dict.get("role", "user"), parts=parts)