agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,112 @@
1
+ """Jaeger JSON trace loader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from .base import Span, Trace, TraceLoader
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class JaegerJsonLoader(TraceLoader):
    """Loads traces from Jaeger JSON export files.

    Expected format::

        {
            "data": [
                {
                    "traceID": "...",
                    "spans": [
                        {
                            "traceID": "...",
                            "spanID": "...",
                            "operationName": "...",
                            "references": [{"refType": "CHILD_OF", "spanID": "..."}],
                            "startTime": <microseconds>,
                            "duration": <microseconds>,
                            "tags": [{"key": "...", "type": "...", "value": ...}],
                            ...
                        },
                        ...
                    ]
                },
                ...
            ]
        }
    """

    def format_name(self) -> str:
        """Return the identifier of the format handled by this loader."""
        return "jaeger-json"

    def load(self, source: str) -> list[Trace]:
        """Read a Jaeger JSON export and return the traces it contains.

        Args:
            source: Path to the Jaeger JSON file.

        Returns:
            One Trace per entry of the top-level ``data`` array that has at
            least one span; entries without spans are skipped.

        Raises:
            ValueError: If the document is not an object with a ``data`` key
                (``json.JSONDecodeError`` for malformed JSON is a subclass).
            OSError: If the file cannot be opened.
        """
        # JSON interchange text is UTF-8 (RFC 8259), so do not depend on the
        # platform's locale encoding when reading the export.
        with open(source, encoding="utf-8") as f:
            raw = json.load(f)

        if not isinstance(raw, dict) or "data" not in raw:
            raise ValueError(f"Invalid Jaeger JSON format: expected top-level 'data' key in {source}")

        traces: list[Trace] = []
        # A ``"data": null`` payload is treated as "no traces" rather than
        # failing with a TypeError on iteration.
        for trace_data in raw["data"] or []:
            trace = self._parse_trace(trace_data)
            if trace:
                traces.append(trace)

        logger.info("Loaded %d trace(s) from %s", len(traces), source)
        return traces

    def _parse_trace(self, trace_data: dict[str, Any]) -> Trace | None:
        """Build a Trace (with its span tree) from one ``data`` entry.

        Returns None, after logging a warning, when the entry has no spans.
        """
        trace_id = trace_data.get("traceID", "")
        raw_spans = trace_data.get("spans", [])

        if not raw_spans:
            logger.warning("Trace %s has no spans, skipping", trace_id)
            return None

        # Index spans by ID first so children can be attached regardless of
        # the order in which spans appear in the export.
        spans_by_id: dict[str, Span] = {}
        for raw_span in raw_spans:
            span = self._parse_span(raw_span)
            spans_by_id[span.span_id] = span

        root_spans: list[Span] = []
        for span in spans_by_id.values():
            parent = spans_by_id.get(span.parent_span_id) if span.parent_span_id else None
            if parent is not None:
                parent.children.append(span)
            else:
                # No parent, or the parent is not part of this export.
                root_spans.append(span)

        for span in spans_by_id.values():
            span.children.sort(key=lambda s: s.start_time)

        root_spans.sort(key=lambda s: s.start_time)

        return Trace(
            trace_id=trace_id,
            root_spans=root_spans,
            all_spans=list(spans_by_id.values()),
        )

    def _parse_span(self, raw_span: dict[str, Any]) -> Span:
        """Convert one raw Jaeger span dict into a Span.

        The parent is taken from the first ``CHILD_OF`` reference; other
        reference types are ignored. Missing fields fall back to empty
        strings / zero.
        """
        parent_span_id: str | None = None
        # ``or []`` tolerates an explicit ``"references": null``.
        for ref in raw_span.get("references") or []:
            if ref.get("refType") == "CHILD_OF":
                parent_span_id = ref.get("spanID")
                break

        # Jaeger tags are an array of {key, type, value} — flatten to dict
        tags: dict[str, Any] = {}
        for tag in raw_span.get("tags") or []:
            if "key" not in tag:
                # A tag without a key cannot be addressed; skip it instead of
                # aborting the whole file with a KeyError.
                logger.debug("Skipping tag without 'key' on span %s", raw_span.get("spanID", ""))
                continue
            tags[tag["key"]] = tag.get("value")

        return Span(
            trace_id=raw_span.get("traceID", ""),
            span_id=raw_span.get("spanID", ""),
            parent_span_id=parent_span_id,
            operation_name=raw_span.get("operationName", ""),
            start_time=raw_span.get("startTime", 0),
            duration=raw_span.get("duration", 0),
            tags=tags,
        )
@@ -0,0 +1,193 @@
1
+ """OTLP/JSON trace loader for native OpenTelemetry format."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+
8
+ from ..trace_attrs import (
9
+ OTEL_GENAI_INPUT_MESSAGES,
10
+ OTEL_GENAI_OUTPUT_MESSAGES,
11
+ OTEL_SCOPE,
12
+ OTEL_SCOPE_VERSION,
13
+ )
14
+ from .base import Span, Trace, TraceLoader
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OtlpJsonLoader(TraceLoader):
    """Loads traces from OTLP/JSON format (native OpenTelemetry format).

    Supports two formats:
    1. Full OTLP export with resourceSpans structure
    2. JSONL format - one span per line (for streaming use cases)

    OTLP uses nanosecond timestamps - these are converted to microseconds
    to match the internal Span representation.
    """

    # Event attribute keys that _promote_genai_event_attributes copies onto
    # the span's own attributes.
    _GENAI_EVENT_KEYS = {OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES}

    def format_name(self) -> str:
        """Return the identifier of the format handled by this loader."""
        return "otlp-json"

    def load(self, source: str) -> list[Trace]:
        """Load OTLP JSON file or JSONL (one span per line).

        Args:
            source: Path to the trace file.

        Returns:
            The traces found in the file; an empty list for an empty file.

        Raises:
            json.JSONDecodeError: If the content is neither a full OTLP export
                nor valid JSONL.
        """
        # NOTE(review): the file is read with the platform default encoding.
        # summarize_session in mcp_server writes its temporary file the same
        # way, so any move to an explicit encoding must change both together.
        with open(source) as f:
            content = f.read().strip()

        if not content:
            logger.warning("Empty trace file: %s", source)
            return []

        export = self._try_parse_export(content)
        if export is not None:
            # Conversion errors raised here propagate to the caller. They are
            # no longer swallowed and retried as JSONL, which used to hide
            # the real problem behind an unrelated parse result.
            traces = self._parse_otlp_export(export)
        else:
            spans_list = [json.loads(line) for line in content.split("\n") if line.strip()]
            traces = self._parse_otlp_spans(spans_list)

        logger.info("Loaded %d trace(s) from %s", len(traces), source)
        return traces

    @staticmethod
    def _try_parse_export(content: str) -> dict | None:
        """Return the parsed document if *content* is a full OTLP export.

        A full export is a single JSON object with a ``resourceSpans`` key.
        Anything else (including text that is not one JSON document) yields
        None so the caller can fall back to JSONL parsing.
        """
        if not content.startswith("{"):
            return None
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            # Several JSON objects separated by newlines: JSONL, not an export.
            return None
        if isinstance(data, dict) and "resourceSpans" in data:
            return data
        return None

    def _parse_otlp_export(self, data: dict) -> list[Trace]:
        """Parse full OTLP export structure with resourceSpans."""
        all_spans = []

        for resource_span in data.get("resourceSpans") or []:
            resource = resource_span.get("resource") or {}
            resource_attrs = self._extract_attributes(resource.get("attributes"))
            for scope_span in resource_span.get("scopeSpans") or []:
                scope = scope_span.get("scope") or {}
                scope_name = scope.get("name", "")
                scope_version = scope.get("version", "")

                for span_data in scope_span.get("spans") or []:
                    span = self._parse_span(span_data, resource_attrs, scope_name, scope_version)
                    all_spans.append(span)

        return self._build_traces(all_spans)

    def _parse_otlp_spans(self, spans_data: list[dict]) -> list[Trace]:
        """Parse flat list of OTLP spans (JSONL format for streaming)."""
        all_spans = [self._parse_span(span_data, {}, "", "") for span_data in spans_data]
        return self._build_traces(all_spans)

    def _parse_span(
        self,
        span_data: dict,
        resource_attrs: dict,
        scope_name: str,
        scope_version: str,
    ) -> Span:
        """Convert OTLP span to normalized Span object.

        Args:
            span_data: One OTLP span object.
            resource_attrs: Already-flattened attributes of the enclosing
                resource (empty for JSONL input).
            scope_name: Instrumentation scope name, stored under OTEL_SCOPE
                when non-empty.
            scope_version: Instrumentation scope version, stored under
                OTEL_SCOPE_VERSION when non-empty.

        Returns:
            A Span whose start_time and duration are in microseconds.
        """
        attributes = self._extract_attributes(span_data.get("attributes"))

        if scope_name:
            attributes[OTEL_SCOPE] = scope_name
        if scope_version:
            attributes[OTEL_SCOPE_VERSION] = scope_version

        self._promote_genai_event_attributes(span_data, attributes)

        # Resource attributes are applied last, so on a key collision they
        # replace the span-level value.
        attributes.update(resource_attrs)

        # Timestamps arrive as decimal strings; ``or 0`` also covers an
        # explicit null.
        start_time_ns = int(span_data.get("startTimeUnixNano") or 0)
        end_time_ns = int(span_data.get("endTimeUnixNano") or 0)
        start_time_us = start_time_ns // 1000
        # A span without an end time (e.g. still in flight while streaming)
        # would otherwise get a huge negative duration; report zero instead.
        duration_us = max(end_time_ns - start_time_ns, 0) // 1000

        parent_span_id = span_data.get("parentSpanId") or None

        return Span(
            trace_id=span_data.get("traceId", ""),
            span_id=span_data.get("spanId", ""),
            parent_span_id=parent_span_id,
            operation_name=span_data.get("name", ""),
            start_time=start_time_us,
            duration=duration_us,
            tags=attributes,
        )

    def _promote_genai_event_attributes(self, span_data: dict, attributes: dict) -> None:
        """Promote gen_ai.input/output.messages from span events to attributes.

        Some SDKs (e.g. Strands) store message content in span events rather
        than span attributes. This promotes those values so the converter can
        find them via normal attribute lookups.

        Only string-valued event attributes are promoted, and a key that is
        already present in *attributes* is never overwritten. *attributes* is
        modified in place.
        """
        for event in span_data.get("events") or []:
            for attr in event.get("attributes") or []:
                key = attr.get("key", "")
                if key not in self._GENAI_EVENT_KEYS or key in attributes:
                    continue
                value_obj = attr.get("value") or {}
                if "stringValue" in value_obj:
                    attributes[key] = value_obj["stringValue"]

    def _extract_attributes(self, attrs_list: list[dict]) -> dict:
        """Convert OTLP attributes array to flat dict.

        OTLP attributes are [{key, value: {stringValue|intValue|...}}]
        We flatten to {key: value} for easier use.

        Array and kvlist values are kept as JSON text of their raw OTLP
        wrapper object. Attributes with any other (or no) value kind are
        dropped. A missing/null list yields an empty dict.
        """
        result = {}
        for attr in attrs_list or []:
            key = attr.get("key", "")
            value_obj = attr.get("value") or {}

            if "stringValue" in value_obj:
                result[key] = value_obj["stringValue"]
            elif "intValue" in value_obj:
                # int64 values may be encoded as strings in OTLP/JSON.
                result[key] = int(value_obj["intValue"])
            elif "doubleValue" in value_obj:
                result[key] = float(value_obj["doubleValue"])
            elif "boolValue" in value_obj:
                result[key] = value_obj["boolValue"]
            elif "arrayValue" in value_obj:
                result[key] = json.dumps(value_obj["arrayValue"])
            elif "kvlistValue" in value_obj:
                result[key] = json.dumps(value_obj["kvlistValue"])

        return result

    def _build_traces(self, all_spans: list[Span]) -> list[Trace]:
        """Group spans by trace_id and build parent-child relationships."""
        spans_by_trace: dict[str, list[Span]] = {}
        for span in all_spans:
            spans_by_trace.setdefault(span.trace_id, []).append(span)

        traces = []
        for trace_id, spans in spans_by_trace.items():
            spans_by_id = {s.span_id: s for s in spans}
            root_spans = []

            for span in spans:
                parent = spans_by_id.get(span.parent_span_id) if span.parent_span_id else None
                if parent is not None:
                    parent.children.append(span)
                else:
                    # No parent, or the parent was not part of the input.
                    root_spans.append(span)

            for span in spans:
                span.children.sort(key=lambda s: s.start_time)

            root_spans.sort(key=lambda s: s.start_time)

            traces.append(
                Trace(
                    trace_id=trace_id,
                    root_spans=root_spans,
                    all_spans=spans,
                )
            )

        return traces
@@ -0,0 +1,236 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tempfile
5
+ from typing import Any
6
+
7
+ import httpx
8
+ from mcp.server import FastMCP
9
+
10
+ from agentevals.config import EvalRunConfig
11
+ from agentevals.runner import run_evaluation
12
+
13
+ _DEFAULT_SERVER_URL = "http://localhost:8001"
14
+
15
+
16
def create_server(server_url: str | None = None) -> FastMCP:
    """Build the agentevals MCP server and register its tools.

    Args:
        server_url: Base URL of the agentevals HTTP server used by the
            server-backed tools. When omitted, the AGENTEVALS_SERVER_URL
            environment variable is used, falling back to _DEFAULT_SERVER_URL.
            A trailing slash is stripped.

    Returns:
        The FastMCP instance with all tools registered.
    """
    mcp = FastMCP("agentevals")
    _url = (server_url or os.environ.get("AGENTEVALS_SERVER_URL", _DEFAULT_SERVER_URL)).rstrip("/")

    def _unwrap(response_json: dict) -> Any:
        """Return the ``data`` payload, raising RuntimeError on an ``error`` field."""
        if response_json.get("error"):
            raise RuntimeError(f"API error: {response_json['error']}")
        return response_json["data"]

    async def _request(method: str, path: str, *, timeout: float, body: dict | None = None) -> Any:
        """Send one request to the agentevals server and unwrap the response.

        Connection failures and non-2xx responses are re-raised as
        RuntimeError with a message suitable for showing to the tool caller.
        """
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                r = await client.request(method, f"{_url}{path}", json=body)
                r.raise_for_status()
                return _unwrap(r.json())
        except httpx.ConnectError as exc:
            raise RuntimeError(
                f"Cannot reach agentevals server at {_url}. Start it with: uv run agentevals serve --dev"
            ) from exc
        except httpx.HTTPStatusError as exc:
            raise RuntimeError(f"Server error {exc.response.status_code}: {exc.response.text}") from exc

    async def _get(path: str) -> Any:
        """GET *path* from the server (30 second timeout)."""
        return await _request("GET", path, timeout=30)

    async def _post(path: str, body: dict) -> Any:
        """POST *body* as JSON to *path* on the server (60 second timeout)."""
        return await _request("POST", path, timeout=60, body=body)

    def _summarize_run_result(result) -> dict[str, Any]:
        """Flatten a run result into plain dicts with a top-level ``passed`` flag.

        ``passed`` is True when no metric of any trace has status "FAILED".
        Optional keys (error, warnings, errors) are only present when set.
        """
        traces = [
            {
                "trace_id": tr.trace_id,
                "num_invocations": tr.num_invocations,
                "metrics": [
                    {
                        "metric": mr.metric_name,
                        "score": mr.score,
                        "status": mr.eval_status,
                        **({"error": mr.error} if mr.error else {}),
                    }
                    for mr in tr.metric_results
                ],
                **({"warnings": tr.conversion_warnings} if tr.conversion_warnings else {}),
            }
            for tr in result.trace_results
        ]
        return {
            "passed": all(mr["status"] != "FAILED" for tr in traces for mr in tr["metrics"]),
            "traces": traces,
            **({"errors": result.errors} if result.errors else {}),
        }

    # NOTE: the docstrings, parameter names and annotations of the tool
    # functions below are what FastMCP exposes to clients; treat them as part
    # of the public interface.

    @mcp.tool()
    async def list_metrics() -> list[dict[str, Any]]:
        """List all available evaluation metrics with their descriptions and requirements."""
        return await _get("/api/metrics")

    @mcp.tool()
    async def evaluate_traces(
        trace_files: list[str],
        metrics: list[str] | None = None,
        trace_format: str = "jaeger-json",
        eval_set_file: str | None = None,
        judge_model: str | None = None,
        threshold: float | None = None,
        eval_config_file: str | None = None,
    ) -> dict[str, Any]:
        """Evaluate one or more local agent trace files.

        Does not require the agentevals server to be running. Returns a flat summary
        with a top-level 'passed' boolean and per-trace metric scores.

        Args:
            trace_files: Absolute paths to Jaeger JSON or OTLP JSON/JSONL trace files.
            metrics: Metric names to evaluate. Use list_metrics to see available options.
            trace_format: "jaeger-json" or "otlp-json".
            eval_set_file: Path to a golden eval set JSON for comparison metrics.
            judge_model: LLM model for judge-based metrics (e.g. "gemini-2.5-flash").
            threshold: Score threshold for PASS/FAIL classification (0.0–1.0).
            eval_config_file: Path to an eval config YAML file with custom evaluators.
        """
        if metrics is None:
            metrics = ["tool_trajectory_avg_score"]

        # Built once; when a config file is given it is merged with these
        # call arguments instead of constructing the same object twice.
        config = EvalRunConfig(
            trace_files=trace_files,
            metrics=metrics,
            trace_format=trace_format,
            eval_set_file=eval_set_file,
            judge_model=judge_model,
            threshold=threshold,
        )
        if eval_config_file:
            from agentevals.eval_config_loader import load_eval_config, merge_configs

            config = merge_configs(load_eval_config(eval_config_file), config)

        result = await run_evaluation(config)
        return _summarize_run_result(result)

    @mcp.tool()
    async def list_sessions(limit: int = 20) -> list[dict[str, Any]]:
        """List streaming trace sessions, most recent first.

        Requires agentevals serve to be running.

        Args:
            limit: Maximum number of sessions to return (default: 20).
        """
        sessions = await _get("/api/streaming/sessions")
        sessions.sort(key=lambda s: s.get("startedAt", ""), reverse=True)
        # Clamp so a negative limit returns nothing rather than "all but the
        # last N" through negative slicing.
        return [
            {
                "sessionId": s["sessionId"],
                "isComplete": s["isComplete"],
                "spanCount": s["spanCount"],
                "startedAt": s["startedAt"],
            }
            for s in sessions[: max(limit, 0)]
        ]

    @mcp.tool()
    async def summarize_session(session_id: str) -> dict[str, Any]:
        """Get a structured summary of a session's invocations, tool calls, and messages.

        Parses the raw trace and returns human-readable invocation data: user messages,
        agent responses, and tool calls made. For the full span data, use get_session_trace.

        Args:
            session_id: Session ID from list_sessions.
        """
        from agentevals.converter import convert_traces
        from agentevals.loader.otlp import OtlpJsonLoader

        raw = await _post("/api/streaming/get-trace", {"session_id": session_id})

        # The loader only accepts a path, so the trace goes through a
        # temporary file. delete=False lets the loader reopen it by name after
        # it is closed; the finally block removes it so repeated calls do not
        # accumulate files in the temp directory.
        tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
        try:
            with tmp:
                tmp.write(raw["traceContent"])
            traces = OtlpJsonLoader().load(tmp.name)
        finally:
            try:
                os.unlink(tmp.name)
            except OSError:
                # Best-effort cleanup; never mask the original outcome.
                pass

        if not traces:
            return {"session_id": session_id, "num_spans": raw["numSpans"], "invocations": []}

        invocations = []
        for conv in convert_traces(traces):
            for inv in conv.invocations:
                tool_calls = []
                if inv.intermediate_data:
                    tool_calls = [
                        {"tool": tu.name, "args": getattr(tu, "args", {})} for tu in inv.intermediate_data.tool_uses
                    ]
                # Use the first non-empty text part of each message, if any.
                user_text = next((p.text for p in inv.user_content.parts if p.text), "") if inv.user_content else ""
                response_text = (
                    next((p.text for p in inv.final_response.parts if p.text), "") if inv.final_response else ""
                )
                invocations.append(
                    {
                        "user": user_text,
                        "response": response_text,
                        "tool_calls": tool_calls,
                    }
                )

        return {
            "session_id": session_id,
            "num_spans": raw["numSpans"],
            "num_invocations": len(invocations),
            "invocations": invocations,
        }

    @mcp.tool()
    async def evaluate_sessions(
        golden_session_id: str,
        metrics: list[str] | None = None,
        judge_model: str = "gemini-2.5-flash",
        eval_set_id: str | None = None,
    ) -> dict[str, Any]:
        """Evaluate all completed sessions against a golden reference session.

        The server builds the eval set from the golden session automatically — no file
        creation or pre-existing eval set needed. Call list_sessions first to find session IDs.

        Requires agentevals serve to be running.

        Args:
            golden_session_id: Session ID of the reference/golden run.
            metrics: Metric names to evaluate. Use list_metrics to see available options.
            judge_model: LLM model for judge-based metrics.
            eval_set_id: A label for the eval set built from the golden session. You can use
                any string or omit it — a default will be generated automatically.
        """
        if metrics is None:
            metrics = ["tool_trajectory_avg_score"]
        return await _post(
            "/api/streaming/evaluate-sessions",
            {
                "golden_session_id": golden_session_id,
                "eval_set_id": eval_set_id or f"eval-{golden_session_id[:12]}",
                "metrics": metrics,
                "judge_model": judge_model,
            },
        )

    return mcp