PyPI - agentevals-cli - Versions diffs - 0.5.2__py3-none-any.whl - Mend

agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

agentevals/__init__.py +16 -0
agentevals/_protocol.py +83 -0
agentevals/api/__init__.py +0 -0
agentevals/api/app.py +137 -0
agentevals/api/debug_routes.py +268 -0
agentevals/api/models.py +204 -0
agentevals/api/otlp_app.py +25 -0
agentevals/api/otlp_routes.py +383 -0
agentevals/api/routes.py +554 -0
agentevals/api/streaming_routes.py +373 -0
agentevals/builtin_metrics.py +234 -0
agentevals/cli.py +643 -0
agentevals/config.py +108 -0
agentevals/converter.py +328 -0
agentevals/custom_evaluators.py +468 -0
agentevals/eval_config_loader.py +147 -0
agentevals/evaluator/__init__.py +24 -0
agentevals/evaluator/resolver.py +70 -0
agentevals/evaluator/sources.py +293 -0
agentevals/evaluator/templates.py +224 -0
agentevals/extraction.py +444 -0
agentevals/genai_converter.py +538 -0
agentevals/loader/__init__.py +7 -0
agentevals/loader/base.py +53 -0
agentevals/loader/jaeger.py +112 -0
agentevals/loader/otlp.py +193 -0
agentevals/mcp_server.py +236 -0
agentevals/output.py +204 -0
agentevals/runner.py +310 -0
agentevals/sdk.py +433 -0
agentevals/streaming/__init__.py +120 -0
agentevals/streaming/incremental_processor.py +337 -0
agentevals/streaming/processor.py +285 -0
agentevals/streaming/session.py +36 -0
agentevals/streaming/ws_server.py +806 -0
agentevals/trace_attrs.py +32 -0
agentevals/trace_metrics.py +126 -0
agentevals/utils/__init__.py +0 -0
agentevals/utils/genai_messages.py +142 -0
agentevals/utils/log_buffer.py +43 -0
agentevals/utils/log_enrichment.py +187 -0
agentevals_cli-0.5.2.dist-info/METADATA +22 -0
agentevals_cli-0.5.2.dist-info/RECORD +46 -0
agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0

agentevals/loader/jaeger.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""Jaeger JSON trace loader."""
+from __future__ import annotations
+import json
+import logging
+from typing import Any
+from .base import Span, Trace, TraceLoader
+logger = logging.getLogger(__name__)
+class JaegerJsonLoader(TraceLoader):
+    """Loads traces from Jaeger JSON export files.
+    Expected format::
+        {
+            "data": [
+                {
+                    "traceID": "...",
+                    "spans": [
+                        {
+                            "traceID": "...",
+                            "spanID": "...",
+                            "operationName": "...",
+                            "references": [{"refType": "CHILD_OF", "spanID": "..."}],
+                            "startTime": <microseconds>,
+                            "duration": <microseconds>,
+                            "tags": [{"key": "...", "type": "...", "value": ...}],
+                            ...
+                        },
+                        ...
+                    ]
+                },
+                ...
+            ]
+        }
+    """
+    def format_name(self) -> str:
+        return "jaeger-json"
+    def load(self, source: str) -> list[Trace]:
+        with open(source) as f:
+            raw = json.load(f)
+        if not isinstance(raw, dict) or "data" not in raw:
+            raise ValueError(f"Invalid Jaeger JSON format: expected top-level 'data' key in {source}")
+        traces: list[Trace] = []
+        for trace_data in raw["data"]:
+            trace = self._parse_trace(trace_data)
+            if trace:
+                traces.append(trace)
+        logger.info("Loaded %d trace(s) from %s", len(traces), source)
+        return traces
+    def _parse_trace(self, trace_data: dict[str, Any]) -> Trace | None:
+        trace_id = trace_data.get("traceID", "")
+        raw_spans = trace_data.get("spans", [])
+        if not raw_spans:
+            logger.warning("Trace %s has no spans, skipping", trace_id)
+            return None
+        spans_by_id: dict[str, Span] = {}
+        for raw_span in raw_spans:
+            span = self._parse_span(raw_span)
+            spans_by_id[span.span_id] = span
+        root_spans: list[Span] = []
+        for span in spans_by_id.values():
+            if span.parent_span_id and span.parent_span_id in spans_by_id:
+                spans_by_id[span.parent_span_id].children.append(span)
+            else:
+                root_spans.append(span)
+        for span in spans_by_id.values():
+            span.children.sort(key=lambda s: s.start_time)
+        root_spans.sort(key=lambda s: s.start_time)
+        return Trace(
+            trace_id=trace_id,
+            root_spans=root_spans,
+            all_spans=list(spans_by_id.values()),
+        )
+    def _parse_span(self, raw_span: dict[str, Any]) -> Span:
+        parent_span_id: str | None = None
+        for ref in raw_span.get("references", []):
+            if ref.get("refType") == "CHILD_OF":
+                parent_span_id = ref.get("spanID")
+                break
+        # Jaeger tags are an array of {key, type, value} — flatten to dict
+        tags: dict[str, Any] = {}
+        for tag in raw_span.get("tags", []):
+            tags[tag["key"]] = tag["value"]
+        return Span(
+            trace_id=raw_span.get("traceID", ""),
+            span_id=raw_span.get("spanID", ""),
+            parent_span_id=parent_span_id,
+            operation_name=raw_span.get("operationName", ""),
+            start_time=raw_span.get("startTime", 0),
+            duration=raw_span.get("duration", 0),
+            tags=tags,
+        )

agentevals/loader/otlp.py ADDED Viewed

@@ -0,0 +1,193 @@
+"""OTLP/JSON trace loader for native OpenTelemetry format."""
+from __future__ import annotations
+import json
+import logging
+from ..trace_attrs import (
+    OTEL_GENAI_INPUT_MESSAGES,
+    OTEL_GENAI_OUTPUT_MESSAGES,
+    OTEL_SCOPE,
+    OTEL_SCOPE_VERSION,
+)
+from .base import Span, Trace, TraceLoader
+logger = logging.getLogger(__name__)
+class OtlpJsonLoader(TraceLoader):
+    """Loads traces from OTLP/JSON format (native OpenTelemetry format).
+    Supports two formats:
+    1. Full OTLP export with resourceSpans structure
+    2. JSONL format - one span per line (for streaming use cases)
+    OTLP uses nanosecond timestamps - these are converted to microseconds
+    to match the internal Span representation.
+    """
+    def format_name(self) -> str:
+        return "otlp-json"
+    def load(self, source: str) -> list[Trace]:
+        """Load OTLP JSON file or JSONL (one span per line)."""
+        with open(source) as f:
+            content = f.read().strip()
+        if not content:
+            logger.warning("Empty trace file: %s", source)
+            return []
+        if content.startswith("{"):
+            try:
+                data = json.loads(content)
+                if "resourceSpans" in data:
+                    traces = self._parse_otlp_export(data)
+                else:
+                    raise ValueError("Not a full OTLP export, trying JSONL")
+            except (json.JSONDecodeError, ValueError):
+                spans_list = [json.loads(line) for line in content.split("\n") if line.strip()]
+                traces = self._parse_otlp_spans(spans_list)
+        else:
+            spans_list = [json.loads(line) for line in content.split("\n") if line.strip()]
+            traces = self._parse_otlp_spans(spans_list)
+        logger.info("Loaded %d trace(s) from %s", len(traces), source)
+        return traces
+    def _parse_otlp_export(self, data: dict) -> list[Trace]:
+        """Parse full OTLP export structure with resourceSpans."""
+        all_spans = []
+        for resource_span in data.get("resourceSpans", []):
+            resource_attrs = self._extract_attributes(resource_span.get("resource", {}).get("attributes", []))
+            for scope_span in resource_span.get("scopeSpans", []):
+                scope = scope_span.get("scope", {})
+                scope_name = scope.get("name", "")
+                scope_version = scope.get("version", "")
+                for span_data in scope_span.get("spans", []):
+                    span = self._parse_span(span_data, resource_attrs, scope_name, scope_version)
+                    all_spans.append(span)
+        return self._build_traces(all_spans)
+    def _parse_otlp_spans(self, spans_data: list[dict]) -> list[Trace]:
+        """Parse flat list of OTLP spans (JSONL format for streaming)."""
+        all_spans = [self._parse_span(span_data, {}, "", "") for span_data in spans_data]
+        return self._build_traces(all_spans)
+    _GENAI_EVENT_KEYS = {OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES}
+    def _parse_span(
+        self,
+        span_data: dict,
+        resource_attrs: dict,
+        scope_name: str,
+        scope_version: str,
+    ) -> Span:
+        """Convert OTLP span to normalized Span object."""
+        attributes = self._extract_attributes(span_data.get("attributes", []))
+        if scope_name:
+            attributes[OTEL_SCOPE] = scope_name
+        if scope_version:
+            attributes[OTEL_SCOPE_VERSION] = scope_version
+        self._promote_genai_event_attributes(span_data, attributes)
+        attributes.update(resource_attrs)
+        start_time_ns = int(span_data.get("startTimeUnixNano", "0"))
+        end_time_ns = int(span_data.get("endTimeUnixNano", "0"))
+        start_time_us = start_time_ns // 1000
+        duration_us = (end_time_ns - start_time_ns) // 1000
+        parent_span_id = span_data.get("parentSpanId") or None
+        return Span(
+            trace_id=span_data.get("traceId", ""),
+            span_id=span_data.get("spanId", ""),
+            parent_span_id=parent_span_id,
+            operation_name=span_data.get("name", ""),
+            start_time=start_time_us,
+            duration=duration_us,
+            tags=attributes,
+        )
+    def _promote_genai_event_attributes(self, span_data: dict, attributes: dict) -> None:
+        """Promote gen_ai.input/output.messages from span events to attributes.
+        Some SDKs (e.g. Strands) store message content in span events rather
+        than span attributes. This promotes those values so the converter can
+        find them via normal attribute lookups.
+        """
+        for event in span_data.get("events", []):
+            for attr in event.get("attributes", []):
+                key = attr.get("key", "")
+                if key in self._GENAI_EVENT_KEYS and key not in attributes:
+                    value_obj = attr.get("value", {})
+                    if "stringValue" in value_obj:
+                        attributes[key] = value_obj["stringValue"]
+    def _extract_attributes(self, attrs_list: list[dict]) -> dict:
+        """Convert OTLP attributes array to flat dict.
+        OTLP attributes are [{key, value: {stringValue|intValue|...}}]
+        We flatten to {key: value} for easier use.
+        """
+        result = {}
+        for attr in attrs_list:
+            key = attr.get("key", "")
+            value_obj = attr.get("value", {})
+            if "stringValue" in value_obj:
+                result[key] = value_obj["stringValue"]
+            elif "intValue" in value_obj:
+                result[key] = int(value_obj["intValue"])
+            elif "doubleValue" in value_obj:
+                result[key] = float(value_obj["doubleValue"])
+            elif "boolValue" in value_obj:
+                result[key] = value_obj["boolValue"]
+            elif "arrayValue" in value_obj:
+                result[key] = json.dumps(value_obj["arrayValue"])
+            elif "kvlistValue" in value_obj:
+                result[key] = json.dumps(value_obj["kvlistValue"])
+        return result
+    def _build_traces(self, all_spans: list[Span]) -> list[Trace]:
+        """Group spans by trace_id and build parent-child relationships."""
+        traces_by_id: dict[str, list[Span]] = {}
+        for span in all_spans:
+            if span.trace_id not in traces_by_id:
+                traces_by_id[span.trace_id] = []
+            traces_by_id[span.trace_id].append(span)
+        traces = []
+        for trace_id, spans in traces_by_id.items():
+            spans_by_id = {s.span_id: s for s in spans}
+            root_spans = []
+            for span in spans:
+                if span.parent_span_id and span.parent_span_id in spans_by_id:
+                    spans_by_id[span.parent_span_id].children.append(span)
+                else:
+                    root_spans.append(span)
+            for span in spans:
+                span.children.sort(key=lambda s: s.start_time)
+            root_spans.sort(key=lambda s: s.start_time)
+            traces.append(
+                Trace(
+                    trace_id=trace_id,
+                    root_spans=root_spans,
+                    all_spans=spans,
+                )
+            )
+        return traces

agentevals/mcp_server.py ADDED Viewed

@@ -0,0 +1,236 @@
+from __future__ import annotations
+import os
+import tempfile
+from typing import Any
+import httpx
+from mcp.server import FastMCP
+from agentevals.config import EvalRunConfig
+from agentevals.runner import run_evaluation
+_DEFAULT_SERVER_URL = "http://localhost:8001"
+def create_server(server_url: str | None = None) -> FastMCP:
+    mcp = FastMCP("agentevals")
+    _url = (server_url or os.environ.get("AGENTEVALS_SERVER_URL", _DEFAULT_SERVER_URL)).rstrip("/")
+    def _unwrap(response_json: dict) -> Any:
+        if response_json.get("error"):
+            raise RuntimeError(f"API error: {response_json['error']}")
+        return response_json["data"]
+    async def _get(path: str) -> Any:
+        try:
+            async with httpx.AsyncClient(timeout=30) as client:
+                r = await client.get(f"{_url}{path}")
+                r.raise_for_status()
+                return _unwrap(r.json())
+        except httpx.ConnectError as exc:
+            raise RuntimeError(
+                f"Cannot reach agentevals server at {_url}. Start it with: uv run agentevals serve --dev"
+            ) from exc
+        except httpx.HTTPStatusError as exc:
+            raise RuntimeError(f"Server error {exc.response.status_code}: {exc.response.text}") from exc
+    async def _post(path: str, body: dict) -> Any:
+        try:
+            async with httpx.AsyncClient(timeout=60) as client:
+                r = await client.post(f"{_url}{path}", json=body)
+                r.raise_for_status()
+                return _unwrap(r.json())
+        except httpx.ConnectError as exc:
+            raise RuntimeError(
+                f"Cannot reach agentevals server at {_url}. Start it with: uv run agentevals serve --dev"
+            ) from exc
+        except httpx.HTTPStatusError as exc:
+            raise RuntimeError(f"Server error {exc.response.status_code}: {exc.response.text}") from exc
+    def _summarize_run_result(result) -> dict[str, Any]:
+        traces = []
+        for tr in result.trace_results:
+            traces.append(
+                {
+                    "trace_id": tr.trace_id,
+                    "num_invocations": tr.num_invocations,
+                    "metrics": [
+                        {
+                            "metric": mr.metric_name,
+                            "score": mr.score,
+                            "status": mr.eval_status,
+                            **({"error": mr.error} if mr.error else {}),
+                        }
+                        for mr in tr.metric_results
+                    ],
+                    **({"warnings": tr.conversion_warnings} if tr.conversion_warnings else {}),
+                }
+            )
+        return {
+            "passed": all(mr["status"] != "FAILED" for tr in traces for mr in tr["metrics"]),
+            "traces": traces,
+            **({"errors": result.errors} if result.errors else {}),
+        }
+    @mcp.tool()
+    async def list_metrics() -> list[dict[str, Any]]:
+        """List all available evaluation metrics with their descriptions and requirements."""
+        return await _get("/api/metrics")
+    @mcp.tool()
+    async def evaluate_traces(
+        trace_files: list[str],
+        metrics: list[str] | None = None,
+        trace_format: str = "jaeger-json",
+        eval_set_file: str | None = None,
+        judge_model: str | None = None,
+        threshold: float | None = None,
+        eval_config_file: str | None = None,
+    ) -> dict[str, Any]:
+        """Evaluate one or more local agent trace files.
+        Does not require the agentevals server to be running. Returns a flat summary
+        with a top-level 'passed' boolean and per-trace metric scores.
+        Args:
+            trace_files: Absolute paths to Jaeger JSON or OTLP JSON/JSONL trace files.
+            metrics: Metric names to evaluate. Use list_metrics to see available options.
+            trace_format: "jaeger-json" or "otlp-json".
+            eval_set_file: Path to a golden eval set JSON for comparison metrics.
+            judge_model: LLM model for judge-based metrics (e.g. "gemini-2.5-flash").
+            threshold: Score threshold for PASS/FAIL classification (0.0–1.0).
+            eval_config_file: Path to an eval config YAML file with custom evaluators.
+        """
+        if metrics is None:
+            metrics = ["tool_trajectory_avg_score"]
+        if eval_config_file:
+            from agentevals.eval_config_loader import load_eval_config, merge_configs
+            file_config = load_eval_config(eval_config_file)
+            cli_config = EvalRunConfig(
+                trace_files=trace_files,
+                metrics=metrics,
+                trace_format=trace_format,
+                eval_set_file=eval_set_file,
+                judge_model=judge_model,
+                threshold=threshold,
+            )
+            config = merge_configs(file_config, cli_config)
+        else:
+            config = EvalRunConfig(
+                trace_files=trace_files,
+                metrics=metrics,
+                trace_format=trace_format,
+                eval_set_file=eval_set_file,
+                judge_model=judge_model,
+                threshold=threshold,
+            )
+        result = await run_evaluation(config)
+        return _summarize_run_result(result)
+    @mcp.tool()
+    async def list_sessions(limit: int = 20) -> list[dict[str, Any]]:
+        """List streaming trace sessions, most recent first.
+        Requires agentevals serve to be running.
+        Args:
+            limit: Maximum number of sessions to return (default: 20).
+        """
+        sessions = await _get("/api/streaming/sessions")
+        sessions.sort(key=lambda s: s.get("startedAt", ""), reverse=True)
+        return [
+            {
+                "sessionId": s["sessionId"],
+                "isComplete": s["isComplete"],
+                "spanCount": s["spanCount"],
+                "startedAt": s["startedAt"],
+            }
+            for s in sessions[:limit]
+        ]
+    @mcp.tool()
+    async def summarize_session(session_id: str) -> dict[str, Any]:
+        """Get a structured summary of a session's invocations, tool calls, and messages.
+        Parses the raw trace and returns human-readable invocation data: user messages,
+        agent responses, and tool calls made. For the full span data, use get_session_trace.
+        Args:
+            session_id: Session ID from list_sessions.
+        """
+        from agentevals.converter import convert_traces
+        from agentevals.loader.otlp import OtlpJsonLoader
+        raw = await _post("/api/streaming/get-trace", {"session_id": session_id})
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            f.write(raw["traceContent"])
+            tmp_path = f.name
+        traces = OtlpJsonLoader().load(tmp_path)
+        if not traces:
+            return {"session_id": session_id, "num_spans": raw["numSpans"], "invocations": []}
+        invocations = []
+        for conv in convert_traces(traces):
+            for inv in conv.invocations:
+                tool_calls = []
+                if inv.intermediate_data:
+                    tool_calls = [
+                        {"tool": tu.name, "args": getattr(tu, "args", {})} for tu in inv.intermediate_data.tool_uses
+                    ]
+                invocations.append(
+                    {
+                        "user": next((p.text for p in inv.user_content.parts if p.text), "")
+                        if inv.user_content
+                        else "",
+                        "response": next((p.text for p in inv.final_response.parts if p.text), "")
+                        if inv.final_response
+                        else "",
+                        "tool_calls": tool_calls,
+                    }
+                )
+        return {
+            "session_id": session_id,
+            "num_spans": raw["numSpans"],
+            "num_invocations": len(invocations),
+            "invocations": invocations,
+        }
+    @mcp.tool()
+    async def evaluate_sessions(
+        golden_session_id: str,
+        metrics: list[str] | None = None,
+        judge_model: str = "gemini-2.5-flash",
+        eval_set_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Evaluate all completed sessions against a golden reference session.
+        The server builds the eval set from the golden session automatically — no file
+        creation or pre-existing eval set needed. Call list_sessions first to find session IDs.
+        Requires agentevals serve to be running.
+        Args:
+            golden_session_id: Session ID of the reference/golden run.
+            metrics: Metric names to evaluate. Use list_metrics to see available options.
+            judge_model: LLM model for judge-based metrics.
+            eval_set_id: A label for the eval set built from the golden session. You can use
+                         any string or omit it — a default will be generated automatically.
+        """
+        if metrics is None:
+            metrics = ["tool_trajectory_avg_score"]
+        return await _post(
+            "/api/streaming/evaluate-sessions",
+            {
+                "golden_session_id": golden_session_id,
+                "eval_set_id": eval_set_id or f"eval-{golden_session_id[:12]}",
+                "metrics": metrics,
+                "judge_model": judge_model,
+            },
+        )
+    return mcp