agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """agentevals: Standalone CLI to evaluate agent traces using ADK's scoring framework."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Resolve the installed version from distribution metadata.
#
# The import package is ``agentevals`` but this wheel is published under the
# distribution name ``agentevals-cli``; querying only ``agentevals`` would
# always fall through to the development placeholder for wheel installs.
# ``agentevals`` is still probed first so any existing setup that resolves
# under that name keeps producing the same value.
__version__ = "0.0.0-dev"
for _dist_name in ("agentevals", "agentevals-cli"):
    try:
        __version__ = version(_dist_name)
    except PackageNotFoundError:
        continue
    break
9
+
10
+
11
def __getattr__(name):
    """Module-level attribute hook that imports ``AgentEvals`` on demand.

    Only the name ``"AgentEvals"`` is resolved (by importing it from the
    sibling ``sdk`` module at call time); every other name raises
    ``AttributeError`` with the standard "module has no attribute" message.
    """
    if name != "AgentEvals":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred import: ``.sdk`` is only loaded when the attribute is requested.
    from .sdk import AgentEvals

    return AgentEvals
agentevals/_protocol.py ADDED
@@ -0,0 +1,83 @@
1
+ """CLI-internal protocol types for the custom evaluator JSON wire format.
2
+
3
+ These mirror the types in ``agentevals_evaluator_sdk.types`` but are owned by
4
+ the CLI so that the CLI and SDK packages can be versioned independently. The
5
+ JSON schema produced/consumed by these models is the contract — not the Python
6
+ types themselves.
7
+
8
+ Protocol versioning rules:
9
+ - ``protocol_version`` uses ``"MAJOR.MINOR"`` format.
10
+ - MINOR bumps are additive-only (new fields with defaults). Old deserializers
11
+ silently ignore unknown fields.
12
+ - MAJOR bumps signal breaking changes (removed/renamed fields, type changes).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from enum import Enum
18
+ from typing import Any, Optional
19
+
20
+ from pydantic import BaseModel, Field
21
+
22
# Version string stamped into every EvalInput payload ("MAJOR.MINOR").
PROTOCOL_VERSION = "1.0"


class ToolCallData(BaseModel):
    """A single tool call made by the agent."""

    # Tool identifier (required).
    name: str
    # Call arguments; a fresh empty dict per instance when omitted.
    args: dict[str, Any] = Field(default_factory=dict)


class ToolResponseData(BaseModel):
    """A single tool response received by the agent."""

    # Tool identifier (required).
    name: str
    # Tool output as text; empty string when omitted.
    output: str = ""


class IntermediateStepData(BaseModel):
    """Intermediate steps between user input and final response."""

    # Both lists default to fresh empty lists per instance.
    tool_calls: list[ToolCallData] = Field(default_factory=list)
    tool_responses: list[ToolResponseData] = Field(default_factory=list)


class InvocationData(BaseModel):
    """Simplified, language-agnostic representation of a single agent turn."""

    invocation_id: str = ""
    user_content: str = ""
    # None when no final response is present.
    final_response: Optional[str] = None
    # Defaults to an IntermediateStepData with empty call/response lists.
    intermediate_steps: IntermediateStepData = Field(default_factory=IntermediateStepData)


class EvalInput(BaseModel):
    """Input payload sent to a custom evaluator on stdin."""

    # Defaults to the module-level PROTOCOL_VERSION.
    protocol_version: str = PROTOCOL_VERSION
    # The only required field of the payload.
    metric_name: str
    threshold: float = 0.5
    config: dict[str, Any] = Field(default_factory=dict)
    invocations: list[InvocationData] = Field(default_factory=list)
    # Optional reference invocations; None when not supplied.
    expected_invocations: Optional[list[InvocationData]] = None
64
+
65
+
66
class EvalStatus(str, Enum):
    """Allowed ``status`` values on the evaluator JSON wire format (matches evaluator-sdk)."""

    # Mixing in ``str`` makes each member compare equal to its literal value,
    # so members serialize to / parse from the plain strings below.
    PASSED = "PASSED"
    FAILED = "FAILED"
    NOT_EVALUATED = "NOT_EVALUATED"
72
+
73
+
74
class EvalResult(BaseModel):
    """Output payload expected from a custom evaluator on stdout."""

    # Required; validation constrains it to the closed interval [0.0, 1.0].
    score: float = Field(ge=0.0, le=1.0)
    # Optional explicit verdict; see the field description for the fallback.
    status: Optional[EvalStatus] = Field(
        default=None,
        description="Derived from score vs threshold if omitted.",
    )
    # Entries may be None; defaults to a fresh empty list.
    per_invocation_scores: list[Optional[float]] = Field(default_factory=list)
    # Free-form extra information; None when omitted.
    details: Optional[dict[str, Any]] = None
agentevals/api/__init__.py
File without changes
agentevals/api/app.py ADDED
@@ -0,0 +1,137 @@
1
+ """FastAPI application for agentevals REST API."""
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import os
7
+ from contextlib import asynccontextmanager
8
+ from pathlib import Path
9
+
10
+ from fastapi import FastAPI
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import StreamingResponse
13
+
14
+ from agentevals import __version__
15
+
16
+ from ..utils.log_buffer import log_buffer
17
+ from .debug_routes import debug_router
18
+ from .debug_routes import set_trace_manager as set_debug_trace_manager
19
+ from .routes import router
20
+
21
# python-dotenv is optional: when it is importable, load variables from a
# ``.env`` file located four directory levels above this module (if that file
# exists). When the package is missing, carry on without it.
try:
    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent.parent.parent.joinpath(".env")
    if env_path.exists():
        load_dotenv(env_path)
except ImportError:
    pass
29
+
30
+
31
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Configure logging and manage the trace manager around the app's lifetime.

    Startup:
      * read ``AGENTEVALS_LOG_LEVEL`` (default ``INFO``) and reconfigure the
        root logger with it (``force=True`` replaces existing root handlers);
      * set the same level on the ``agentevals`` logger and attach the
        in-memory ``log_buffer`` handler once;
      * start the trace manager's cleanup task when a manager exists.

    Shutdown (after the ``yield``):
      * shut the trace manager down when one exists;
      * detach ``log_buffer`` from the ``agentevals`` logger.
    """
    log_level_str = os.getenv("AGENTEVALS_LOG_LEVEL", "INFO").upper()
    log_level = getattr(logging, log_level_str, logging.INFO)
    # ``logging`` also exposes upper-case attributes that are not levels (for
    # example ``BASIC_FORMAT`` is a format string). Passing such a value to
    # basicConfig()/setLevel() raises and would abort server startup, so only
    # genuine numeric levels are accepted; anything else falls back to INFO.
    if not isinstance(log_level, int):
        log_level = logging.INFO

    logging.basicConfig(
        level=log_level,
        format="%(levelname)s:%(name)s:%(message)s",
        force=True,
    )
    ae_logger = logging.getLogger("agentevals")
    ae_logger.setLevel(log_level)
    # Guard against attaching the shared buffer twice if lifespan runs again.
    if log_buffer not in ae_logger.handlers:
        log_buffer.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
        ae_logger.addHandler(log_buffer)
    if _trace_manager:
        _trace_manager.start_cleanup_task()
    yield
    if _trace_manager:
        await _trace_manager.shutdown()
    ae_logger.removeHandler(log_buffer)
51
+
52
+
53
# The ASGI application; startup/shutdown work is delegated to ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="agentevals API",
    version=__version__,
    description="REST API for evaluating agent traces using ADK's scoring framework",
)

# CORS: credentialed requests are allowed from the two localhost origins
# below, with any method and header; all response headers are exposed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://localhost:5174"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["*"],
)

# Core REST routes live under /api, debug routes under /api/debug.
app.include_router(router, prefix="/api")
app.include_router(debug_router, prefix="/api/debug")

# Live mode is opt-in via AGENTEVALS_LIVE=1; the manager stays None otherwise.
_live_mode = os.environ.get("AGENTEVALS_LIVE") == "1"
_trace_manager = None
74
+
75
if _live_mode:
    from fastapi import WebSocket

    from ..streaming.ws_server import StreamingTraceManager
    from .streaming_routes import set_trace_manager, streaming_router

    app.include_router(streaming_router, prefix="/api/streaming")

    # One manager instance is shared by this module, the streaming routes and
    # the debug routes.
    _trace_manager = StreamingTraceManager()
    set_trace_manager(_trace_manager)
    set_debug_trace_manager(_trace_manager)

    @app.websocket("/ws/traces")
    async def websocket_endpoint(websocket: WebSocket):
        """Hand the incoming WebSocket connection to the trace manager."""
        await _trace_manager.handle_connection(websocket)

    @app.get("/stream/ui-updates")
    async def ui_updates_stream():
        """Stream UI events to the client as Server-Sent Events."""

        async def event_generator():
            # Each client gets its own queue; a ``None`` item ends the stream.
            queue = _trace_manager.register_sse_client()
            try:
                while (event := await queue.get()) is not None:
                    yield f"data: {json.dumps(event)}\n\n"
            except asyncio.CancelledError:
                # Cancellation ends the stream without propagating.
                pass
            finally:
                # Always release the queue, however the stream ended.
                _trace_manager.unregister_sse_client(queue)

        sse_headers = {
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
        return StreamingResponse(
            event_generator(),
            media_type="text/event-stream",
            headers=sse_headers,
        )
113
+
114
+
115
def get_trace_manager():
    """Return the module-level trace manager (``None`` unless live mode created one)."""
    return _trace_manager
117
+
118
+
119
# Bundled web UI: served only when the package ships a built ``_static``
# directory containing ``index.html`` and AGENTEVALS_HEADLESS is not set.
_static_dir = Path(__file__).parent.parent / "_static"
_has_ui = _static_dir.is_dir() and (_static_dir / "index.html").exists()

if _has_ui and not os.getenv("AGENTEVALS_HEADLESS"):
    from fastapi.responses import FileResponse
    from fastapi.staticfiles import StaticFiles

    app.mount("/assets", StaticFiles(directory=_static_dir / "assets"), name="ui-assets")

    # Canonical root used to confine fallback file lookups (see spa_fallback).
    _static_root = _static_dir.resolve()

    @app.get("/")
    async def root():
        """Serve the single-page app's entry document."""
        return FileResponse(_static_dir / "index.html")

    @app.get("/{path:path}")
    async def spa_fallback(path: str):
        """Serve a file from the static directory, else the SPA entry document.

        ``path`` comes straight from the request URL, so it is untrusted: it
        may contain ``..`` segments or be absolute (joining an absolute path
        discards ``_static_dir`` entirely). The candidate is therefore
        resolved and served only when it still lies inside the static root;
        everything else gets ``index.html`` so client-side routing can handle
        the URL.
        """
        candidate = (_static_dir / path).resolve()
        if candidate.is_relative_to(_static_root) and candidate.is_file():
            return FileResponse(candidate)
        return FileResponse(_static_dir / "index.html")
agentevals/api/debug_routes.py ADDED
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ import glob
4
+ import importlib.metadata
5
+ import io
6
+ import json
7
+ import logging
8
+ import os
9
+ import platform
10
+ import sys
11
+ import tempfile
12
+ import zipfile
13
+ from datetime import UTC, datetime
14
+ from typing import TYPE_CHECKING
15
+
16
+ from fastapi import APIRouter, HTTPException, UploadFile
17
+ from fastapi import File as FastAPIFile
18
+ from fastapi.responses import StreamingResponse
19
+ from pydantic import BaseModel
20
+
21
+ from agentevals import __version__
22
+
23
+ from ..utils.log_buffer import log_buffer
24
+ from .models import DebugLoadData, SessionInfo, StandardResponse, WSSessionCompleteEvent, WSSessionStartedEvent
25
+
26
+ if TYPE_CHECKING:
27
+ from ..streaming.ws_server import StreamingTraceManager
28
+
29
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Router carrying the debug endpoints defined in this module.
debug_router = APIRouter()

# Shared trace manager; stays None until set_trace_manager() is called.
_trace_manager: StreamingTraceManager | None = None
34
+
35
+
36
def set_trace_manager(manager: StreamingTraceManager) -> None:
    """Store ``manager`` as this module's shared trace manager.

    Rebinds the module-level ``_trace_manager`` that the debug helpers and
    endpoints in this module read.
    """
    global _trace_manager
    _trace_manager = manager
39
+
40
+
41
# Request body for the debug bundle endpoint: diagnostics supplied by the
# frontend. Every field is optional and defaults to an empty value.
class FrontendDiagnostics(BaseModel):
    # Free-text description entered by the user.
    user_description: str = ""
    # Written into the bundle's metadata.json.
    browser_info: dict = {}
    # Written to console_logs.json in the bundle.
    console_logs: list[dict] = []
    # Written to frontend_state.json in the bundle.
    app_state: dict = {}
    # Written to network_errors.json in the bundle.
    network_errors: list[dict] = []
47
+
48
+
49
def _get_package_version(name: str) -> str | None:
    """Return the installed version of distribution ``name``.

    Returns ``None`` when no distribution with that name is installed.
    """
    try:
        installed = importlib.metadata.version(name)
    except importlib.metadata.PackageNotFoundError:
        installed = None
    return installed
54
+
55
+
56
def _collect_environment() -> dict:
    """Build a snapshot of the runtime environment for a debug bundle.

    The snapshot holds a UTC timestamp, the agentevals / Python / OS versions,
    installed versions of a fixed list of packages (``None`` when a package is
    absent), the log-level and live-mode settings, and booleans saying whether
    each provider API key is set. Key values themselves are never included.
    """
    captured_at = datetime.now(tz=UTC).isoformat()

    tracked_packages = (
        "fastapi",
        "uvicorn",
        "google-adk",
        "google-genai",
        "opentelemetry-sdk",
        "opentelemetry-api",
        "pydantic",
    )
    package_versions = {dist: _get_package_version(dist) for dist in tracked_packages}

    config = {
        "log_level": os.getenv("AGENTEVALS_LOG_LEVEL", "INFO"),
        "live_mode": os.getenv("AGENTEVALS_LIVE") == "1",
    }

    # Presence flags only: either Google variable counts as "google".
    api_keys = {
        "google": bool(os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")),
        "anthropic": bool(os.getenv("ANTHROPIC_API_KEY")),
        "openai": bool(os.getenv("OPENAI_API_KEY")),
    }

    return {
        "timestamp": captured_at,
        "agentevals_version": __version__,
        "python_version": sys.version,
        "os": platform.system(),
        "os_version": platform.release(),
        "machine": platform.machine(),
        "packages": package_versions,
        "config": config,
        "api_keys": api_keys,
    }
84
+
85
+
86
def _collect_sessions() -> list[dict]:
    """Snapshot every session held by the trace manager as a plain dict.

    Returns an empty list when no trace manager has been set. Each entry
    carries the session's identifiers, start time (ISO 8601), completion flag,
    span/log counts, metadata, and the spans and logs themselves.
    """
    if not _trace_manager:
        return []

    return [
        {
            "session_id": session.session_id,
            "trace_id": session.trace_id,
            "eval_set_id": session.eval_set_id,
            "started_at": session.started_at.isoformat(),
            "is_complete": session.is_complete,
            "span_count": len(session.spans),
            "log_count": len(session.logs),
            "metadata": session.metadata,
            "spans": session.spans,
            "logs": session.logs,
        }
        for session in _trace_manager.sessions.values()
    ]
107
+
108
+
109
def _collect_temp_files(session_ids: set[str] | None = None) -> dict[str, str]:
    """Collect agentevals temp files from the system temp directory.

    Scans ``tempfile.gettempdir()`` for ``agentevals_*.jsonl`` and
    ``eval_set_*.json`` files and returns their text keyed by base name.

    Args:
        session_ids: When given, ``.jsonl`` files are kept only if the session
            ID embedded in their name (``agentevals_{session_id}.jsonl``) is in
            this set. ``eval_set_*.json`` files are never filtered. ``None``
            disables filtering.

    Returns:
        Mapping of file base name to file content. Files that cannot be opened
        or read are skipped (logged at debug level).
    """
    tmp_dir = tempfile.gettempdir()
    files: dict[str, str] = {}
    for pattern in ("agentevals_*.jsonl", "eval_set_*.json"):
        for path in glob.glob(os.path.join(tmp_dir, pattern)):
            basename = os.path.basename(path)
            # Filter JSONL files to only include current sessions.
            if session_ids is not None and basename.endswith(".jsonl"):
                sid = basename.removeprefix("agentevals_").removesuffix(".jsonl")
                if sid not in session_ids:
                    continue
            try:
                # Decode explicitly as UTF-8 instead of the locale default, and
                # substitute undecodable bytes: a UnicodeDecodeError is not an
                # OSError, so it would otherwise escape this handler and abort
                # the whole collection because of one damaged file.
                with open(path, encoding="utf-8", errors="replace") as f:
                    files[basename] = f.read()
            except OSError:
                logger.debug("Could not read temp file %s", path)
    return files
128
+
129
+
130
@debug_router.post("/bundle")
async def create_debug_bundle(diagnostics: FrontendDiagnostics):
    """Assemble a bug-report ZIP and return it as a download.

    All entries live under a ``bug-report-<UTC timestamp>/`` folder and are
    written in this order: ``metadata.json`` (environment snapshot plus the
    user's description and browser info), per-session ``spans.json`` /
    ``logs.json`` / ``session_meta.json``, ``backend_logs.txt``, matching temp
    files, and finally the frontend state, console logs and network errors.
    """
    timestamp = datetime.now(tz=UTC).strftime("%Y%m%d-%H%M%S")
    prefix = f"bug-report-{timestamp}"

    def as_json(payload) -> str:
        # Every JSON entry in the bundle is pretty-printed the same way.
        return json.dumps(payload, indent=2)

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        metadata = {
            **_collect_environment(),
            "user_description": diagnostics.user_description,
            "browser_info": diagnostics.browser_info,
        }
        zf.writestr(f"{prefix}/metadata.json", as_json(metadata))

        sessions = _collect_sessions()
        for s in sessions:
            session_root = f"{prefix}/sessions/{s['session_id']}"
            zf.writestr(f"{session_root}/spans.json", as_json(s["spans"]))
            zf.writestr(f"{session_root}/logs.json", as_json(s["logs"]))
            # Everything except the bulky spans/logs goes into the meta file.
            session_meta = {k: v for k, v in s.items() if k not in ("spans", "logs")}
            zf.writestr(f"{session_root}/session_meta.json", as_json(session_meta))

        zf.writestr(f"{prefix}/backend_logs.txt", log_buffer.get_text())

        # Only temp files belonging to the sessions captured above are bundled.
        current_session_ids = {s["session_id"] for s in sessions}
        temp_files = _collect_temp_files(session_ids=current_session_ids)
        for filename, content in temp_files.items():
            zf.writestr(f"{prefix}/temp_files/{filename}", content)

        frontend_entries = (
            ("frontend_state.json", diagnostics.app_state),
            ("console_logs.json", diagnostics.console_logs),
            ("network_errors.json", diagnostics.network_errors),
        )
        for entry_name, payload in frontend_entries:
            zf.writestr(f"{prefix}/{entry_name}", as_json(payload))

    # Rewind so the response streams the archive from its first byte.
    buf.seek(0)
    download_headers = {"Content-Disposition": f'attachment; filename="bug-report-{timestamp}.zip"'}
    return StreamingResponse(
        buf,
        media_type="application/zip",
        headers=download_headers,
    )
188
+
189
+
190
@debug_router.post("/load", response_model=StandardResponse[DebugLoadData])
async def load_debug_bundle(file: UploadFile = FastAPIFile(...)):
    """Load sessions from an uploaded bug-report ZIP into the trace manager.

    Archive members are grouped by the directory that follows a ``sessions``
    path component. A session needs a ``spans.json``; ``session_meta.json``
    and ``logs.json`` are optional. Each loaded session is registered with the
    trace manager, announced to the UI, has its invocations extracted and its
    spans saved to a temp file, and is then announced as complete.

    The whole archive is parsed before any session is registered, so a
    malformed bundle is rejected without leaving partially loaded state.

    Raises:
        HTTPException(400): live mode is disabled, the upload is not a valid
            ZIP, the ZIP has no session entries, or a session's JSON files are
            malformed / unreadable.
    """
    if not _trace_manager:
        raise HTTPException(
            status_code=400,
            detail="Live mode is not enabled. Start with: agentevals serve --dev",
        )

    content = await file.read()
    try:
        zf = zipfile.ZipFile(io.BytesIO(content))
    except zipfile.BadZipFile as exc:
        raise HTTPException(status_code=400, detail="Invalid ZIP file") from exc

    # (fallback session id, meta, spans, logs) for every loadable session.
    parsed: list[tuple[str, dict, list, list]] = []
    with zf:  # ensure the archive is closed even when validation fails
        session_dirs: dict[str, list[str]] = {}
        for name in zf.namelist():
            parts = name.split("/")
            if len(parts) >= 4 and parts[-3] == "sessions":
                sid = parts[-2]
                session_dirs.setdefault(sid, []).append(name)

        if not session_dirs:
            raise HTTPException(status_code=400, detail="No sessions found in ZIP")

        for sid, files in session_dirs.items():
            meta_file = next((f for f in files if f.endswith("session_meta.json")), None)
            spans_file = next((f for f in files if f.endswith("spans.json")), None)
            logs_file = next((f for f in files if f.endswith("logs.json")), None)

            # A session without spans carries nothing to load.
            if not spans_file:
                continue

            try:
                meta = json.loads(zf.read(meta_file)) if meta_file else {}
                spans = json.loads(zf.read(spans_file))
                logs = json.loads(zf.read(logs_file)) if logs_file else []
            except (json.JSONDecodeError, UnicodeDecodeError, zipfile.BadZipFile) as exc:
                # The upload is untrusted input: report bad content as a
                # client error instead of failing with an unhandled exception.
                raise HTTPException(
                    status_code=400,
                    detail=f"Malformed session data in ZIP for session {sid!r}",
                ) from exc

            if not isinstance(meta, dict):
                raise HTTPException(
                    status_code=400,
                    detail=f"Malformed session metadata in ZIP for session {sid!r}",
                )

            parsed.append((sid, meta, spans, logs))

    from ..streaming.session import TraceSession

    loaded = []
    for sid, meta, spans, logs in parsed:
        # Fall back to the directory name when the metadata lacks the IDs.
        session = TraceSession(
            session_id=meta.get("session_id", sid),
            trace_id=meta.get("trace_id", sid),
            eval_set_id=meta.get("eval_set_id"),
            spans=spans,
            logs=logs,
            is_complete=True,
            metadata=meta.get("metadata", {}),
        )

        _trace_manager.sessions[session.session_id] = session

        # The UI is first told the session started (not yet complete) and
        # receives the completion event, with invocations, further below.
        await _trace_manager.broadcast_to_ui(
            WSSessionStartedEvent(
                session=SessionInfo(
                    session_id=session.session_id,
                    trace_id=session.trace_id,
                    eval_set_id=session.eval_set_id,
                    span_count=len(session.spans),
                    is_complete=False,
                    started_at=session.started_at.isoformat(),
                    metadata=session.metadata,
                ),
            ).model_dump(by_alias=True)
        )

        invocations_data = await _trace_manager._extract_invocations(session)
        await _trace_manager._save_spans_to_temp_file(session)

        await _trace_manager.broadcast_to_ui(
            WSSessionCompleteEvent(
                session_id=session.session_id,
                invocations=invocations_data,
            ).model_dump(by_alias=True)
        )

        loaded.append(session.session_id)
        logger.info("Loaded session from bug report: %s", session.session_id)

    return StandardResponse(data=DebugLoadData(loaded_sessions=loaded, count=len(loaded)))
agentevals/api/models.py ADDED
@@ -0,0 +1,204 @@
1
+ """Pydantic response and event models for the agentevals API.
2
+
3
+ Provides a StandardResponse[T] envelope, typed REST response models,
4
+ SSE evaluation event models, and WebSocket/UI broadcast event models.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Generic, TypeVar
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+ from pydantic.alias_generators import to_camel
13
+
14
# Payload type carried by StandardResponse.
T = TypeVar("T")


# Base for every model in this module: fields are declared in snake_case and
# get camelCase aliases; populate_by_name lets callers construct instances
# with either spelling.
class CamelModel(BaseModel):
    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
    )


# Generic response envelope: the typed payload plus an optional error string.
class StandardResponse(CamelModel, Generic[T]):
    data: T
    error: str | None = None
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # REST response data models
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
# Status and version strings.
class HealthData(CamelModel):
    status: str
    version: str


# One boolean per provider.
class ApiKeyStatus(CamelModel):
    google: bool
    anthropic: bool
    openai: bool


# Configuration payload; currently wraps only the API-key status.
class ConfigData(CamelModel):
    api_keys: ApiKeyStatus


# Description of a single metric.
class MetricInfo(CamelModel):
    name: str
    category: str
    requires_eval_set: bool
    # Explicit aliases keep the acronyms upper-case; the camelCase generator
    # alone would produce "requiresLlm" / "requiresGcp".
    requires_llm: bool = Field(alias="requiresLLM")
    requires_gcp: bool = Field(alias="requiresGCP")
    requires_rubrics: bool
    description: str
    working: bool


# Outcome of validating an eval set; ``errors`` defaults to an empty list.
class EvalSetValidation(CamelModel):
    valid: bool
    eval_set_id: str | None = None
    num_cases: int | None = None
    errors: list[str] = Field(default_factory=list)


# Summary of a trace session.
class SessionInfo(CamelModel):
    session_id: str
    trace_id: str
    eval_set_id: str | None = None
    span_count: int
    is_complete: bool
    # Carried as a string (callers in this package pass an ISO 8601 value).
    started_at: str
    metadata: dict[str, Any] = Field(default_factory=dict)
    invocations: list[dict[str, Any]] | None = None


# An eval set together with its invocation count.
class CreateEvalSetData(CamelModel):
    eval_set: dict[str, Any]
    num_invocations: int


# Per-session evaluation outcome; only ``session_id`` is required.
class SessionEvalResult(CamelModel):
    session_id: str
    trace_id: str | None = None
    num_invocations: int | None = None
    metric_results: list[dict[str, Any]] | None = None
    error: str | None = None


# Results of evaluating several sessions, with the golden session's ID.
class EvaluateSessionsData(CamelModel):
    golden_session_id: str
    eval_set_id: str
    results: list[SessionEvalResult]


# URLs for an eval set and its traces, plus the trace count.
class PrepareEvaluationData(CamelModel):
    eval_set_url: str
    trace_urls: list[str]
    num_traces: int


# Trace content for one session, with its span count.
class GetTraceData(CamelModel):
    session_id: str
    trace_content: str
    num_spans: int


# Result of loading a debug bundle: the loaded session IDs and their count.
class DebugLoadData(CamelModel):
    loaded_sessions: list[str]
    count: int
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # SSE evaluation event models
116
+ # ---------------------------------------------------------------------------
117
+
118
+
119
# Progress event carrying a message string.
class SSEProgressEvent(CamelModel):
    message: str


# Partial result for one trace.
class SSETraceProgress(CamelModel):
    trace_id: str
    partial_result: dict[str, Any]


# Event wrapper around an SSETraceProgress payload.
class SSETraceProgressEvent(CamelModel):
    trace_progress: SSETraceProgress


# Performance metrics for one trace; trace metadata is optional.
class SSEPerformanceMetricsEvent(CamelModel):
    trace_id: str
    performance_metrics: dict[str, Any]
    trace_metadata: dict[str, Any] | None = None


# Final event: ``done`` defaults to True and ``result`` is required.
class SSEDoneEvent(CamelModel):
    done: bool = True
    result: dict[str, Any]


# Error event carrying an error string.
class SSEErrorEvent(CamelModel):
    error: str
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # WebSocket / UI broadcast event models
149
+ # ---------------------------------------------------------------------------
150
+
151
+
152
# Each event model below carries a ``type`` string that defaults to the
# event's name, followed by that event's payload fields.


# "session_started": carries the session summary.
class WSSessionStartedEvent(CamelModel):
    type: str = "session_started"
    session: SessionInfo


# "session_complete": carries the session ID and its invocations.
class WSSessionCompleteEvent(CamelModel):
    type: str = "session_complete"
    session_id: str
    invocations: list[dict[str, Any]]


# "span_received": carries one span for a session.
class WSSpanReceivedEvent(CamelModel):
    type: str = "span_received"
    session_id: str
    span: dict[str, Any]


# "user_input": text with a timestamp for an invocation.
class WSUserInputEvent(CamelModel):
    type: str = "user_input"
    session_id: str
    invocation_id: str
    text: str
    timestamp: float


# "agent_response": same fields as the user-input event.
class WSAgentResponseEvent(CamelModel):
    type: str = "agent_response"
    session_id: str
    invocation_id: str
    text: str
    timestamp: float


# "tool_call": a tool call dict with a timestamp for an invocation.
class WSToolCallEvent(CamelModel):
    type: str = "tool_call"
    session_id: str
    invocation_id: str
    tool_call: dict[str, Any]
    timestamp: float


# "token_update": token counts; invocation ID and model are optional.
class WSTokenUpdateEvent(CamelModel):
    type: str = "token_update"
    session_id: str
    invocation_id: str | None = None
    input_tokens: int
    output_tokens: int
    model: str | None = None


# "error": carries a message string.
class WSErrorEvent(CamelModel):
    type: str = "error"
    message: str