evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""WorkflowTracer — multi-agent workflow tracing with handoffs, decisions, and cost tracking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
import json
|
|
7
|
+
import uuid
|
|
8
|
+
from collections.abc import Callable
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from typing import Any, TypeVar
|
|
11
|
+
|
|
12
|
+
from evalgate_sdk.types import (
|
|
13
|
+
AgentHandoff,
|
|
14
|
+
AgentSpanContext,
|
|
15
|
+
CostRecord,
|
|
16
|
+
HandoffType,
|
|
17
|
+
RecordCostParams,
|
|
18
|
+
RecordDecisionParams,
|
|
19
|
+
WorkflowContext,
|
|
20
|
+
WorkflowDefinition,
|
|
21
|
+
WorkflowStatus,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Generic type variable; not referenced anywhere in the visible part of this module.
T = TypeVar("T")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class WorkflowTracer:
    """Traces multi-agent workflows with span tracking, handoffs, decisions, and costs.

    Handoffs, decisions, cost records, and span contexts are accumulated in
    memory on the tracer. When a client is supplied and ``offline`` is false,
    workflow start/end and agent spans are additionally sent through
    ``client.traces``.

    Usage::

        tracer = WorkflowTracer(client)
        ctx = await tracer.start_workflow("my-workflow")
        span = await tracer.start_agent_span("agent-1", {"query": "hello"})
        await tracer.end_agent_span(span, {"response": "hi"})
        await tracer.end_workflow()
    """

    def __init__(
        self,
        client: Any | None = None,
        *,
        name: str | None = None,
        session_id: str | None = None,
        offline: bool = False,
    ) -> None:
        """Create a tracer.

        Args:
            client: Object whose ``traces`` attribute provides the awaitable
                ``create``, ``update`` and ``create_span`` calls used below.
                Passing ``None`` forces offline mode.
            name: Default workflow name used by :meth:`start_workflow`.
            session_id: Session identifier attached to created traces. A random
                UUID4 string is generated when it is omitted or empty.
            offline: When true, nothing is sent to ``client``.
        """
        self._client = client
        self._name = name
        self._session_id = session_id or str(uuid.uuid4())
        # Without a client there is nothing to send to, so behave as offline.
        self._offline = offline or client is None
        self._workflow: WorkflowContext | None = None
        self._handoffs: list[AgentHandoff] = []
        self._decisions: list[RecordDecisionParams] = []
        self._costs: list[CostRecord] = []
        self._spans: list[AgentSpanContext] = []

    # ── Workflow lifecycle ────────────────────────────────────────

    async def start_workflow(
        self,
        name: str | None = None,
        definition: WorkflowDefinition | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> WorkflowContext:
        """Start a workflow and make it the tracer's current workflow.

        Args:
            name: Workflow name. Falls back to the tracer's default name and
                then to ``"unnamed-workflow"``.
            definition: Optional workflow definition stored on the context.
            metadata: Optional metadata stored on the context and, when online,
                sent with the created trace.

        Returns:
            The new :class:`WorkflowContext` in ``RUNNING`` status. Its
            ``trace_id`` is ``None`` when the tracer is offline.
        """
        wf_name = name or self._name or "unnamed-workflow"
        trace_id: str | None = None

        if not self._offline:
            from evalgate_sdk.types import CreateTraceParams

            trace = await self._client.traces.create(
                CreateTraceParams(
                    name=wf_name,
                    metadata={
                        # Caller metadata goes first so the two tracer-owned
                        # keys below always win on a key collision.
                        **(metadata or {}),
                        "workflow": True,
                        "session_id": self._session_id,
                    },
                )
            )
            trace_id = trace.id

        self._workflow = WorkflowContext(
            workflow_id=str(uuid.uuid4()),
            trace_id=trace_id,
            name=wf_name,
            status=WorkflowStatus.RUNNING,
            definition=definition,
            metadata=metadata,
            started_at=datetime.now(timezone.utc),
        )
        return self._workflow

    async def end_workflow(
        self,
        output: dict[str, Any] | None = None,
        status: WorkflowStatus = WorkflowStatus.COMPLETED,
    ) -> None:
        """Mark the current workflow as finished.

        Does nothing when no workflow has been started. Otherwise the
        workflow's status is set to ``status`` and, when online and a trace
        exists, the trace is updated with the status plus a summary (output,
        handoff count, decision count, total cost).

        Args:
            output: Optional final output included in the trace metadata.
            status: Final status to record; defaults to ``COMPLETED``.
        """
        if self._workflow is None:
            return
        self._workflow.status = status
        if not self._offline and self._workflow.trace_id is not None:
            from evalgate_sdk.types import UpdateTraceParams

            await self._client.traces.update(
                self._workflow.trace_id,
                UpdateTraceParams(
                    status=status.value,
                    metadata={
                        "output": output,
                        "handoffs": len(self._handoffs),
                        "decisions": len(self._decisions),
                        "total_cost": self.get_total_cost(),
                    },
                ),
            )

    # ── Agent spans ──────────────────────────────────────────────

    async def start_agent_span(
        self,
        agent_name: str,
        input: dict[str, Any] | None = None,
        parent_span_id: str | None = None,
    ) -> AgentSpanContext:
        """Open a span for one agent's work.

        A span is created through the client only when the tracer is online
        and the current workflow has a trace id; the returned context is
        recorded on the tracer in every case.

        Args:
            agent_name: Name of the agent; used as the span name.
            input: Optional input payload, sent JSON-encoded. A missing or
                empty payload is sent as ``None``.
            parent_span_id: Optional id of the parent span.

        Returns:
            The :class:`AgentSpanContext` describing the new span.
        """
        span_id = str(uuid.uuid4())
        trace_id = self._workflow.trace_id if self._workflow else None

        if not self._offline and trace_id is not None:
            from evalgate_sdk.types import CreateSpanParams

            await self._client.traces.create_span(
                trace_id,
                CreateSpanParams(
                    name=agent_name,
                    span_id=span_id,
                    type="agent",
                    input=json.dumps(input) if input else None,
                    metadata={"parent_span_id": parent_span_id},
                ),
            )

        ctx = AgentSpanContext(
            span_id=span_id,
            agent_name=agent_name,
            trace_id=trace_id,
            parent_span_id=parent_span_id,
            started_at=datetime.now(timezone.utc),
        )
        self._spans.append(ctx)
        return ctx

    async def end_agent_span(
        self,
        span: AgentSpanContext,
        output: dict[str, Any] | None = None,
        error: str | None = None,
    ) -> None:
        """Close a span previously returned by :meth:`start_agent_span`.

        Sets ``span.ended_at`` to the current UTC time. When online and the
        span belongs to a trace, the trace metadata is updated with the span
        id, end time, and any non-empty output or error.

        Args:
            span: The span context to close.
            output: Optional output payload; recorded only when non-empty.
            error: Optional error message; recorded only when non-empty.
        """
        span.ended_at = datetime.now(timezone.utc)
        if not self._offline and span.trace_id is not None:
            from evalgate_sdk.types import UpdateTraceParams

            metadata: dict[str, Any] = {}
            if output:
                metadata["span_output"] = output
            if error:
                metadata["span_error"] = error
            metadata["span_id"] = span.span_id
            metadata["ended_at"] = span.ended_at.isoformat()
            # Best-effort: a failed update must not break the traced code path.
            with contextlib.suppress(Exception):
                await self._client.traces.update(
                    span.trace_id,
                    UpdateTraceParams(metadata=metadata),
                )

    # ── Handoffs ─────────────────────────────────────────────────

    async def record_handoff(
        self,
        from_agent: str | None,
        to_agent: str,
        context: dict[str, Any] | None = None,
        handoff_type: HandoffType = HandoffType.DELEGATION,
    ) -> None:
        """Record a handoff between agents on the tracer.

        The handoff is timestamped with the current UTC time and stored in
        memory; this method does not call the client.

        Args:
            from_agent: Name of the agent handing off, or ``None``.
            to_agent: Name of the agent receiving control.
            context: Optional context passed along with the handoff.
            handoff_type: Kind of handoff; defaults to ``DELEGATION``.
        """
        handoff = AgentHandoff(
            from_agent=from_agent,
            to_agent=to_agent,
            context=context,
            handoff_type=handoff_type,
            timestamp=datetime.now(timezone.utc),
        )
        self._handoffs.append(handoff)

    # ── Decision auditing ────────────────────────────────────────

    async def record_decision(self, params: RecordDecisionParams) -> None:
        """Store a decision record on the tracer (in memory only)."""
        self._decisions.append(params)

    # ── Cost tracking ────────────────────────────────────────────

    async def record_cost(self, params: RecordCostParams) -> CostRecord:
        """Store a cost entry on the tracer and return it.

        Args:
            params: Cost details; its fields are copied onto the record.

        Returns:
            The new :class:`CostRecord`, timestamped with the current UTC time.
        """
        record = CostRecord(
            agent_name=params.agent_name,
            category=params.category,
            amount=params.amount,
            currency=params.currency,
            model=params.model,
            tokens=params.tokens,
            metadata=params.metadata,
            timestamp=datetime.now(timezone.utc),
        )
        self._costs.append(record)
        return record

    def get_total_cost(self) -> float:
        """Return the sum of all recorded cost amounts (``0.0`` when empty)."""
        # Float start value keeps the result a float even with no records,
        # matching the annotation and the accumulator in get_cost_breakdown.
        return sum((c.amount for c in self._costs), 0.0)

    def get_cost_breakdown(self) -> dict[str, float]:
        """Return recorded cost amounts summed per ``category.value``."""
        breakdown: dict[str, float] = {}
        for c in self._costs:
            key = c.category.value
            breakdown[key] = breakdown.get(key, 0.0) + c.amount
        return breakdown

    # ── Accessors ────────────────────────────────────────────────

    def get_current_workflow(self) -> WorkflowContext | None:
        """Return the most recently started workflow, or ``None``."""
        return self._workflow

    def is_workflow_active(self) -> bool:
        """Return whether a workflow exists and its status is ``RUNNING``."""
        return self._workflow is not None and self._workflow.status == WorkflowStatus.RUNNING

    def get_handoffs(self) -> list[AgentHandoff]:
        """Return a shallow copy of the recorded handoffs."""
        return list(self._handoffs)

    def get_decisions(self) -> list[RecordDecisionParams]:
        """Return a shallow copy of the recorded decisions."""
        return list(self._decisions)

    def get_costs(self) -> list[CostRecord]:
        """Return a shallow copy of the recorded cost entries."""
        return list(self._costs)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def create_workflow_tracer(client: Any | None = None, **kwargs: Any) -> WorkflowTracer:
    """Build a :class:`WorkflowTracer`, forwarding ``client`` and any keyword options."""
    tracer = WorkflowTracer(client, **kwargs)
    return tracer
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
async def trace_workflow_step(
    tracer: WorkflowTracer,
    agent_name: str,
    fn: Callable[[], Any],
    input: dict[str, Any] | None = None,
) -> Any:
    """Convenience wrapper: open a span, run *fn*, close the span.

    Args:
        tracer: Tracer used to open and close the span.
        agent_name: Name passed to ``tracer.start_agent_span``.
        fn: Zero-argument callable to run. If its return value is awaitable
            (has ``__await__``), it is awaited.
        input: Optional input payload recorded when the span is opened.

    Returns:
        Whatever *fn* returned (after awaiting, if applicable).

    Raises:
        Exception: Any exception raised by *fn* is re-raised after the span
            has been closed with ``error=str(exc)``.
    """
    span = await tracer.start_agent_span(agent_name, input)
    try:
        result = fn()
        if hasattr(result, "__await__"):
            result = await result
    except Exception as exc:
        await tracer.end_agent_span(span, error=str(exc))
        raise
    # Close the span outside the guarded region: a failure while recording the
    # successful result must not be reported as a failure of *fn*, and the
    # span must not be ended a second time by the error handler.
    await tracer.end_agent_span(span, output={"result": str(result)})
    return result
|