evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,264 @@
1
+ """WorkflowTracer — multi-agent workflow tracing with handoffs, decisions, and cost tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import json
7
+ import uuid
8
+ from collections.abc import Callable
9
+ from datetime import datetime, timezone
10
+ from typing import Any, TypeVar
11
+
12
+ from evalgate_sdk.types import (
13
+ AgentHandoff,
14
+ AgentSpanContext,
15
+ CostRecord,
16
+ HandoffType,
17
+ RecordCostParams,
18
+ RecordDecisionParams,
19
+ WorkflowContext,
20
+ WorkflowDefinition,
21
+ WorkflowStatus,
22
+ )
23
+
24
+ T = TypeVar("T")
25
+
26
+
27
+ class WorkflowTracer:
28
+ """Traces multi-agent workflows with span tracking, handoffs, decisions, and costs.
29
+
30
+ Usage::
31
+
32
+ tracer = WorkflowTracer(client)
33
+ ctx = await tracer.start_workflow("my-workflow")
34
+ span = await tracer.start_agent_span("agent-1", {"query": "hello"})
35
+ await tracer.end_agent_span(span, {"response": "hi"})
36
+ await tracer.end_workflow()
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ client: Any | None = None,
42
+ *,
43
+ name: str | None = None,
44
+ session_id: str | None = None,
45
+ offline: bool = False,
46
+ ) -> None:
47
+ self._client = client
48
+ self._name = name
49
+ self._session_id = session_id or str(uuid.uuid4())
50
+ self._offline = offline or client is None
51
+ self._workflow: WorkflowContext | None = None
52
+ self._handoffs: list[AgentHandoff] = []
53
+ self._decisions: list[RecordDecisionParams] = []
54
+ self._costs: list[CostRecord] = []
55
+ self._spans: list[AgentSpanContext] = []
56
+
57
+ # ── Workflow lifecycle ────────────────────────────────────────
58
+
59
+ async def start_workflow(
60
+ self,
61
+ name: str | None = None,
62
+ definition: WorkflowDefinition | None = None,
63
+ metadata: dict[str, Any] | None = None,
64
+ ) -> WorkflowContext:
65
+ wf_name = name or self._name or "unnamed-workflow"
66
+ trace_id: str | None = None
67
+
68
+ if not self._offline:
69
+ from evalgate_sdk.types import CreateTraceParams
70
+
71
+ trace = await self._client.traces.create(
72
+ CreateTraceParams(
73
+ name=wf_name,
74
+ metadata={
75
+ **(metadata or {}),
76
+ "workflow": True,
77
+ "session_id": self._session_id,
78
+ },
79
+ )
80
+ )
81
+ trace_id = trace.id
82
+
83
+ self._workflow = WorkflowContext(
84
+ workflow_id=str(uuid.uuid4()),
85
+ trace_id=trace_id,
86
+ name=wf_name,
87
+ status=WorkflowStatus.RUNNING,
88
+ definition=definition,
89
+ metadata=metadata,
90
+ started_at=datetime.now(timezone.utc),
91
+ )
92
+ return self._workflow
93
+
94
+ async def end_workflow(
95
+ self,
96
+ output: dict[str, Any] | None = None,
97
+ status: WorkflowStatus = WorkflowStatus.COMPLETED,
98
+ ) -> None:
99
+ if self._workflow is None:
100
+ return
101
+ self._workflow.status = status
102
+ if not self._offline and self._workflow.trace_id is not None:
103
+ from evalgate_sdk.types import UpdateTraceParams
104
+
105
+ await self._client.traces.update(
106
+ self._workflow.trace_id,
107
+ UpdateTraceParams(
108
+ status=status.value,
109
+ metadata={
110
+ "output": output,
111
+ "handoffs": len(self._handoffs),
112
+ "decisions": len(self._decisions),
113
+ "total_cost": self.get_total_cost(),
114
+ },
115
+ ),
116
+ )
117
+
118
+ # ── Agent spans ──────────────────────────────────────────────
119
+
120
+ async def start_agent_span(
121
+ self,
122
+ agent_name: str,
123
+ input: dict[str, Any] | None = None,
124
+ parent_span_id: str | None = None,
125
+ ) -> AgentSpanContext:
126
+ span_id = str(uuid.uuid4())
127
+ trace_id = self._workflow.trace_id if self._workflow else None
128
+
129
+ if not self._offline and trace_id is not None:
130
+ from evalgate_sdk.types import CreateSpanParams
131
+
132
+ await self._client.traces.create_span(
133
+ trace_id,
134
+ CreateSpanParams(
135
+ name=agent_name,
136
+ span_id=span_id,
137
+ type="agent",
138
+ input=json.dumps(input) if input else None,
139
+ metadata={"parent_span_id": parent_span_id},
140
+ ),
141
+ )
142
+
143
+ ctx = AgentSpanContext(
144
+ span_id=span_id,
145
+ agent_name=agent_name,
146
+ trace_id=trace_id,
147
+ parent_span_id=parent_span_id,
148
+ started_at=datetime.now(timezone.utc),
149
+ )
150
+ self._spans.append(ctx)
151
+ return ctx
152
+
153
+ async def end_agent_span(
154
+ self,
155
+ span: AgentSpanContext,
156
+ output: dict[str, Any] | None = None,
157
+ error: str | None = None,
158
+ ) -> None:
159
+ span.ended_at = datetime.now(timezone.utc)
160
+ if not self._offline and span.trace_id is not None:
161
+ from evalgate_sdk.types import UpdateTraceParams
162
+
163
+ metadata: dict[str, Any] = {}
164
+ if output:
165
+ metadata["span_output"] = output
166
+ if error:
167
+ metadata["span_error"] = error
168
+ metadata["span_id"] = span.span_id
169
+ metadata["ended_at"] = span.ended_at.isoformat()
170
+ with contextlib.suppress(Exception):
171
+ await self._client.traces.update(
172
+ span.trace_id,
173
+ UpdateTraceParams(metadata=metadata),
174
+ )
175
+
176
+ # ── Handoffs ─────────────────────────────────────────────────
177
+
178
+ async def record_handoff(
179
+ self,
180
+ from_agent: str | None,
181
+ to_agent: str,
182
+ context: dict[str, Any] | None = None,
183
+ handoff_type: HandoffType = HandoffType.DELEGATION,
184
+ ) -> None:
185
+ handoff = AgentHandoff(
186
+ from_agent=from_agent,
187
+ to_agent=to_agent,
188
+ context=context,
189
+ handoff_type=handoff_type,
190
+ timestamp=datetime.now(timezone.utc),
191
+ )
192
+ self._handoffs.append(handoff)
193
+
194
+ # ── Decision auditing ────────────────────────────────────────
195
+
196
+ async def record_decision(self, params: RecordDecisionParams) -> None:
197
+ self._decisions.append(params)
198
+
199
+ # ── Cost tracking ────────────────────────────────────────────
200
+
201
+ async def record_cost(self, params: RecordCostParams) -> CostRecord:
202
+ record = CostRecord(
203
+ agent_name=params.agent_name,
204
+ category=params.category,
205
+ amount=params.amount,
206
+ currency=params.currency,
207
+ model=params.model,
208
+ tokens=params.tokens,
209
+ metadata=params.metadata,
210
+ timestamp=datetime.now(timezone.utc),
211
+ )
212
+ self._costs.append(record)
213
+ return record
214
+
215
+ def get_total_cost(self) -> float:
216
+ return sum(c.amount for c in self._costs)
217
+
218
+ def get_cost_breakdown(self) -> dict[str, float]:
219
+ breakdown: dict[str, float] = {}
220
+ for c in self._costs:
221
+ key = c.category.value
222
+ breakdown[key] = breakdown.get(key, 0.0) + c.amount
223
+ return breakdown
224
+
225
+ # ── Accessors ────────────────────────────────────────────────
226
+
227
+ def get_current_workflow(self) -> WorkflowContext | None:
228
+ return self._workflow
229
+
230
+ def is_workflow_active(self) -> bool:
231
+ return self._workflow is not None and self._workflow.status == WorkflowStatus.RUNNING
232
+
233
+ def get_handoffs(self) -> list[AgentHandoff]:
234
+ return list(self._handoffs)
235
+
236
+ def get_decisions(self) -> list[RecordDecisionParams]:
237
+ return list(self._decisions)
238
+
239
+ def get_costs(self) -> list[CostRecord]:
240
+ return list(self._costs)
241
+
242
+
243
def create_workflow_tracer(client: Any | None = None, **kwargs: Any) -> WorkflowTracer:
    """Build and return a ``WorkflowTracer``.

    ``client`` is passed positionally and every extra keyword argument is
    forwarded unchanged to the ``WorkflowTracer`` constructor.
    """
    tracer = WorkflowTracer(client, **kwargs)
    return tracer
246
+
247
+
248
async def trace_workflow_step(
    tracer: WorkflowTracer,
    agent_name: str,
    fn: Callable[[], Any],
    input: dict[str, Any] | None = None,
) -> Any:
    """Convenience wrapper: open a span, run *fn*, close the span.

    Args:
        tracer: Tracer used to open and close the span.
        agent_name: Name given to the span.
        fn: Zero-argument callable to execute.  If what it returns is
            awaitable (has ``__await__``), it is awaited and the awaited value
            becomes the result.
        input: Optional input payload forwarded to ``start_agent_span``.

    Returns:
        Whatever *fn* produced.  The span is closed with
        ``output={"result": str(result)}``.

    Raises:
        Exception: Any exception raised by *fn* is re-raised unchanged after
            the span has been closed with ``error=str(exc)``.
    """
    span = await tracer.start_agent_span(agent_name, input)
    try:
        result = fn()
        if hasattr(result, "__await__"):
            result = await result
    except Exception as exc:
        # Closing the span is best-effort here: a tracer failure must not
        # replace the step's own exception.
        with contextlib.suppress(Exception):
            await tracer.end_agent_span(span, error=str(exc))
        raise
    # Kept outside the try so a tracer failure on the success path is not
    # mistaken for a step failure and the span is not closed a second time.
    await tracer.end_agent_span(span, output={"result": str(result)})
    return result