traceops 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. trace_ops/__init__.py +153 -0
  2. trace_ops/_types.py +356 -0
  3. trace_ops/assertions.py +129 -0
  4. trace_ops/cassette.py +154 -0
  5. trace_ops/cli.py +810 -0
  6. trace_ops/diff.py +295 -0
  7. trace_ops/export/__init__.py +20 -0
  8. trace_ops/export/finetune.py +143 -0
  9. trace_ops/interceptors/__init__.py +10 -0
  10. trace_ops/interceptors/crewai.py +200 -0
  11. trace_ops/interceptors/langchain.py +375 -0
  12. trace_ops/interceptors/langgraph.py +319 -0
  13. trace_ops/mcp/__init__.py +31 -0
  14. trace_ops/mcp/diff.py +133 -0
  15. trace_ops/mcp/events.py +42 -0
  16. trace_ops/mcp/interceptor.py +146 -0
  17. trace_ops/normalize.py +238 -0
  18. trace_ops/pytest_plugin.py +314 -0
  19. trace_ops/rag/__init__.py +71 -0
  20. trace_ops/rag/assertions.py +271 -0
  21. trace_ops/rag/context_analysis.py +227 -0
  22. trace_ops/rag/diff.py +231 -0
  23. trace_ops/rag/export.py +269 -0
  24. trace_ops/rag/interceptors/__init__.py +27 -0
  25. trace_ops/rag/interceptors/chromadb_interceptor.py +66 -0
  26. trace_ops/rag/interceptors/embedding_interceptor.py +54 -0
  27. trace_ops/rag/interceptors/langchain_retriever.py +63 -0
  28. trace_ops/rag/interceptors/llamaindex_retriever.py +58 -0
  29. trace_ops/rag/interceptors/pinecone_interceptor.py +54 -0
  30. trace_ops/rag/interceptors/qdrant_interceptor.py +47 -0
  31. trace_ops/rag/recorder.py +93 -0
  32. trace_ops/rag/scorers.py +210 -0
  33. trace_ops/rag/snapshot.py +335 -0
  34. trace_ops/recorder.py +972 -0
  35. trace_ops/replayer.py +626 -0
  36. trace_ops/reporters/__init__.py +6 -0
  37. trace_ops/reporters/cost_dashboard.py +243 -0
  38. trace_ops/reporters/html.py +323 -0
  39. trace_ops/reporters/terminal.py +304 -0
  40. trace_ops/semantic/__init__.py +31 -0
  41. trace_ops/semantic/assertions.py +57 -0
  42. trace_ops/semantic/similarity.py +163 -0
  43. trace_ops/streaming.py +570 -0
  44. traceops-0.5.0.dist-info/METADATA +436 -0
  45. traceops-0.5.0.dist-info/RECORD +48 -0
  46. traceops-0.5.0.dist-info/WHEEL +4 -0
  47. traceops-0.5.0.dist-info/entry_points.txt +5 -0
  48. traceops-0.5.0.dist-info/licenses/LICENSE +21 -0
trace_ops/__init__.py ADDED
@@ -0,0 +1,153 @@
1
+ """traceops — record and replay LLM agent traces for deterministic regression testing.
2
+
3
+ Framework-agnostic. Works with OpenAI, Anthropic, LiteLLM, LangChain, CrewAI,
4
+ or any custom agent. No LLM calls during replay — tests are fast, free, deterministic.
5
+
6
+ Quick start:
7
+ from trace_ops import Recorder, Replayer
8
+
9
+ # Record an agent run
10
+ with Recorder(save_to="cassettes/test_math.yaml") as rec:
11
+ result = agent.run("What is 2+2?")
12
+
13
+ # Replay deterministically (zero API calls)
14
+ with Replayer("cassettes/test_math.yaml"):
15
+ result = agent.run("What is 2+2?")
16
+ assert result == "4"
17
+
18
+ # Also works async:
19
+ async with Recorder(save_to="cassettes/test.yaml") as rec:
20
+ result = await agent.arun("What is 2+2?")
21
+ """
22
+
23
+ from trace_ops._types import (
24
+ EventType,
25
+ Trace,
26
+ TraceEvent,
27
+ TraceMetadata,
28
+ )
29
+ from trace_ops.assertions import (
30
+ AgentLoopError,
31
+ BudgetExceededError,
32
+ assert_cost_under,
33
+ assert_max_llm_calls,
34
+ assert_no_loops,
35
+ assert_tokens_under,
36
+ )
37
+ from trace_ops.cassette import (
38
+ CassetteMismatchError,
39
+ CassetteNotFoundError,
40
+ load_cassette,
41
+ save_cassette,
42
+ )
43
+ from trace_ops.diff import TraceDiff, assert_trace_unchanged, diff_traces
44
+ from trace_ops.normalize import (
45
+ NormalizedResponse,
46
+ NormalizedToolCall,
47
+ normalize_for_comparison,
48
+ normalize_response,
49
+ )
50
+ from trace_ops.recorder import Recorder
51
+ from trace_ops.replayer import Replayer
52
+ from trace_ops.reporters.cost_dashboard import CostDashboard, CostSummary
53
+
54
+ # RAG add-on (graceful degradation if not installed)
55
+ try:
56
+ from trace_ops.rag.diff import RAGDiffResult, diff_rag
57
+ from trace_ops.rag.assertions import (
58
+ RAGAssertionError,
59
+ assert_chunk_count,
60
+ assert_retrieval_latency,
61
+ assert_min_relevance_score,
62
+ assert_no_retrieval_drift,
63
+ assert_rag_scores,
64
+ )
65
+ from trace_ops.rag.scorers import RagasScorer, DeepEvalScorer
66
+ from trace_ops.rag.snapshot import RetrieverSnapshot
67
+ from trace_ops.rag.context_analysis import analyze_context_usage
68
+ _RAG_AVAILABLE = True
69
+ except ImportError:
70
+ _RAG_AVAILABLE = False
71
+
72
+ # MCP add-on
73
+ try:
74
+ from trace_ops.mcp.diff import MCPDiffResult, diff_mcp
75
+ _MCP_AVAILABLE = True
76
+ except ImportError:
77
+ _MCP_AVAILABLE = False
78
+
79
+ # Semantic add-on
80
+ try:
81
+ from trace_ops.semantic.similarity import SemanticDiffResult, semantic_similarity
82
+ from trace_ops.semantic.assertions import SemanticRegressionError, assert_semantic_similarity
83
+ _SEMANTIC_AVAILABLE = True
84
+ except ImportError:
85
+ _SEMANTIC_AVAILABLE = False
86
+
87
+ # Export add-on
88
+ try:
89
+ from trace_ops.export.finetune import to_openai_finetune, to_anthropic_finetune
90
+ _EXPORT_AVAILABLE = True
91
+ except ImportError:
92
+ _EXPORT_AVAILABLE = False
93
+
94
+ __version__ = "0.5.0"
95
+
96
+ __all__ = [
97
+ # Core
98
+ "Recorder",
99
+ "Replayer",
100
+ # Types
101
+ "Trace",
102
+ "TraceEvent",
103
+ "TraceMetadata",
104
+ "EventType",
105
+ # Cassette
106
+ "save_cassette",
107
+ "load_cassette",
108
+ "CassetteNotFoundError",
109
+ "CassetteMismatchError",
110
+ # Diff
111
+ "TraceDiff",
112
+ "diff_traces",
113
+ "assert_trace_unchanged",
114
+ # Normalization
115
+ "NormalizedToolCall",
116
+ "NormalizedResponse",
117
+ "normalize_response",
118
+ "normalize_for_comparison",
119
+ # Assertions
120
+ "assert_cost_under",
121
+ "assert_tokens_under",
122
+ "assert_max_llm_calls",
123
+ "assert_no_loops",
124
+ "BudgetExceededError",
125
+ "AgentLoopError",
126
+ # Reporters
127
+ "CostDashboard",
128
+ "CostSummary",
129
+ # RAG (available when trace_ops[rag] installed)
130
+ "diff_rag",
131
+ "RAGDiffResult",
132
+ "RAGAssertionError",
133
+ "assert_chunk_count",
134
+ "assert_retrieval_latency",
135
+ "assert_min_relevance_score",
136
+ "assert_no_retrieval_drift",
137
+ "assert_rag_scores",
138
+ "RagasScorer",
139
+ "DeepEvalScorer",
140
+ "RetrieverSnapshot",
141
+ "analyze_context_usage",
142
+ # MCP
143
+ "diff_mcp",
144
+ "MCPDiffResult",
145
+ # Semantic
146
+ "semantic_similarity",
147
+ "SemanticDiffResult",
148
+ "SemanticRegressionError",
149
+ "assert_semantic_similarity",
150
+ # Export / fine-tune
151
+ "to_openai_finetune",
152
+ "to_anthropic_finetune",
153
+ ]
trace_ops/_types.py ADDED
@@ -0,0 +1,356 @@
1
+ """Core data model for agent execution traces.
2
+
3
+ A Trace is a complete recording of an agent run. It contains a sequence of
4
+ Events — each event is either an LLM call, a tool invocation, or an agent
5
+ decision. The trace captures everything needed to deterministically replay
6
+ the agent's execution without making real API calls.
7
+
8
+ Key design decision: we record at the SDK level (intercepting openai.chat.completions.create,
9
+ anthropic.messages.create, etc.) rather than at the HTTP level (like VCR.py). This gives us
10
+ semantic understanding of what happened — we know "this was a tool call" vs "this was a
11
+ completion" — which HTTP-level recording can't distinguish.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import json
18
+ import threading
19
+ import time
20
+ from dataclasses import dataclass, field
21
+ from enum import Enum
22
+ from typing import Any
23
+ from uuid import uuid4
24
+
25
+
26
class EventType(str, Enum):
    """Types of events in an agent trace.

    Inherits from ``str`` so that enum values compare equal to their
    string form and serialize directly into cassette dicts (see
    ``TraceEvent.to_dict``, which stores ``event_type.value``).
    """

    # Core agent events
    LLM_REQUEST = "llm_request"
    LLM_RESPONSE = "llm_response"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
    AGENT_DECISION = "agent_decision"
    ERROR = "error"

    # RAG events
    RETRIEVAL = "retrieval"
    EMBEDDING_CALL = "embedding_call"
    RAG_SCORES = "rag_scores"

    # MCP events
    MCP_SERVER_CONNECT = "mcp_server_connect"
    MCP_TOOL_CALL = "mcp_tool_call"
    MCP_TOOL_RESULT = "mcp_tool_result"
+ MCP_TOOL_RESULT = "mcp_tool_result"
45
+
46
+
47
@dataclass
class TraceEvent:
    """A single event in an agent execution trace.

    Events are the atoms of a trace. Each one records a single
    interaction — an LLM exchange, a tool invocation, a retrieval, or an
    agent-level decision. Only the field group matching ``event_type``
    is normally populated; unset fields stay ``None`` and are omitted
    from the serialized form.
    """

    event_type: EventType
    timestamp: float = field(default_factory=time.time)
    event_id: str = field(default_factory=lambda: uuid4().hex[:12])

    # LLM-specific fields
    provider: str | None = None  # "openai", "anthropic", "litellm"
    model: str | None = None
    messages: list[dict[str, Any]] | None = None  # input messages
    response: dict[str, Any] | None = None  # full response object
    temperature: float | None = None
    max_tokens: int | None = None
    tools: list[dict[str, Any]] | None = None  # tool definitions sent to LLM

    # Tool-specific fields
    tool_name: str | None = None
    tool_input: dict[str, Any] | None = None
    tool_output: Any = None

    # Agent decision fields
    decision: str | None = None  # e.g., "delegate_to_agent_b", "select_tool_search"
    reasoning: str | None = None

    # Error fields
    error_type: str | None = None
    error_message: str | None = None

    # Cost tracking
    input_tokens: int | None = None
    output_tokens: int | None = None
    cost_usd: float | None = None

    # Timing
    duration_ms: float | None = None

    # Metadata
    metadata: dict[str, Any] = field(default_factory=dict)

    # RAG-specific fields
    query: str | None = None  # retrieval query text
    chunks: list[dict[str, Any]] | None = None  # retrieved chunks [{id, text, score, metadata}]
    vector_store: str | None = None  # "chromadb", "pinecone", etc.
    collection: str | None = None  # collection / index name
    top_k: int | None = None
    total_chunks_searched: int | None = None
    dimensions: int | None = None  # embedding dimensions
    scores: dict[str, float] | None = None  # RAG quality scores

    # MCP-specific fields
    server_name: str | None = None
    server_url: str | None = None
    capabilities: list[str] | None = None
    arguments: dict[str, Any] | None = None
    result: Any = None
    is_error: bool | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, dropping None fields for compact storage."""
        payload: dict[str, Any] = {
            "event_type": self.event_type.value,
            "timestamp": self.timestamp,
            "event_id": self.event_id,
        }
        # Optional fields, in the order they appear in the output dict.
        optional_fields = (
            "provider", "model", "messages", "response", "temperature",
            "max_tokens", "tools", "tool_name", "tool_input", "tool_output",
            "decision", "reasoning", "error_type", "error_message",
            "input_tokens", "output_tokens", "cost_usd", "duration_ms",
            # RAG fields
            "query", "chunks", "vector_store", "collection", "top_k",
            "total_chunks_searched", "dimensions", "scores",
            # MCP fields
            "server_name", "server_url", "capabilities", "arguments", "result", "is_error",
        )
        for name in optional_fields:
            value = getattr(self, name)
            if value is not None:
                payload[name] = value
        if self.metadata:
            payload["metadata"] = self.metadata
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TraceEvent:
        """Deserialize from a dict, silently ignoring unknown keys."""
        payload = dict(data)  # avoid mutating the caller's dict
        payload["event_type"] = EventType(payload["event_type"])
        known = cls.__dataclass_fields__
        return cls(**{key: value for key, value in payload.items() if key in known})
141
+
142
+
143
@dataclass
class TraceMetadata:
    """Metadata about the trace recording environment."""

    recorded_at: float = field(default_factory=time.time)
    trace_ops_version: str = "0.5.0"
    python_version: str = ""
    framework: str | None = None  # "langchain", "crewai", "openai-agents-sdk", "custom"
    description: str = ""
    tags: list[str] = field(default_factory=list)
    env: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting fields that are empty/unset."""
        out: dict[str, Any] = {
            "recorded_at": self.recorded_at,
            "trace_ops_version": self.trace_ops_version,
        }
        for name in ("python_version", "framework", "description", "tags", "env"):
            value = getattr(self, name)
            if value:  # skip "", None, [], {}
                out[name] = value
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TraceMetadata:
        """Deserialize from a dict, silently ignoring unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: value for key, value in data.items() if key in known})
175
+
176
+
177
+ @dataclass
178
+ class Trace:
179
+ """A complete recording of an agent execution.
180
+
181
+ This is the top-level object that gets saved to a cassette file.
182
+ It contains all events from a single agent run, plus metadata
183
+ about the recording environment.
184
+ """
185
+
186
+ trace_id: str = field(default_factory=lambda: uuid4().hex[:16])
187
+ events: list[TraceEvent] = field(default_factory=list)
188
+ metadata: TraceMetadata = field(default_factory=TraceMetadata)
189
+
190
+ # Summary stats (computed after recording)
191
+ total_llm_calls: int = 0
192
+ total_tool_calls: int = 0
193
+ total_tokens: int = 0
194
+ total_cost_usd: float = 0.0
195
+ total_duration_ms: float = 0.0
196
+
197
+ # Thread safety — protects events list and stats
198
+ _lock: threading.Lock = field(
199
+ default_factory=threading.Lock, init=False, repr=False, compare=False
200
+ )
201
+
202
+ def add_event(self, event: TraceEvent) -> None:
203
+ """Add an event to the trace (thread-safe)."""
204
+ with self._lock:
205
+ self.events.append(event)
206
+ self._update_stats(event)
207
+
208
+ def _update_stats(self, event: TraceEvent) -> None:
209
+ """Update summary statistics after adding an event."""
210
+ if event.event_type == EventType.LLM_RESPONSE:
211
+ self.total_llm_calls += 1
212
+ if event.input_tokens:
213
+ self.total_tokens += event.input_tokens
214
+ if event.output_tokens:
215
+ self.total_tokens += event.output_tokens
216
+ if event.cost_usd:
217
+ self.total_cost_usd += event.cost_usd
218
+ elif event.event_type == EventType.TOOL_RESULT:
219
+ self.total_tool_calls += 1
220
+ elif event.event_type == EventType.EMBEDDING_CALL:
221
+ if event.cost_usd:
222
+ self.total_cost_usd += event.cost_usd
223
+ if event.duration_ms:
224
+ self.total_duration_ms += event.duration_ms
225
+
226
+ def finalize(self) -> None:
227
+ """Compute final stats after recording is complete."""
228
+ self.total_llm_calls = sum(
229
+ 1 for e in self.events if e.event_type == EventType.LLM_RESPONSE
230
+ )
231
+ self.total_tool_calls = sum(
232
+ 1 for e in self.events if e.event_type == EventType.TOOL_RESULT
233
+ )
234
+ self.total_tokens = sum(
235
+ (e.input_tokens or 0) + (e.output_tokens or 0)
236
+ for e in self.events
237
+ if e.event_type == EventType.LLM_RESPONSE
238
+ )
239
+ self.total_cost_usd = sum(
240
+ e.cost_usd or 0.0
241
+ for e in self.events
242
+ if e.cost_usd is not None
243
+ )
244
+ self.total_duration_ms = sum(
245
+ e.duration_ms or 0.0 for e in self.events if e.duration_ms
246
+ )
247
+
248
+ @property
249
+ def llm_events(self) -> list[TraceEvent]:
250
+ """Get only LLM request/response events."""
251
+ return [
252
+ e for e in self.events
253
+ if e.event_type in (EventType.LLM_REQUEST, EventType.LLM_RESPONSE)
254
+ ]
255
+
256
+ @property
257
+ def tool_events(self) -> list[TraceEvent]:
258
+ """Get only tool call/result events."""
259
+ return [
260
+ e for e in self.events
261
+ if e.event_type in (EventType.TOOL_CALL, EventType.TOOL_RESULT)
262
+ ]
263
+
264
+ @property
265
+ def retrieval_events(self) -> list[TraceEvent]:
266
+ """Get all retrieval events (RAG vector store queries)."""
267
+ return [e for e in self.events if e.event_type == EventType.RETRIEVAL]
268
+
269
+ @property
270
+ def embedding_events(self) -> list[TraceEvent]:
271
+ """Get all embedding call events."""
272
+ return [e for e in self.events if e.event_type == EventType.EMBEDDING_CALL]
273
+
274
+ @property
275
+ def mcp_events(self) -> list[TraceEvent]:
276
+ """Get all MCP-related events."""
277
+ return [
278
+ e for e in self.events
279
+ if e.event_type in (
280
+ EventType.MCP_SERVER_CONNECT,
281
+ EventType.MCP_TOOL_CALL,
282
+ EventType.MCP_TOOL_RESULT,
283
+ )
284
+ ]
285
+
286
+ @property
287
+ def rag_scores(self) -> dict[str, float] | None:
288
+ """Get cached RAG quality scores from the cassette, if any."""
289
+ for e in self.events:
290
+ if e.event_type == EventType.RAG_SCORES and e.scores:
291
+ return e.scores
292
+ return None
293
+
294
+ @property
295
+ def trajectory(self) -> list[str]:
296
+ """Get the high-level trajectory as a list of step descriptions.
297
+
298
+ Returns something like:
299
+ ["llm_call:gpt-4o", "tool:search_files", "llm_call:gpt-4o", "tool:read_file"]
300
+ """
301
+ steps = []
302
+ for event in self.events:
303
+ if event.event_type == EventType.LLM_REQUEST:
304
+ steps.append(f"llm_call:{event.model or 'unknown'}")
305
+ elif event.event_type == EventType.TOOL_CALL:
306
+ steps.append(f"tool:{event.tool_name or 'unknown'}")
307
+ elif event.event_type == EventType.AGENT_DECISION:
308
+ steps.append(f"decision:{event.decision or 'unknown'}")
309
+ elif event.event_type == EventType.ERROR:
310
+ steps.append(f"error:{event.error_type or 'unknown'}")
311
+ elif event.event_type == EventType.RETRIEVAL:
312
+ steps.append(f"retrieval:{event.vector_store or 'unknown'}")
313
+ elif event.event_type == EventType.EMBEDDING_CALL:
314
+ steps.append(f"embedding:{event.model or 'unknown'}")
315
+ elif event.event_type == EventType.MCP_TOOL_CALL:
316
+ steps.append(f"mcp:{event.server_name or 'unknown'}.{event.tool_name or 'unknown'}")
317
+ return steps
318
+
319
+ def fingerprint(self) -> str:
320
+ """Generate a hash fingerprint of the trajectory.
321
+
322
+ Two traces with the same fingerprint took the same path
323
+ (same sequence of LLM calls, tool calls, and decisions).
324
+ """
325
+ trajectory_str = "|".join(self.trajectory)
326
+ return hashlib.sha256(trajectory_str.encode()).hexdigest()[:16]
327
+
328
+ def to_dict(self) -> dict[str, Any]:
329
+ """Serialize the full trace to a dict."""
330
+ return {
331
+ "version": "1",
332
+ "trace_id": self.trace_id,
333
+ "metadata": self.metadata.to_dict(),
334
+ "events": [e.to_dict() for e in self.events],
335
+ "summary": {
336
+ "total_llm_calls": self.total_llm_calls,
337
+ "total_tool_calls": self.total_tool_calls,
338
+ "total_tokens": self.total_tokens,
339
+ "total_cost_usd": self.total_cost_usd,
340
+ "total_duration_ms": self.total_duration_ms,
341
+ "trajectory": self.trajectory,
342
+ "fingerprint": self.fingerprint(),
343
+ },
344
+ }
345
+
346
+ @classmethod
347
+ def from_dict(cls, data: dict[str, Any]) -> Trace:
348
+ """Deserialize a trace from a dict."""
349
+ trace = cls(
350
+ trace_id=data.get("trace_id", uuid4().hex[:16]),
351
+ metadata=TraceMetadata.from_dict(data.get("metadata", {})),
352
+ )
353
+ for event_data in data.get("events", []):
354
+ trace.events.append(TraceEvent.from_dict(event_data))
355
+ trace.finalize()
356
+ return trace
@@ -0,0 +1,129 @@
1
+ """Budget and behavioural assertions for agent traces.
2
+
3
+ These helpers let you guard against cost overruns, token bloat,
4
+ excessive LLM round-trips, and infinite tool-call loops directly
5
+ inside your test suite.
6
+
7
+ Usage::
8
+
9
+ from trace_ops.assertions import assert_cost_under, assert_no_loops
10
+
11
+ with Recorder() as rec:
12
+ agent.run("Summarize the report")
13
+
14
+ assert_cost_under(rec.trace, max_usd=0.50)
15
+ assert_no_loops(rec.trace)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from trace_ops._types import EventType, Trace
21
+
22
+
23
class BudgetExceededError(AssertionError):
    """Raised when an agent trace exceeds a defined budget.

    Subclasses ``AssertionError`` so a plain ``except AssertionError``
    (and test runners that treat assertion failures specially) will
    catch it alongside ordinary ``assert`` failures.
    """


class AgentLoopError(AssertionError):
    """Raised when an agent trace exhibits loop-like behaviour.

    Also an ``AssertionError`` subclass, for the same reason as above.
    """
29
+
30
+
31
+ # ── Public assertion functions ──────────────────────────────────────
32
+
33
+
34
def assert_cost_under(trace: Trace, *, max_usd: float) -> None:
    """Assert that the trace's total cost stays within budget.

    Args:
        trace: The recorded agent trace.
        max_usd: Maximum allowed cost in US dollars.

    Raises:
        BudgetExceededError: If ``trace.total_cost_usd`` exceeds *max_usd*.
    """
    if trace.total_cost_usd <= max_usd:
        return
    raise BudgetExceededError(
        f"Trace cost ${trace.total_cost_usd:.4f} exceeds budget of ${max_usd:.4f}.\n"
        f"The agent made {trace.total_llm_calls} LLM calls "
        f"using {trace.total_tokens:,} tokens.\n"
        f"Optimise prompts or reduce tool-call loops to lower cost."
    )
51
+
52
+
53
def assert_tokens_under(trace: Trace, *, max_tokens: int) -> None:
    """Assert that total token usage (input + output) stays under a limit.

    Args:
        trace: The recorded agent trace.
        max_tokens: Maximum allowed token count (input + output).

    Raises:
        BudgetExceededError: If ``trace.total_tokens`` exceeds *max_tokens*.
    """
    if trace.total_tokens <= max_tokens:
        return
    raise BudgetExceededError(
        f"Trace used {trace.total_tokens:,} tokens, "
        f"exceeding limit of {max_tokens:,}.\n"
        f"The agent made {trace.total_llm_calls} LLM calls.\n"
        f"Reduce prompt size or limit tool-call depth."
    )
70
+
71
+
72
def assert_max_llm_calls(trace: Trace, *, max_calls: int) -> None:
    """Assert that the agent stayed within an LLM round-trip budget.

    Args:
        trace: The recorded agent trace.
        max_calls: Maximum allowed LLM calls.

    Raises:
        BudgetExceededError: If ``trace.total_llm_calls`` exceeds *max_calls*.
    """
    if trace.total_llm_calls <= max_calls:
        return
    raise BudgetExceededError(
        f"Trace made {trace.total_llm_calls} LLM calls, "
        f"exceeding limit of {max_calls}.\n"
        f"Trajectory: {' → '.join(trace.trajectory)}\n"
        f"The agent may be stuck in a loop or using an inefficient strategy."
    )
89
+
90
+
91
def assert_no_loops(
    trace: Trace,
    *,
    max_consecutive_same_tool: int = 3,
) -> None:
    """Assert that the trace doesn't contain tool-call loops.

    Walks adjacent pairs of ``TOOL_CALL`` events and tracks the length
    of the current run of identical ``tool_name`` values. A long run
    typically means the agent is stuck retrying the same action.

    Args:
        trace: The recorded agent trace.
        max_consecutive_same_tool: Maximum allowed consecutive calls
            to the same tool before raising.

    Raises:
        AgentLoopError: If a run of same-tool calls exceeds the limit.
    """
    calls = [e for e in trace.events if e.event_type == EventType.TOOL_CALL]

    # A run starts at length 1; it only grows when neighbours match.
    run_length = 1
    for previous, current in zip(calls, calls[1:]):
        if current.tool_name != previous.tool_name:
            run_length = 1
            continue
        run_length += 1
        if run_length > max_consecutive_same_tool:
            raise AgentLoopError(
                f"Detected {run_length} consecutive calls to tool "
                f"'{current.tool_name}' "
                f"(limit: {max_consecutive_same_tool}).\n"
                f"The agent may be stuck in an infinite loop.\n"
                f"Check the agent's exit conditions or add loop guards."
            )