traceops 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trace_ops/__init__.py +153 -0
- trace_ops/_types.py +356 -0
- trace_ops/assertions.py +129 -0
- trace_ops/cassette.py +154 -0
- trace_ops/cli.py +810 -0
- trace_ops/diff.py +295 -0
- trace_ops/export/__init__.py +20 -0
- trace_ops/export/finetune.py +143 -0
- trace_ops/interceptors/__init__.py +10 -0
- trace_ops/interceptors/crewai.py +200 -0
- trace_ops/interceptors/langchain.py +375 -0
- trace_ops/interceptors/langgraph.py +319 -0
- trace_ops/mcp/__init__.py +31 -0
- trace_ops/mcp/diff.py +133 -0
- trace_ops/mcp/events.py +42 -0
- trace_ops/mcp/interceptor.py +146 -0
- trace_ops/normalize.py +238 -0
- trace_ops/pytest_plugin.py +314 -0
- trace_ops/rag/__init__.py +71 -0
- trace_ops/rag/assertions.py +271 -0
- trace_ops/rag/context_analysis.py +227 -0
- trace_ops/rag/diff.py +231 -0
- trace_ops/rag/export.py +269 -0
- trace_ops/rag/interceptors/__init__.py +27 -0
- trace_ops/rag/interceptors/chromadb_interceptor.py +66 -0
- trace_ops/rag/interceptors/embedding_interceptor.py +54 -0
- trace_ops/rag/interceptors/langchain_retriever.py +63 -0
- trace_ops/rag/interceptors/llamaindex_retriever.py +58 -0
- trace_ops/rag/interceptors/pinecone_interceptor.py +54 -0
- trace_ops/rag/interceptors/qdrant_interceptor.py +47 -0
- trace_ops/rag/recorder.py +93 -0
- trace_ops/rag/scorers.py +210 -0
- trace_ops/rag/snapshot.py +335 -0
- trace_ops/recorder.py +972 -0
- trace_ops/replayer.py +626 -0
- trace_ops/reporters/__init__.py +6 -0
- trace_ops/reporters/cost_dashboard.py +243 -0
- trace_ops/reporters/html.py +323 -0
- trace_ops/reporters/terminal.py +304 -0
- trace_ops/semantic/__init__.py +31 -0
- trace_ops/semantic/assertions.py +57 -0
- trace_ops/semantic/similarity.py +163 -0
- trace_ops/streaming.py +570 -0
- traceops-0.5.0.dist-info/METADATA +436 -0
- traceops-0.5.0.dist-info/RECORD +48 -0
- traceops-0.5.0.dist-info/WHEEL +4 -0
- traceops-0.5.0.dist-info/entry_points.txt +5 -0
- traceops-0.5.0.dist-info/licenses/LICENSE +21 -0
trace_ops/__init__.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""traceops — record and replay LLM agent traces for deterministic regression testing.
|
|
2
|
+
|
|
3
|
+
Framework-agnostic. Works with OpenAI, Anthropic, LiteLLM, LangChain, CrewAI,
|
|
4
|
+
or any custom agent. No LLM calls during replay — tests are fast, free, deterministic.
|
|
5
|
+
|
|
6
|
+
Quick start:
|
|
7
|
+
from trace_ops import Recorder, Replayer
|
|
8
|
+
|
|
9
|
+
# Record an agent run
|
|
10
|
+
with Recorder(save_to="cassettes/test_math.yaml") as rec:
|
|
11
|
+
result = agent.run("What is 2+2?")
|
|
12
|
+
|
|
13
|
+
# Replay deterministically (zero API calls)
|
|
14
|
+
with Replayer("cassettes/test_math.yaml"):
|
|
15
|
+
result = agent.run("What is 2+2?")
|
|
16
|
+
assert result == "4"
|
|
17
|
+
|
|
18
|
+
# Also works async:
|
|
19
|
+
async with Recorder(save_to="cassettes/test.yaml") as rec:
|
|
20
|
+
result = await agent.arun("What is 2+2?")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from trace_ops._types import (
|
|
24
|
+
EventType,
|
|
25
|
+
Trace,
|
|
26
|
+
TraceEvent,
|
|
27
|
+
TraceMetadata,
|
|
28
|
+
)
|
|
29
|
+
from trace_ops.assertions import (
|
|
30
|
+
AgentLoopError,
|
|
31
|
+
BudgetExceededError,
|
|
32
|
+
assert_cost_under,
|
|
33
|
+
assert_max_llm_calls,
|
|
34
|
+
assert_no_loops,
|
|
35
|
+
assert_tokens_under,
|
|
36
|
+
)
|
|
37
|
+
from trace_ops.cassette import (
|
|
38
|
+
CassetteMismatchError,
|
|
39
|
+
CassetteNotFoundError,
|
|
40
|
+
load_cassette,
|
|
41
|
+
save_cassette,
|
|
42
|
+
)
|
|
43
|
+
from trace_ops.diff import TraceDiff, assert_trace_unchanged, diff_traces
|
|
44
|
+
from trace_ops.normalize import (
|
|
45
|
+
NormalizedResponse,
|
|
46
|
+
NormalizedToolCall,
|
|
47
|
+
normalize_for_comparison,
|
|
48
|
+
normalize_response,
|
|
49
|
+
)
|
|
50
|
+
from trace_ops.recorder import Recorder
|
|
51
|
+
from trace_ops.replayer import Replayer
|
|
52
|
+
from trace_ops.reporters.cost_dashboard import CostDashboard, CostSummary
|
|
53
|
+
|
|
54
|
+
# RAG add-on (graceful degradation if not installed)
|
|
55
|
+
try:
|
|
56
|
+
from trace_ops.rag.diff import RAGDiffResult, diff_rag
|
|
57
|
+
from trace_ops.rag.assertions import (
|
|
58
|
+
RAGAssertionError,
|
|
59
|
+
assert_chunk_count,
|
|
60
|
+
assert_retrieval_latency,
|
|
61
|
+
assert_min_relevance_score,
|
|
62
|
+
assert_no_retrieval_drift,
|
|
63
|
+
assert_rag_scores,
|
|
64
|
+
)
|
|
65
|
+
from trace_ops.rag.scorers import RagasScorer, DeepEvalScorer
|
|
66
|
+
from trace_ops.rag.snapshot import RetrieverSnapshot
|
|
67
|
+
from trace_ops.rag.context_analysis import analyze_context_usage
|
|
68
|
+
_RAG_AVAILABLE = True
|
|
69
|
+
except ImportError:
|
|
70
|
+
_RAG_AVAILABLE = False
|
|
71
|
+
|
|
72
|
+
# MCP add-on
|
|
73
|
+
try:
|
|
74
|
+
from trace_ops.mcp.diff import MCPDiffResult, diff_mcp
|
|
75
|
+
_MCP_AVAILABLE = True
|
|
76
|
+
except ImportError:
|
|
77
|
+
_MCP_AVAILABLE = False
|
|
78
|
+
|
|
79
|
+
# Semantic add-on
|
|
80
|
+
try:
|
|
81
|
+
from trace_ops.semantic.similarity import SemanticDiffResult, semantic_similarity
|
|
82
|
+
from trace_ops.semantic.assertions import SemanticRegressionError, assert_semantic_similarity
|
|
83
|
+
_SEMANTIC_AVAILABLE = True
|
|
84
|
+
except ImportError:
|
|
85
|
+
_SEMANTIC_AVAILABLE = False
|
|
86
|
+
|
|
87
|
+
# Export add-on
|
|
88
|
+
try:
|
|
89
|
+
from trace_ops.export.finetune import to_openai_finetune, to_anthropic_finetune
|
|
90
|
+
_EXPORT_AVAILABLE = True
|
|
91
|
+
except ImportError:
|
|
92
|
+
_EXPORT_AVAILABLE = False
|
|
93
|
+
|
|
94
|
+
__version__ = "0.5.0"
|
|
95
|
+
|
|
96
|
+
__all__ = [
|
|
97
|
+
# Core
|
|
98
|
+
"Recorder",
|
|
99
|
+
"Replayer",
|
|
100
|
+
# Types
|
|
101
|
+
"Trace",
|
|
102
|
+
"TraceEvent",
|
|
103
|
+
"TraceMetadata",
|
|
104
|
+
"EventType",
|
|
105
|
+
# Cassette
|
|
106
|
+
"save_cassette",
|
|
107
|
+
"load_cassette",
|
|
108
|
+
"CassetteNotFoundError",
|
|
109
|
+
"CassetteMismatchError",
|
|
110
|
+
# Diff
|
|
111
|
+
"TraceDiff",
|
|
112
|
+
"diff_traces",
|
|
113
|
+
"assert_trace_unchanged",
|
|
114
|
+
# Normalization
|
|
115
|
+
"NormalizedToolCall",
|
|
116
|
+
"NormalizedResponse",
|
|
117
|
+
"normalize_response",
|
|
118
|
+
"normalize_for_comparison",
|
|
119
|
+
# Assertions
|
|
120
|
+
"assert_cost_under",
|
|
121
|
+
"assert_tokens_under",
|
|
122
|
+
"assert_max_llm_calls",
|
|
123
|
+
"assert_no_loops",
|
|
124
|
+
"BudgetExceededError",
|
|
125
|
+
"AgentLoopError",
|
|
126
|
+
# Reporters
|
|
127
|
+
"CostDashboard",
|
|
128
|
+
"CostSummary",
|
|
129
|
+
# RAG (available when trace_ops[rag] installed)
|
|
130
|
+
"diff_rag",
|
|
131
|
+
"RAGDiffResult",
|
|
132
|
+
"RAGAssertionError",
|
|
133
|
+
"assert_chunk_count",
|
|
134
|
+
"assert_retrieval_latency",
|
|
135
|
+
"assert_min_relevance_score",
|
|
136
|
+
"assert_no_retrieval_drift",
|
|
137
|
+
"assert_rag_scores",
|
|
138
|
+
"RagasScorer",
|
|
139
|
+
"DeepEvalScorer",
|
|
140
|
+
"RetrieverSnapshot",
|
|
141
|
+
"analyze_context_usage",
|
|
142
|
+
# MCP
|
|
143
|
+
"diff_mcp",
|
|
144
|
+
"MCPDiffResult",
|
|
145
|
+
# Semantic
|
|
146
|
+
"semantic_similarity",
|
|
147
|
+
"SemanticDiffResult",
|
|
148
|
+
"SemanticRegressionError",
|
|
149
|
+
"assert_semantic_similarity",
|
|
150
|
+
# Export / fine-tune
|
|
151
|
+
"to_openai_finetune",
|
|
152
|
+
"to_anthropic_finetune",
|
|
153
|
+
]
|
trace_ops/_types.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Core data model for agent execution traces.
|
|
2
|
+
|
|
3
|
+
A Trace is a complete recording of an agent run. It contains a sequence of
|
|
4
|
+
Events — each event is either an LLM call, a tool invocation, or an agent
|
|
5
|
+
decision. The trace captures everything needed to deterministically replay
|
|
6
|
+
the agent's execution without making real API calls.
|
|
7
|
+
|
|
8
|
+
Key design decision: we record at the SDK level (intercepting openai.chat.completions.create,
|
|
9
|
+
anthropic.messages.create, etc.) rather than at the HTTP level (like VCR.py). This gives us
|
|
10
|
+
semantic understanding of what happened — we know "this was a tool call" vs "this was a
|
|
11
|
+
completion" — which HTTP-level recording can't distinguish.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
import threading
|
|
19
|
+
import time
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from enum import Enum
|
|
22
|
+
from typing import Any
|
|
23
|
+
from uuid import uuid4
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EventType(str, Enum):
    """Types of events in an agent trace.

    Mixes in ``str`` so each member compares equal to (and serializes as)
    its plain string value — convenient for cassette files.
    """

    # Core agent events
    LLM_REQUEST = "llm_request"
    LLM_RESPONSE = "llm_response"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
    AGENT_DECISION = "agent_decision"
    ERROR = "error"

    # RAG events
    RETRIEVAL = "retrieval"  # vector-store query
    EMBEDDING_CALL = "embedding_call"  # embedding-model invocation
    RAG_SCORES = "rag_scores"  # cached RAG quality scores

    # MCP events
    MCP_SERVER_CONNECT = "mcp_server_connect"
    MCP_TOOL_CALL = "mcp_tool_call"
    MCP_TOOL_RESULT = "mcp_tool_result"
|
|
46
|
+
|
|
47
|
+
@dataclass
class TraceEvent:
    """A single event in an agent execution trace.

    Events are the atoms of a trace. Each event records one interaction:
    an LLM call, a tool invocation, or an agent-level decision.

    Only the field group matching ``event_type`` is normally populated;
    all other optional fields stay ``None`` and are dropped by
    :meth:`to_dict` for compact storage.
    """

    event_type: EventType
    timestamp: float = field(default_factory=time.time)  # wall-clock time the event was recorded
    event_id: str = field(default_factory=lambda: uuid4().hex[:12])  # short random id, unique per event

    # LLM-specific fields
    provider: str | None = None  # "openai", "anthropic", "litellm"
    model: str | None = None
    messages: list[dict[str, Any]] | None = None  # input messages
    response: dict[str, Any] | None = None  # full response object
    temperature: float | None = None
    max_tokens: int | None = None
    tools: list[dict[str, Any]] | None = None  # tool definitions sent to LLM

    # Tool-specific fields
    tool_name: str | None = None
    tool_input: dict[str, Any] | None = None
    tool_output: Any = None

    # Agent decision fields
    decision: str | None = None  # e.g., "delegate_to_agent_b", "select_tool_search"
    reasoning: str | None = None

    # Error fields
    error_type: str | None = None
    error_message: str | None = None

    # Cost tracking
    input_tokens: int | None = None
    output_tokens: int | None = None
    cost_usd: float | None = None

    # Timing
    duration_ms: float | None = None

    # Metadata (free-form; serialized only when non-empty)
    metadata: dict[str, Any] = field(default_factory=dict)

    # RAG-specific fields
    query: str | None = None  # retrieval query text
    chunks: list[dict[str, Any]] | None = None  # retrieved chunks [{id, text, score, metadata}]
    vector_store: str | None = None  # "chromadb", "pinecone", etc.
    collection: str | None = None  # collection / index name
    top_k: int | None = None
    total_chunks_searched: int | None = None
    dimensions: int | None = None  # embedding dimensions
    scores: dict[str, float] | None = None  # RAG quality scores

    # MCP-specific fields
    server_name: str | None = None
    server_url: str | None = None
    capabilities: list[str] | None = None
    arguments: dict[str, Any] | None = None
    result: Any = None
    is_error: bool | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, dropping None fields for compact storage.

        Returns:
            A dict with ``event_type`` (as its string value), ``timestamp``,
            ``event_id``, plus every optional field that is not ``None``,
            and ``metadata`` only when non-empty.
        """
        d: dict[str, Any] = {
            "event_type": self.event_type.value,
            "timestamp": self.timestamp,
            "event_id": self.event_id,
        }
        # Explicit key list (rather than iterating all dataclass fields) so
        # the on-disk cassette schema is stable and deliberately ordered.
        for key in [
            "provider", "model", "messages", "response", "temperature",
            "max_tokens", "tools", "tool_name", "tool_input", "tool_output",
            "decision", "reasoning", "error_type", "error_message",
            "input_tokens", "output_tokens", "cost_usd", "duration_ms",
            # RAG fields
            "query", "chunks", "vector_store", "collection", "top_k",
            "total_chunks_searched", "dimensions", "scores",
            # MCP fields
            "server_name", "server_url", "capabilities", "arguments", "result", "is_error",
        ]:
            val = getattr(self, key)
            if val is not None:
                d[key] = val
        if self.metadata:
            d["metadata"] = self.metadata
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TraceEvent:
        """Deserialize from a dict.

        Args:
            data: A mapping as produced by :meth:`to_dict`; must contain
                an ``event_type`` key with a valid :class:`EventType` value.

        Returns:
            A new :class:`TraceEvent`. Keys not matching a dataclass field
            are silently dropped, so newer cassettes load on older code.
        """
        data = dict(data)  # shallow copy so the caller's dict is not mutated
        data["event_type"] = EventType(data["event_type"])
        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass
class TraceMetadata:
    """Metadata about the trace recording environment."""

    recorded_at: float = field(default_factory=time.time)
    trace_ops_version: str = "0.5.0"
    python_version: str = ""
    framework: str | None = None  # "langchain", "crewai", "openai-agents-sdk", "custom"
    description: str = ""
    tags: list[str] = field(default_factory=list)
    env: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict.

        ``recorded_at`` and ``trace_ops_version`` are always present;
        the remaining fields are written only when truthy, keeping
        cassette headers compact.
        """
        out: dict[str, Any] = {
            "recorded_at": self.recorded_at,
            "trace_ops_version": self.trace_ops_version,
        }
        optional = {
            "python_version": self.python_version,
            "framework": self.framework,
            "description": self.description,
            "tags": self.tags,
            "env": self.env,
        }
        out.update({key: value for key, value in optional.items() if value})
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TraceMetadata:
        """Build an instance from *data*, silently ignoring unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: value for key, value in data.items() if key in known})
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass
class Trace:
    """A complete recording of an agent execution.

    This is the top-level object that gets saved to a cassette file.
    It contains all events from a single agent run, plus metadata
    about the recording environment.
    """

    trace_id: str = field(default_factory=lambda: uuid4().hex[:16])  # short random id for this run
    events: list[TraceEvent] = field(default_factory=list)
    metadata: TraceMetadata = field(default_factory=TraceMetadata)

    # Summary stats (computed after recording)
    total_llm_calls: int = 0
    total_tool_calls: int = 0
    total_tokens: int = 0
    total_cost_usd: float = 0.0
    total_duration_ms: float = 0.0

    # Thread safety — protects events list and stats
    _lock: threading.Lock = field(
        default_factory=threading.Lock, init=False, repr=False, compare=False
    )

    def add_event(self, event: TraceEvent) -> None:
        """Add an event to the trace (thread-safe)."""
        with self._lock:
            self.events.append(event)
            self._update_stats(event)

    def _update_stats(self, event: TraceEvent) -> None:
        """Update summary statistics after adding an event.

        Incremental counterpart of :meth:`finalize`; called under
        ``_lock`` from :meth:`add_event`.

        NOTE(review): cost accounting differs slightly from finalize():
        this method adds ``cost_usd`` only for LLM_RESPONSE and
        EMBEDDING_CALL events, while finalize() sums cost_usd across all
        events. For events of other types carrying a cost the two can
        disagree until finalize() runs — confirm which is intended.
        """
        if event.event_type == EventType.LLM_RESPONSE:
            self.total_llm_calls += 1
            if event.input_tokens:
                self.total_tokens += event.input_tokens
            if event.output_tokens:
                self.total_tokens += event.output_tokens
            if event.cost_usd:
                self.total_cost_usd += event.cost_usd
        elif event.event_type == EventType.TOOL_RESULT:
            self.total_tool_calls += 1
        elif event.event_type == EventType.EMBEDDING_CALL:
            if event.cost_usd:
                self.total_cost_usd += event.cost_usd
        # Duration accumulates for every event type.
        if event.duration_ms:
            self.total_duration_ms += event.duration_ms

    def finalize(self) -> None:
        """Compute final stats after recording is complete.

        Recomputes every summary field from scratch over ``events``.
        Not lock-guarded — intended to run once recording has stopped
        and no other thread is appending.
        """
        self.total_llm_calls = sum(
            1 for e in self.events if e.event_type == EventType.LLM_RESPONSE
        )
        self.total_tool_calls = sum(
            1 for e in self.events if e.event_type == EventType.TOOL_RESULT
        )
        # Tokens are counted only on responses (requests carry no usage info).
        self.total_tokens = sum(
            (e.input_tokens or 0) + (e.output_tokens or 0)
            for e in self.events
            if e.event_type == EventType.LLM_RESPONSE
        )
        self.total_cost_usd = sum(
            e.cost_usd or 0.0
            for e in self.events
            if e.cost_usd is not None
        )
        self.total_duration_ms = sum(
            e.duration_ms or 0.0 for e in self.events if e.duration_ms
        )

    @property
    def llm_events(self) -> list[TraceEvent]:
        """Get only LLM request/response events."""
        return [
            e for e in self.events
            if e.event_type in (EventType.LLM_REQUEST, EventType.LLM_RESPONSE)
        ]

    @property
    def tool_events(self) -> list[TraceEvent]:
        """Get only tool call/result events."""
        return [
            e for e in self.events
            if e.event_type in (EventType.TOOL_CALL, EventType.TOOL_RESULT)
        ]

    @property
    def retrieval_events(self) -> list[TraceEvent]:
        """Get all retrieval events (RAG vector store queries)."""
        return [e for e in self.events if e.event_type == EventType.RETRIEVAL]

    @property
    def embedding_events(self) -> list[TraceEvent]:
        """Get all embedding call events."""
        return [e for e in self.events if e.event_type == EventType.EMBEDDING_CALL]

    @property
    def mcp_events(self) -> list[TraceEvent]:
        """Get all MCP-related events."""
        return [
            e for e in self.events
            if e.event_type in (
                EventType.MCP_SERVER_CONNECT,
                EventType.MCP_TOOL_CALL,
                EventType.MCP_TOOL_RESULT,
            )
        ]

    @property
    def rag_scores(self) -> dict[str, float] | None:
        """Get cached RAG quality scores from the cassette, if any.

        Returns the scores of the first RAG_SCORES event that carries
        a non-empty ``scores`` dict, or ``None`` if there is none.
        """
        for e in self.events:
            if e.event_type == EventType.RAG_SCORES and e.scores:
                return e.scores
        return None

    @property
    def trajectory(self) -> list[str]:
        """Get the high-level trajectory as a list of step descriptions.

        Returns something like:
            ["llm_call:gpt-4o", "tool:search_files", "llm_call:gpt-4o", "tool:read_file"]

        Event types without a mapping below (e.g. LLM_RESPONSE,
        TOOL_RESULT) are intentionally omitted from the trajectory.
        """
        steps = []
        for event in self.events:
            if event.event_type == EventType.LLM_REQUEST:
                steps.append(f"llm_call:{event.model or 'unknown'}")
            elif event.event_type == EventType.TOOL_CALL:
                steps.append(f"tool:{event.tool_name or 'unknown'}")
            elif event.event_type == EventType.AGENT_DECISION:
                steps.append(f"decision:{event.decision or 'unknown'}")
            elif event.event_type == EventType.ERROR:
                steps.append(f"error:{event.error_type or 'unknown'}")
            elif event.event_type == EventType.RETRIEVAL:
                steps.append(f"retrieval:{event.vector_store or 'unknown'}")
            elif event.event_type == EventType.EMBEDDING_CALL:
                steps.append(f"embedding:{event.model or 'unknown'}")
            elif event.event_type == EventType.MCP_TOOL_CALL:
                steps.append(f"mcp:{event.server_name or 'unknown'}.{event.tool_name or 'unknown'}")
        return steps

    def fingerprint(self) -> str:
        """Generate a hash fingerprint of the trajectory.

        Two traces with the same fingerprint took the same path
        (same sequence of LLM calls, tool calls, and decisions).

        Returns:
            First 16 hex chars of the SHA-256 of the '|'-joined trajectory.
        """
        trajectory_str = "|".join(self.trajectory)
        return hashlib.sha256(trajectory_str.encode()).hexdigest()[:16]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the full trace to a dict.

        The ``"version": "1"`` key is the cassette schema version, so
        future format changes can be detected at load time.
        """
        return {
            "version": "1",
            "trace_id": self.trace_id,
            "metadata": self.metadata.to_dict(),
            "events": [e.to_dict() for e in self.events],
            "summary": {
                "total_llm_calls": self.total_llm_calls,
                "total_tool_calls": self.total_tool_calls,
                "total_tokens": self.total_tokens,
                "total_cost_usd": self.total_cost_usd,
                "total_duration_ms": self.total_duration_ms,
                "trajectory": self.trajectory,
                "fingerprint": self.fingerprint(),
            },
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Trace:
        """Deserialize a trace from a dict.

        The stored "summary" block is ignored; finalize() recomputes all
        summary stats from the loaded events instead, so stale or
        hand-edited summaries cannot drift from the event data.
        """
        trace = cls(
            trace_id=data.get("trace_id", uuid4().hex[:16]),
            metadata=TraceMetadata.from_dict(data.get("metadata", {})),
        )
        for event_data in data.get("events", []):
            trace.events.append(TraceEvent.from_dict(event_data))
        trace.finalize()
        return trace
|
trace_ops/assertions.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Budget and behavioural assertions for agent traces.
|
|
2
|
+
|
|
3
|
+
These helpers let you guard against cost overruns, token bloat,
|
|
4
|
+
excessive LLM round-trips, and infinite tool-call loops directly
|
|
5
|
+
inside your test suite.
|
|
6
|
+
|
|
7
|
+
Usage::
|
|
8
|
+
|
|
9
|
+
from trace_ops.assertions import assert_cost_under, assert_no_loops
|
|
10
|
+
|
|
11
|
+
with Recorder() as rec:
|
|
12
|
+
agent.run("Summarize the report")
|
|
13
|
+
|
|
14
|
+
assert_cost_under(rec.trace, max_usd=0.50)
|
|
15
|
+
assert_no_loops(rec.trace)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from trace_ops._types import EventType, Trace
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BudgetExceededError(AssertionError):
    """Raised when an agent trace exceeds a defined budget.

    Covers cost, token, and LLM-call-count limits. Derives from
    ``AssertionError`` so it reads as a failed assertion in test suites.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AgentLoopError(AssertionError):
    """Raised when an agent trace exhibits loop-like behaviour.

    Derives from ``AssertionError`` so it reads as a failed assertion
    in test suites.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── Public assertion functions ──────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def assert_cost_under(trace: Trace, *, max_usd: float) -> None:
    """Verify the trace's total spend stays within *max_usd*.

    Args:
        trace: The recorded agent trace.
        max_usd: Maximum allowed cost in US dollars.

    Raises:
        BudgetExceededError: If the trace's total cost exceeds *max_usd*.
    """
    # Guard clause: within budget means nothing to do.
    if trace.total_cost_usd <= max_usd:
        return
    raise BudgetExceededError(
        f"Trace cost ${trace.total_cost_usd:.4f} exceeds budget of ${max_usd:.4f}.\n"
        f"The agent made {trace.total_llm_calls} LLM calls "
        f"using {trace.total_tokens:,} tokens.\n"
        f"Optimise prompts or reduce tool-call loops to lower cost."
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def assert_tokens_under(trace: Trace, *, max_tokens: int) -> None:
    """Verify combined input + output token usage stays within *max_tokens*.

    Args:
        trace: The recorded agent trace.
        max_tokens: Maximum allowed token count (input + output).

    Raises:
        BudgetExceededError: If the trace's token total exceeds *max_tokens*.
    """
    # Guard clause: within the limit means nothing to do.
    if trace.total_tokens <= max_tokens:
        return
    raise BudgetExceededError(
        f"Trace used {trace.total_tokens:,} tokens, "
        f"exceeding limit of {max_tokens:,}.\n"
        f"The agent made {trace.total_llm_calls} LLM calls.\n"
        f"Reduce prompt size or limit tool-call depth."
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def assert_max_llm_calls(trace: Trace, *, max_calls: int) -> None:
    """Verify the agent's number of LLM round-trips stays within *max_calls*.

    Args:
        trace: The recorded agent trace.
        max_calls: Maximum allowed LLM calls.

    Raises:
        BudgetExceededError: If the trace's LLM-call count exceeds *max_calls*.
    """
    # Guard clause: within the limit means nothing to do.
    if trace.total_llm_calls <= max_calls:
        return
    raise BudgetExceededError(
        f"Trace made {trace.total_llm_calls} LLM calls, "
        f"exceeding limit of {max_calls}.\n"
        f"Trajectory: {' → '.join(trace.trajectory)}\n"
        f"The agent may be stuck in a loop or using an inefficient strategy."
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def assert_no_loops(
    trace: Trace,
    *,
    max_consecutive_same_tool: int = 3,
) -> None:
    """Assert that the trace doesn't contain tool-call loops.

    Walks the ``TOOL_CALL`` events in order, tracking the length of the
    current same-tool streak. A streak longer than
    *max_consecutive_same_tool* typically indicates the agent is stuck
    retrying the same action.

    Args:
        trace: The recorded agent trace.
        max_consecutive_same_tool: Maximum allowed consecutive calls
            to the same tool before raising.

    Raises:
        AgentLoopError: If a run of same-tool calls exceeds the limit.
    """
    tool_events = [
        e for e in trace.events if e.event_type == EventType.TOOL_CALL
    ]
    if not tool_events:
        return

    # Walk adjacent pairs; streak counts the current run of identical tool names.
    streak = 1
    for prev, curr in zip(tool_events, tool_events[1:]):
        if curr.tool_name != prev.tool_name:
            streak = 1
            continue
        streak += 1
        if streak > max_consecutive_same_tool:
            raise AgentLoopError(
                f"Detected {streak} consecutive calls to tool "
                f"'{curr.tool_name}' "
                f"(limit: {max_consecutive_same_tool}).\n"
                f"The agent may be stuck in an infinite loop.\n"
                f"Check the agent's exit conditions or add loop guards."
            )
|