agentcassette 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ """
2
+ agentcassette — Deterministic agent test recorder and replayer.
3
+
4
+ Zero dependencies. Pure Python stdlib.
5
+
6
+ Testing agents is painful: live LLM calls are expensive, non-deterministic, and
7
+ slow. agentcassette records a real run once, then replays it forever from a JSON
8
+ "cassette" — no network, no cost, fully deterministic.
9
+
10
+ Quick start
11
+ -----------
12
+ Wrap the callable(s) you want captured once (your model-call function, and
13
+ optionally your tools), then drive recording and replay with context managers::
14
+
15
+ import agentcassette
16
+ from agentcassette import record, replay
17
+
18
+ call_model = agentcassette.intercept(call_model, kind="llm")
19
+
20
+ # Record a real run:
21
+ with record("cassettes/flight_search.json"):
22
+ my_agent.run("Find flights to NYC under $300")
23
+
24
+ # Replay it in a test — no API calls happen:
25
+ def test_flight_search():
26
+ with replay("cassettes/flight_search.json"):
27
+ result = my_agent.run("Find flights to NYC under $300")
28
+ assert result.success
29
+
30
+ Catch regressions with strict replay::
31
+
32
+ from agentcassette import replay, DivergenceError
33
+
34
+ with replay("cassettes/flight_search.json", strict=True):
35
+ my_agent.run("Find flights to NYC under $300") # DivergenceError on drift
36
+
37
+ Inspect and diff cassettes::
38
+
39
+ from agentcassette import Cassette, diff_cassettes
40
+
41
+ c = Cassette.load("cassettes/flight_search.json")
42
+ c.num_steps, c.total_input_tokens, c.total_output_tokens
43
+ c.redact("api_key") # scrub secrets before committing
44
+
45
+ delta = diff_cassettes("cassettes/v1.json", "cassettes/v2.json")
46
+ delta.new_calls, delta.dropped_calls, delta.token_delta
47
+
48
+ See the project README for the full cassette format and API reference.
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ from importlib.metadata import PackageNotFoundError, version as _version
54
+
55
+ from ._cassette import Cassette
56
+ from ._diff import CassetteDiff, diff_cassettes
57
+ from ._errors import (
58
+ AgentCassetteError,
59
+ CassetteNotFound,
60
+ DivergenceError,
61
+ ReplayExhausted,
62
+ )
63
+ from ._session import Player, Recorder, intercept, record, replay
64
+
65
+ __all__ = [
66
+ "record",
67
+ "replay",
68
+ "intercept",
69
+ "Cassette",
70
+ "Recorder",
71
+ "Player",
72
+ "diff_cassettes",
73
+ "CassetteDiff",
74
+ "AgentCassetteError",
75
+ "CassetteNotFound",
76
+ "ReplayExhausted",
77
+ "DivergenceError",
78
+ ]
79
+
80
+ try:
81
+ __version__ = _version("agentcassette")
82
+ except PackageNotFoundError: # running from a source tree without install metadata
83
+ __version__ = "0.0.0"
@@ -0,0 +1,169 @@
1
+ """The Cassette: agentcassette's on-disk recording format.
2
+
3
+ A cassette is a plain, human-readable JSON file — readable in a diff, safe to
4
+ commit to git, and portable across machines. Its shape:
5
+
6
+ {
7
+ "version": 1,
8
+ "recorded_at": "2026-06-30T12:00:00Z",
9
+ "model": "claude-sonnet-4-6", # optional label
10
+ "duration_ms": 1832.4, # wall time of the whole recorded run
11
+ "steps": [
12
+ {
13
+ "index": 0,
14
+ "type": "llm", # "llm" | "tool" | "call"
15
+ "name": "call_model",
16
+ "arguments": {"args": [...], "kwargs": {...}},
17
+ "result": {...},
18
+ "input_tokens": 420,
19
+ "output_tokens": 88,
20
+ "duration_ms": 512.0
21
+ }
22
+ ]
23
+ }
24
+
25
+ Every intercepted call becomes one step, in the exact order it happened.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import json
31
+ import os
32
+ from typing import Any
33
+
34
+ from ._errors import CassetteNotFound
35
+
36
+ CASSETTE_VERSION = 1
37
+ REDACTION_PLACEHOLDER = "****"
38
+
39
+
40
def to_jsonable(obj: Any) -> Any:
    """Coerce an arbitrary value into something ``json.dumps`` accepts.

    JSON-native values pass through unchanged. Dict keys are stringified.
    Lists and tuples become lists in their original order. Sets and
    frozensets become lists sorted by the canonical JSON text of each
    converted item, so the same set always yields the same list regardless
    of hash-iteration order. Other objects are reduced via
    ``model_dump()`` / ``dict()`` / ``to_dict()`` when one of those is
    callable, then via a non-empty ``__dict__`` (tagged with ``__type__``),
    and finally via their ``repr`` wrapped in a ``__repr__`` marker so the
    cassette stays valid JSON.

    Args:
        obj: Any Python value.

    Returns:
        A structure made only of ``None``, bools, numbers, strings, lists
        and string-keyed dicts.
    """
    if obj is None or isinstance(obj, (bool, int, float, str)):
        return obj
    if isinstance(obj, dict):
        return {str(k): to_jsonable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_jsonable(v) for v in obj]
    if isinstance(obj, (set, frozenset)):
        # Set iteration order is not stable across processes (string hashes
        # are randomized), which would make recordings and strict-replay
        # comparisons flaky. Sort on the JSON text: it is defined for every
        # converted item, including dicts and mixed types.
        converted = [to_jsonable(v) for v in obj]
        return sorted(
            converted,
            key=lambda item: json.dumps(item, sort_keys=True, default=str),
        )
    # Common SDK response objects expose their fields via model_dump()/dict().
    for method in ("model_dump", "dict", "to_dict"):
        fn = getattr(obj, method, None)
        if callable(fn):
            try:
                return to_jsonable(fn())
            except Exception:  # pragma: no cover - defensive
                # The first callable exporter failed; fall through to the
                # __dict__/repr fallbacks instead of failing the recording.
                break
    data = getattr(obj, "__dict__", None)
    if isinstance(data, dict) and data:
        return {"__type__": type(obj).__name__, **{str(k): to_jsonable(v) for k, v in data.items()}}
    return {"__repr__": repr(obj)}
65
+
66
+
67
+ def _redact_in_place(obj: Any, key: str, replacement: str) -> None:
68
+ if isinstance(obj, dict):
69
+ for k in list(obj.keys()):
70
+ if k == key:
71
+ obj[k] = replacement
72
+ else:
73
+ _redact_in_place(obj[k], key, replacement)
74
+ elif isinstance(obj, list):
75
+ for item in obj:
76
+ _redact_in_place(item, key, replacement)
77
+
78
+
79
class Cassette:
    """An ordered recording of intercepted calls.

    Usually produced by :func:`agentcassette.record` and consumed by
    :func:`agentcassette.replay`, but can also be loaded directly for inspection.

    Attributes:
        version: Cassette format version number.
        recorded_at: Timestamp string of the recording, or ``None``.
        model: Optional model label, or ``None``.
        duration_ms: Wall time of the whole recorded run, in milliseconds.
        steps: The recorded step dicts, in call order.
    """

    def __init__(
        self,
        steps: list[dict] | None = None,
        *,
        model: str | None = None,
        recorded_at: str | None = None,
        duration_ms: float = 0.0,
        version: int = CASSETTE_VERSION,
    ) -> None:
        self.version = version
        self.recorded_at = recorded_at
        self.model = model
        self.duration_ms = duration_ms
        # The caller's list is stored as-is (not copied), so later mutations
        # such as redact() are visible through both references.
        self.steps: list[dict] = steps if steps is not None else []

    # ---- persistence ----------------------------------------------------
    @classmethod
    def from_dict(cls, data: dict) -> "Cassette":
        """Build a cassette from its parsed-JSON dict form.

        Missing fields fall back to the constructor defaults; a missing or
        null ``steps`` entry yields an empty cassette.
        """
        return cls(
            steps=list(data.get("steps") or []),
            model=data.get("model"),
            recorded_at=data.get("recorded_at"),
            duration_ms=data.get("duration_ms", 0.0),
            version=data.get("version", CASSETTE_VERSION),
        )

    def to_dict(self) -> dict:
        """Return the JSON-serializable dict form written by :meth:`save`."""
        return {
            "version": self.version,
            "recorded_at": self.recorded_at,
            "model": self.model,
            "duration_ms": self.duration_ms,
            "steps": self.steps,
        }

    @classmethod
    def load(cls, path: str | os.PathLike) -> "Cassette":
        """Load a cassette from disk. Raises :class:`CassetteNotFound` if absent."""
        # Open first and translate the failure, rather than checking
        # existence beforehand: the file could vanish between the check and
        # the open. NotADirectoryError covers a path whose parent component
        # is a regular file, which the previous existence check also
        # reported as "not found".
        try:
            fh = open(path, "r", encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError):
            raise CassetteNotFound(f"No cassette at {os.fspath(path)!r}") from None
        with fh:
            return cls.from_dict(json.load(fh))

    def save(self, path: str | os.PathLike) -> None:
        """Write the cassette to disk as pretty-printed JSON, creating dirs."""
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as fh:
            json.dump(self.to_dict(), fh, indent=2, ensure_ascii=False)
            fh.write("\n")

    # ---- inspection -----------------------------------------------------
    def _sum_step_field(self, field: str) -> int:
        """Sum an integer-valued step field, counting missing/null as 0."""
        # ``or 0`` keeps hand-edited cassettes with ``"input_tokens": null``
        # from raising TypeError in int().
        return sum(int(step.get(field) or 0) for step in self.steps)

    @property
    def num_steps(self) -> int:
        """Number of recorded steps."""
        return len(self.steps)

    @property
    def total_input_tokens(self) -> int:
        """Sum of ``input_tokens`` across all steps."""
        return self._sum_step_field("input_tokens")

    @property
    def total_output_tokens(self) -> int:
        """Sum of ``output_tokens`` across all steps."""
        return self._sum_step_field("output_tokens")

    @property
    def total_tokens(self) -> int:
        """Input plus output tokens across all steps."""
        return self.total_input_tokens + self.total_output_tokens

    def redact(self, key: str, replacement: str = REDACTION_PLACEHOLDER) -> "Cassette":
        """Replace every value stored under ``key`` (at any depth). Returns self."""
        for step in self.steps:
            _redact_in_place(step, key, replacement)
        return self

    def __len__(self) -> int:
        return len(self.steps)

    def __repr__(self) -> str:
        return (
            f"Cassette(steps={self.num_steps}, "
            f"input_tokens={self.total_input_tokens}, "
            f"output_tokens={self.total_output_tokens})"
        )
agentcassette/_diff.py ADDED
@@ -0,0 +1,70 @@
1
+ """Diff two cassettes to spot behavioral drift between agent versions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from collections import Counter
7
+ from typing import Union
8
+
9
+ from ._cassette import Cassette
10
+
11
+
12
class CassetteDiff:
    """The delta between two cassettes (``a`` = baseline, ``b`` = new).

    Attributes:
        new_calls: Call names that appear more often in ``b`` than ``a``
            (one entry per extra occurrence, sorted).
        dropped_calls: Call names that appear more often in ``a`` than ``b``
            (one entry per missing occurrence, sorted).
        changed_calls: ``{"index", "a", "b"}`` entries for steps at the same
            index whose name, arguments or result differ.
        token_delta: ``b`` total tokens minus ``a`` total tokens.
        input_token_delta / output_token_delta: the same, split by direction.
        step_delta: ``b`` step count minus ``a`` step count.
    """

    def __init__(self, a: Cassette, b: Cassette) -> None:
        baseline_counts = Counter(step.get("name") for step in a.steps)
        candidate_counts = Counter(step.get("name") for step in b.steps)

        # Counter subtraction keeps only positive surpluses, so each side
        # lists just the names it has more of.
        self.new_calls: list[str] = sorted((candidate_counts - baseline_counts).elements())
        self.dropped_calls: list[str] = sorted((baseline_counts - candidate_counts).elements())

        # zip() stops at the shorter cassette; length differences are
        # reported through step_delta instead.
        compared_fields = ("name", "arguments", "result")
        self.changed_calls: list[dict] = [
            {"index": position, "a": before, "b": after}
            for position, (before, after) in enumerate(zip(a.steps, b.steps))
            if any(before.get(field) != after.get(field) for field in compared_fields)
        ]

        self.input_token_delta = b.total_input_tokens - a.total_input_tokens
        self.output_token_delta = b.total_output_tokens - a.total_output_tokens
        self.token_delta = b.total_tokens - a.total_tokens
        self.step_delta = b.num_steps - a.num_steps

    @property
    def identical(self) -> bool:
        """True when the two cassettes have the same calls, args, and results."""
        if self.new_calls or self.dropped_calls or self.changed_calls:
            return False
        return self.step_delta == 0

    def __repr__(self) -> str:
        changed_count = len(self.changed_calls)
        return (
            f"CassetteDiff(new={self.new_calls}, dropped={self.dropped_calls}, "
            f"changed={changed_count}, token_delta={self.token_delta})"
        )
61
+
62
+
63
def diff_cassettes(
    a: Union[str, os.PathLike, Cassette],
    b: Union[str, os.PathLike, Cassette],
) -> CassetteDiff:
    """Compare two cassettes given as paths or already-loaded :class:`Cassette`s.

    ``a`` is the baseline and ``b`` the new recording. Anything that is not
    already a :class:`Cassette` is treated as a path and loaded from disk.
    """
    # Resolve in argument order so a missing baseline is reported first.
    baseline, candidate = (
        side if isinstance(side, Cassette) else Cassette.load(side)
        for side in (a, b)
    )
    return CassetteDiff(baseline, candidate)
@@ -0,0 +1,35 @@
1
+ """Exception types raised by agentcassette."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class AgentCassetteError(Exception):
    """Common base for every exception agentcassette raises.

    Catch this to handle any agentcassette failure in one place.
    """
8
+
9
+
10
class CassetteNotFound(AgentCassetteError):
    """Raised when a cassette file to be loaded does not exist on disk."""
12
+
13
+
14
class ReplayExhausted(AgentCassetteError):
    """Raised on replay when an intercepted call arrives after the last recorded step."""
16
+
17
+
18
class DivergenceError(AgentCassetteError):
    """Raised during a strict replay when a call does not match the recording.

    Attributes:
        step_index: Position of the diverging call in the cassette.
        expected: The recorded ``{"name", "arguments"}`` at that position.
        actual: The ``{"name", "arguments"}`` the agent produced instead.
    """

    def __init__(self, step_index: int, expected: dict, actual: dict) -> None:
        # Build the human-readable message first, then keep the structured
        # pieces on the instance for programmatic inspection.
        message = (
            f"Replay diverged at step {step_index}: "
            f"expected {expected.get('name')!r} with {expected.get('arguments')!r}, "
            f"got {actual.get('name')!r} with {actual.get('arguments')!r}"
        )
        super().__init__(message)
        self.step_index = step_index
        self.expected = expected
        self.actual = actual
@@ -0,0 +1,249 @@
1
+ """Recording and replaying of intercepted calls.
2
+
3
+ The design is a deliberate, honest seam rather than monkey-patching: you wrap
4
+ the callables you want captured once with :func:`intercept`. A thread-local
5
+ "active session" then decides what happens each time a wrapped callable runs:
6
+
7
+ * No active session → the real function runs (production is unaffected).
8
+ * Inside ``record()`` → the real function runs and the call is taped.
9
+ * Inside ``replay()`` → the recorded result is returned; the real function is
10
+ never called (no network, no cost, fully deterministic).
11
+
12
+ This keeps agentcassette provider-agnostic and truly zero-dependency: it works
13
+ with OpenAI, Anthropic, a raw ``requests`` call, or a local model equally.
14
+
15
+ Both synchronous and ``async def`` callables are supported — :func:`intercept`
16
+ detects coroutine functions and returns an awaitable wrapper for them.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import functools
22
+ import inspect
23
+ import threading
24
+ import time
25
+ from contextlib import contextmanager
26
+ from datetime import datetime, timezone
27
+ from typing import Any, Callable, Iterator
28
+
29
+ from ._cassette import Cassette, to_jsonable
30
+ from ._errors import DivergenceError, ReplayExhausted
31
+ from ._tokens import count_tokens
32
+
33
+ _local = threading.local()
34
+
35
+
36
def _current_session() -> "Recorder | Player | None":
    """Return the session stored on this thread's ``_local``, or ``None`` if unset."""
    try:
        return _local.session
    except AttributeError:
        # Nothing has been stored on this thread yet.
        return None
38
+
39
+
40
def _set_session(session: "Recorder | Player | None") -> None:
    """Store ``session`` on this thread's ``_local``; pass ``None`` to clear it."""
    setattr(_local, "session", session)
42
+
43
+
44
+ def _now_iso() -> str:
45
+ return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
46
+
47
+
48
def _normalize_args(args: tuple, kwargs: dict) -> dict:
    """Return the ``{"args": [...], "kwargs": {...}}`` form stored in a step.

    Both parts go through ``to_jsonable`` so recorded and replayed calls are
    compared in the same shape.
    """
    positional = to_jsonable(list(args))
    keyword = to_jsonable(dict(kwargs))
    return {"args": positional, "kwargs": keyword}
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # The interception seam
54
+ # ---------------------------------------------------------------------------
55
def intercept(
    fn: Callable | None = None,
    *,
    name: str | None = None,
    kind: str = "call",
) -> Callable:
    """Mark a callable as recordable/replayable.

    Usable as ``intercept(fn)``, ``intercept(fn, name=..., kind="llm")``, or as a
    decorator ``@intercept`` / ``@intercept(kind="tool")``. Works on both regular
    functions and ``async def`` coroutine functions.

    Outside of a ``record``/``replay`` block the wrapped callable behaves exactly
    like the original, so it is safe to leave in production code.

    Args:
        fn: The callable to wrap, or ``None`` when used with arguments as a
            decorator factory.
        name: Step name to record; defaults to the callable's ``__name__``,
            or ``"call"`` when it has none.
        kind: Step type label stored with each recorded call.

    Returns:
        The wrapped callable when ``fn`` is given, otherwise a decorator.
    """

    def decorator(func: Callable) -> Callable:
        call_name = name or getattr(func, "__name__", "call")

        if not inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                active = _current_session()
                if active is not None:
                    return active.handle(call_name, kind, func, args, kwargs)
                # No session on this thread: behave exactly like the original.
                return func(*args, **kwargs)

            wrapper.__agentcassette_intercepted__ = True  # type: ignore[attr-defined]
            return wrapper

        @functools.wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
            active = _current_session()
            if active is not None:
                return await active.handle_async(call_name, kind, func, args, kwargs)
            # No session on this thread: await the original coroutine function.
            return await func(*args, **kwargs)

        async_wrapper.__agentcassette_intercepted__ = True  # type: ignore[attr-defined]
        return async_wrapper

    # Bare ``intercept(fn)`` wraps immediately; ``intercept(kind=...)`` returns
    # the decorator to be applied later.
    return decorator if fn is None else decorator(fn)
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Recorder
103
+ # ---------------------------------------------------------------------------
104
class Recorder:
    """Captures each intercepted call into an ordered list of steps.

    Attributes:
        model: Optional model label supplied by the caller.
        steps: Step dicts appended in call order.
        cassette: The saved :class:`Cassette`; ``None`` until ``record()``
            finishes cleanly.
    """

    def __init__(self, *, model: str | None = None) -> None:
        self.model = model
        self.steps: list[dict] = []
        self.cassette: Cassette | None = None  # populated after the block exits

    def _append(
        self, name: str, kind: str, args: tuple, kwargs: dict, result: Any, duration_ms: float
    ) -> Any:
        """Tape one completed call and hand back the untouched ``result``."""
        arguments = _normalize_args(args, kwargs)
        jresult = to_jsonable(result)
        input_tokens, output_tokens = count_tokens(arguments, jresult)
        step = {
            "index": len(self.steps),  # position this step is about to occupy
            "type": kind,
            "name": name,
            "arguments": arguments,
            "result": jresult,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "duration_ms": round(duration_ms, 3),
        }
        self.steps.append(step)
        # The live run still gets the real object, not its JSON form.
        return result

    def handle(self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict) -> Any:
        """Run ``func`` for real, time it, and record the call."""
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return self._append(name, kind, args, kwargs, result, elapsed_ms)

    async def handle_async(
        self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Await ``func`` for real, time it, and record the call."""
        started = time.perf_counter()
        result = await func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return self._append(name, kind, args, kwargs, result, elapsed_ms)
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Player
147
+ # ---------------------------------------------------------------------------
148
class Player:
    """Serves recorded results back in order, without calling the real function.

    Attributes:
        strict: When true, a mismatching call raises :class:`DivergenceError`
            instead of being collected.
        divergences: ``{"index", "expected", "actual"}`` entries gathered in
            non-strict mode, in the order they occurred.
    """

    def __init__(self, steps: list[dict], *, strict: bool = False) -> None:
        self._steps = steps
        self.strict = strict
        self._cursor = 0
        self.divergences: list[dict] = []

    @property
    def cursor(self) -> int:
        """Index of the next step to be served."""
        return self._cursor

    @property
    def remaining(self) -> int:
        """Number of recorded steps not yet served."""
        return len(self._steps) - self._cursor

    def _serve(self, name: str, kind: str, args: tuple, kwargs: dict) -> Any:
        """Return the next recorded result, checking it against the live call.

        Raises:
            ReplayExhausted: if every recorded step has already been served.
            DivergenceError: in strict mode, if the call's name or normalized
                arguments differ from the recorded step.
        """
        if self._cursor >= len(self._steps):
            raise ReplayExhausted(
                f"cassette recorded {len(self._steps)} calls but the agent asked for more "
                f"(next was {name!r})"
            )
        position = self._cursor
        step = self._steps[position]
        # Advance before comparing so a divergence still consumes the step.
        self._cursor = position + 1

        actual = {"name": name, "arguments": _normalize_args(args, kwargs)}
        expected = {"name": step.get("name"), "arguments": step.get("arguments")}
        if expected != actual:
            # Use the same index in both modes: the recorded one when the
            # step carries it, otherwise the step's position. Previously the
            # non-strict list stored None for steps without an "index".
            index = step.get("index", position)
            if self.strict:
                raise DivergenceError(index, expected, actual)
            self.divergences.append(
                {"index": index, "expected": expected, "actual": actual}
            )
        return step.get("result")

    def handle(self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict) -> Any:
        """Serve the recorded result; ``func`` is never called."""
        return self._serve(name, kind, args, kwargs)

    async def handle_async(
        self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Async counterpart of :meth:`handle`; ``func`` is never awaited."""
        return self._serve(name, kind, args, kwargs)
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # Context managers
195
+ # ---------------------------------------------------------------------------
196
@contextmanager
def record(
    path,
    *,
    model: str | None = None,
    redact: "list[str] | None" = None,
) -> Iterator[Recorder]:
    """Record every intercepted call made inside the block to a cassette file.

    The cassette is written on clean exit only; if the block raises, nothing is
    saved. Pass ``redact=["api_key", ...]`` to scrub those keys before writing.

    Raises:
        RuntimeError: if a record/replay session is already active on this
            thread.
    """
    if _current_session() is not None:
        raise RuntimeError("agentcassette: a record/replay session is already active")
    recorder = Recorder(model=model)
    _set_session(recorder)
    started = time.perf_counter()
    try:
        yield recorder
    finally:
        # Always detach the session. If the block raised, the exception
        # propagates from here and the save below is skipped.
        _set_session(None)

    elapsed_ms = (time.perf_counter() - started) * 1000
    tape = Cassette(
        steps=recorder.steps,
        model=model,
        recorded_at=_now_iso(),
        duration_ms=round(elapsed_ms, 3),
    )
    # Scrub before saving so secrets never reach disk.
    for secret_key in redact or []:
        tape.redact(secret_key)
    tape.save(path)
    recorder.cassette = tape
231
+
232
+
233
@contextmanager
def replay(path, *, strict: bool = False) -> Iterator[Player]:
    """Replay a cassette: intercepted calls return recorded results, no real work.

    With ``strict=True`` any call whose name or arguments differ from the
    recording raises :class:`DivergenceError`. With ``strict=False`` (default)
    divergences are collected on the yielded player's ``divergences`` list.

    Raises:
        RuntimeError: if a record/replay session is already active on this
            thread.
    """
    if _current_session() is not None:
        raise RuntimeError("agentcassette: a record/replay session is already active")
    # Load before installing the session so a failed load leaves no session behind.
    recorded_steps = Cassette.load(path).steps
    player = Player(recorded_steps, strict=strict)
    _set_session(player)
    try:
        yield player
    finally:
        _set_session(None)
@@ -0,0 +1,86 @@
1
+ """Token accounting for recorded steps.
2
+
3
+ agentcassette has zero dependencies, so it cannot call a real tokenizer
4
+ (``tiktoken`` and friends are third-party). Instead it:
5
+
6
+ 1. Prefers exact counts when the recorded response carries a usage block
7
+ (OpenAI ``usage.prompt_tokens`` / Anthropic ``usage.input_tokens`` etc.).
8
+ 2. Falls back to a character-length heuristic (~4 chars per token), which is
9
+ accurate enough for budgeting and regression comparisons.
10
+
11
+ The heuristic is intentionally simple and deterministic so cassettes recorded
12
+ on one machine reproduce identical counts on another.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ from typing import Any
19
+
20
+ # Common usage-block key spellings across providers, mapped to (input, output).
21
+ _INPUT_KEYS = ("input_tokens", "prompt_tokens")
22
+ _OUTPUT_KEYS = ("output_tokens", "completion_tokens")
23
+
24
+ _CHARS_PER_TOKEN = 4
25
+
26
+
27
def estimate_tokens(obj: Any) -> int:
    """Estimate token count for an arbitrary JSON-able object via char heuristic.

    Strings are measured directly; anything else is measured by its JSON
    text (sorted keys), falling back to ``str(obj)`` if serialization fails.
    ``None`` and empty text count as zero tokens.
    """
    if obj is None:
        return 0
    if isinstance(obj, str):
        text = obj
    else:
        try:
            text = json.dumps(obj, sort_keys=True, default=str)
        except (TypeError, ValueError):
            text = str(obj)
    if not text:
        return 0
    # Ceiling division: a partial trailing chunk still counts as a token, so
    # any non-empty payload is at least one token.
    whole, leftover = divmod(len(text), _CHARS_PER_TOKEN)
    return max(1, whole + (1 if leftover else 0))
42
+
43
+
44
+ def _find_usage(obj: Any) -> dict | None:
45
+ """Return the first dict named 'usage' found anywhere in a nested structure."""
46
+ if isinstance(obj, dict):
47
+ usage = obj.get("usage")
48
+ if isinstance(usage, dict):
49
+ return usage
50
+ for value in obj.values():
51
+ found = _find_usage(value)
52
+ if found is not None:
53
+ return found
54
+ elif isinstance(obj, (list, tuple)):
55
+ for item in obj:
56
+ found = _find_usage(item)
57
+ if found is not None:
58
+ return found
59
+ return None
60
+
61
+
62
+ def _pick(usage: dict, keys: tuple) -> int | None:
63
+ for key in keys:
64
+ value = usage.get(key)
65
+ if isinstance(value, (int, float)):
66
+ return int(value)
67
+ return None
68
+
69
+
70
def count_tokens(request: Any, response: Any) -> tuple[int, int]:
    """Return ``(input_tokens, output_tokens)`` for a recorded call.

    Uses an exact usage block from the response when present, otherwise falls
    back to the character heuristic over the request (input) and response
    (output) payloads. Each direction falls back independently, so a usage
    block carrying only one of the two counts still contributes it.
    """
    exact_input = exact_output = None
    usage = _find_usage(response)
    if usage is not None:
        exact_input = _pick(usage, _INPUT_KEYS)
        exact_output = _pick(usage, _OUTPUT_KEYS)

    input_tokens = estimate_tokens(request) if exact_input is None else exact_input
    output_tokens = estimate_tokens(response) if exact_output is None else exact_output
    return input_tokens, output_tokens
agentcassette/py.typed ADDED
File without changes
@@ -0,0 +1,290 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentcassette
3
+ Version: 0.1.0
4
+ Summary: Deterministic agent test recorder and replayer. Record live runs, replay as mocks. Zero dependencies.
5
+ Project-URL: Homepage, https://github.com/aenealabs/agentcassette
6
+ Project-URL: Repository, https://github.com/aenealabs/agentcassette
7
+ Project-URL: Issues, https://github.com/aenealabs/agentcassette/issues
8
+ Project-URL: Changelog, https://github.com/aenealabs/agentcassette/blob/main/CHANGELOG.md
9
+ Author-email: LaVon Rutledge <32437530+lavrut@users.noreply.github.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai-agents,cassette,deterministic,llm,mocking,record,replay,testing,vcr
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Classifier: Topic :: Software Development :: Testing
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.9
28
+ Provides-Extra: dev
29
+ Requires-Dist: hatch; extra == 'dev'
30
+ Requires-Dist: pytest; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # agentcassette
34
+
35
+ [![PyPI](https://img.shields.io/pypi/v/agentcassette?color=blue)](https://pypi.org/project/agentcassette/)
36
+ [![Python](https://img.shields.io/pypi/pyversions/agentcassette)](https://pypi.org/project/agentcassette/)
37
+ [![CI](https://img.shields.io/github/actions/workflow/status/aenealabs/agentcassette/ci.yml?label=CI)](https://github.com/aenealabs/agentcassette/actions/workflows/ci.yml)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
39
+ [![Zero dependencies](https://img.shields.io/badge/dependencies-none-brightgreen)](pyproject.toml)
40
+
41
+ **Deterministic agent test recorder and replayer.**
42
+
43
+ Record a real agent run once, replay it forever as a mock — no network, no cost, fully deterministic. Like VCR/pytest-recording, but purpose-built for LLM agents and with zero dependencies.
44
+
45
+ ```python
46
+ import agentcassette
47
+ from agentcassette import record, replay
48
+
49
+ call_model = agentcassette.intercept(call_model, kind="llm")
50
+
51
+ # Record a real run once:
52
+ with record("cassettes/flight_search.json"):
53
+ my_agent.run("Find flights to NYC under $300")
54
+
55
+ # Replay it in tests — no API calls, no tokens spent, same result every time:
56
+ def test_flight_search():
57
+ with replay("cassettes/flight_search.json"):
58
+ result = my_agent.run("Find flights to NYC under $300")
59
+ assert result.success
60
+ ```
61
+
62
+ ## Why agentcassette?
63
+
64
+ Testing agents is painful. Live LLM calls are **expensive** (every test run costs money), **non-deterministic** (a different answer each time), and **slow** (seconds per call). So most teams either skip agent testing or maintain a costly, flaky integration suite.
65
+
66
+ agentcassette records the real calls an agent makes into a plain-JSON **cassette**, then replays them on demand. Your tests become fast, free, and deterministic — and you can assert on exactly what the agent did.
67
+
68
+ Unlike VCR-style tools that monkey-patch the HTTP layer, agentcassette uses an explicit, honest seam: you wrap the callables you want captured. That keeps it **provider-agnostic** (OpenAI, Anthropic, Gemini, a raw `requests` call, or a local model all work identically) and **truly zero-dependency**.
69
+
70
+ ## Installation
71
+
72
+ ```bash
73
+ pip install agentcassette
74
+ ```
75
+
76
+ Requires Python 3.9+. No other dependencies, ever.
77
+
78
+ ## Quick Start
79
+
80
+ ### 1. Wrap what you want captured
81
+
82
+ Wrap your model-call function once (and any tools you want taped). Outside a record/replay block, wrapped callables behave exactly like the original — safe to leave in production code.
83
+
84
+ ```python
85
+ import agentcassette
86
+
87
+ # As a wrapper:
88
+ call_model = agentcassette.intercept(call_model, kind="llm")
89
+
90
+ # Or as a decorator:
91
+ @agentcassette.intercept(kind="tool")
92
+ def search_web(query: str) -> list[str]:
93
+ ...
94
+ ```
95
+
96
+ ### 2. Record a real run
97
+
98
+ ```python
99
+ from agentcassette import record
100
+
101
+ with record("cassettes/flight_search.json", model="claude-sonnet-4-6"):
102
+ my_agent.run("Find flights to NYC under $300")
103
+ # Cassette is written on clean exit.
104
+ ```
105
+
106
+ ### 3. Replay it in your tests
107
+
108
+ ```python
109
+ from agentcassette import replay
110
+
111
+ def test_flight_search():
112
+ with replay("cassettes/flight_search.json"):
113
+ result = my_agent.run("Find flights to NYC under $300")
114
+ assert result.success
115
+ ```
116
+
117
+ During replay, every intercepted call returns its recorded result and the real function is **never called**.
118
+
119
+ ## Async agents
120
+
121
+ `intercept` detects `async def` callables and returns an awaitable wrapper, so async agents work the same way — including a mix of async and sync tools in one run:
122
+
123
+ ```python
124
+ import agentcassette
125
+ from agentcassette import record, replay
126
+
127
+ acall_model = agentcassette.intercept(acall_model, kind="llm") # an async def
128
+
129
+ async def agent(task):
130
+ plan = await acall_model(f"plan: {task}")
131
+ ...
132
+
133
+ with record("cassettes/run.json"):
134
+ asyncio.run(agent("book a trip"))
135
+
136
+ with replay("cassettes/run.json"):
137
+ asyncio.run(agent("book a trip")) # awaited calls served from the cassette
138
+ ```
139
+
140
+ ## Catching regressions with strict replay
141
+
142
+ By default, replay serves recorded results best-effort and collects any divergences. With `strict=True`, a call whose name or arguments differ from the recording raises `DivergenceError` — turning your cassette into a behavioral contract.
143
+
144
+ ```python
145
+ from agentcassette import replay, DivergenceError
146
+
147
+ with replay("cassettes/flight_search.json", strict=True):
148
+ my_agent.run("Find flights to NYC under $300") # raises on drift
149
+ ```
150
+
151
+ Best-effort mode exposes what changed without failing:
152
+
153
+ ```python
154
+ with replay("cassettes/flight_search.json") as player:
155
+ my_agent.run("Find flights to NYC under $300")
156
+
157
+ for d in player.divergences:
158
+ print(d["index"], d["expected"], "->", d["actual"])
159
+ ```
160
+
161
+ ## Inspecting cassettes
162
+
163
+ ```python
164
+ from agentcassette import Cassette
165
+
166
+ c = Cassette.load("cassettes/flight_search.json")
167
+ c.num_steps # number of intercepted calls
168
+ c.total_input_tokens # summed across steps
169
+ c.total_output_tokens
170
+ c.total_tokens
171
+ c.duration_ms # wall time of the original run
172
+
173
+ c.redact("api_key") # scrub secrets before committing to git
174
+ c.save("cassettes/flight_search.json")
175
+ ```
176
+
177
+ Token counts use exact usage blocks when the recorded response carries one (OpenAI `usage.prompt_tokens`, Anthropic `usage.input_tokens`, …), falling back to a deterministic ~4-chars-per-token heuristic otherwise.
178
+
179
+ ## Redacting secrets
180
+
181
+ Scrub sensitive keys either when recording or after loading:
182
+
183
+ ```python
184
+ # At record time:
185
+ with record("cassettes/run.json", redact=["api_key", "authorization"]):
186
+     my_agent.run(task)
187
+
188
+ # Or later:
189
+ Cassette.load("cassettes/run.json").redact("api_key").save("cassettes/run.json")
190
+ ```
191
+
192
+ ## Diffing runs
193
+
194
+ Compare two cassettes to see how an agent's behavior drifted between versions:
195
+
196
+ ```python
197
+ from agentcassette import diff_cassettes
198
+
199
+ delta = diff_cassettes("cassettes/v1.json", "cassettes/v2.json")
200
+ delta.new_calls # call names in v2 but not v1
201
+ delta.dropped_calls # call names in v1 but not v2
202
+ delta.changed_calls # same-position steps whose args/results changed
203
+ delta.token_delta # total token change (v2 - v1)
204
+ delta.identical # True if nothing changed
205
+ ```
206
+
207
+ ## Cassette format
208
+
209
+ Cassettes are plain, human-readable JSON — diffable and safe to commit:
210
+
211
+ ```json
212
+ {
213
+ "version": 1,
214
+ "recorded_at": "2026-06-30T12:00:00Z",
215
+ "model": "claude-sonnet-4-6",
216
+ "duration_ms": 1832.4,
217
+ "steps": [
218
+ {
219
+ "index": 0,
220
+ "type": "llm",
221
+ "name": "call_model",
222
+ "arguments": {"args": ["plan the task"], "kwargs": {}},
223
+ "result": {"text": "...", "usage": {"input_tokens": 420, "output_tokens": 88}},
224
+ "input_tokens": 420,
225
+ "output_tokens": 88,
226
+ "duration_ms": 512.0
227
+ }
228
+ ]
229
+ }
230
+ ```
231
+
232
+ Every intercepted call becomes one step, in the exact order it happened.
233
+
234
+ ## API Reference
235
+
236
+ ### `intercept(fn=None, *, name=None, kind="call")`
237
+
238
+ Marks a callable as recordable/replayable. Usable as `intercept(fn)`, `intercept(fn, kind="llm")`, or as a decorator. Works on both regular functions and `async def` coroutine functions (async callables get an awaitable wrapper). `kind` is a free-form label stored on each step (e.g. `"llm"`, `"tool"`). Outside a session, the wrapped callable is a transparent pass-through.
239
+
240
+ ### `record(path, *, model=None, redact=None)`
241
+
242
+ Context manager. Records every intercepted call made inside the block and writes the cassette to `path` only when the block exits cleanly (nothing is saved if it raises). `redact` is a list of key names to scrub before saving. Yields the `Recorder`.
243
+
244
+ ### `replay(path, *, strict=False)`
245
+
246
+ Context manager. Serves recorded results for intercepted calls without running the real functions. `strict=True` raises `DivergenceError` on any mismatch. Yields the `Player` (with `.divergences`, `.remaining`, `.cursor`).
247
+
248
+ ### `Cassette`
249
+
250
+ | Member | Description |
251
+ |---|---|
252
+ | `Cassette.load(path)` | Load from disk (raises `CassetteNotFound` if `path` does not exist) |
253
+ | `.save(path)` | Write pretty-printed JSON, creating parent dirs |
254
+ | `.num_steps` | Number of recorded steps |
255
+ | `.total_input_tokens` / `.total_output_tokens` / `.total_tokens` | Token totals |
256
+ | `.duration_ms` | Wall time of the recorded run |
257
+ | `.redact(key, replacement="****")` | Scrub every value under `key`, at any depth |
258
+
259
+ ### `diff_cassettes(a, b) -> CassetteDiff`
260
+
261
+ Compare two cassettes (paths or `Cassette` objects). Returns a `CassetteDiff` with `new_calls`, `dropped_calls`, `changed_calls`, `token_delta`, `input_token_delta`, `output_token_delta`, `step_delta`, and `identical`.
262
+
263
+ ### Exceptions
264
+
265
+ All inherit from `AgentCassetteError`:
266
+
267
+ | Exception | Raised when |
268
+ |---|---|
269
+ | `CassetteNotFound` | Replaying a path that doesn't exist |
270
+ | `ReplayExhausted` | The agent makes more calls than the cassette recorded |
271
+ | `DivergenceError` | A strict replay sees a call that differs from the recording |
272
+
273
+ ## Notes & limitations
274
+
275
+ - **Replayed results are JSON.** Recorded values round-trip through JSON, so on replay you get plain dicts/lists/primitives, not the original SDK objects. For typical LLM responses (dicts) this is exactly what you want.
276
+ - **Ordering matters.** Calls replay in the order they were recorded. agentcassette matches sequentially, which is deterministic and mirrors how an agent actually executes. Truly concurrent calls (e.g. `asyncio.gather`) are recorded in completion order; if that order isn't stable across runs, replay matching is best-effort — record such sections sequentially if you need strict determinism.
277
+ - **Sync and async.** Both `def` and `async def` callables are supported. `record`/`replay` sessions are thread-local: a session covers the thread that opened it, including the event loop running on that thread. If your agent fans out across OS threads, open a session in each thread.
278
+ - **Streaming responses** (token iterators) are not specially handled yet — wrap at a boundary where the response is already materialized.
279
+
280
+ ## Contributing
281
+
282
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
283
+
284
+ ## License
285
+
286
+ MIT — see [LICENSE](LICENSE).
287
+
288
+ ---
289
+
290
+ Part of the [aenealabs](https://github.com/aenealabs) AI agent toolkit.
@@ -0,0 +1,11 @@
1
+ agentcassette/__init__.py,sha256=WdBqiRG9OfsoSWGGBzAYVg5ik9SxmhNRRc5ClpzpggU,2513
2
+ agentcassette/_cassette.py,sha256=SEk9cd5f5BR8YNJTkWU512yBIiAAOsd4G0iZBJOr5jU,5779
3
+ agentcassette/_diff.py,sha256=_mcod434ty-O3m-AG1GQEETVGx4pmXq7-ETo9mi3PIA,2707
4
+ agentcassette/_errors.py,sha256=tsI34kUVRvd8GDKc6-kdnXkal5jpUh0bD6fjsopTfD8,1217
5
+ agentcassette/_session.py,sha256=sr9kxFxVRO2SVs2rqiQ34wp_c1DGzTJk8kA55YpU0Kk,9017
6
+ agentcassette/_tokens.py,sha256=x3S2BEaa0YYkE6gouTmpPKEWuJdTAqrLEb0PCWLLceE,2985
7
+ agentcassette/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ agentcassette-0.1.0.dist-info/METADATA,sha256=RCFPSZDRK12pp_8hC2lVxY_BQa-hwRn-M3QHfQXjnAk,11418
9
+ agentcassette-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
10
+ agentcassette-0.1.0.dist-info/licenses/LICENSE,sha256=MYtxdpJgHsIGUqnc1UYrqDff94Vn7SDKXTxLhqOyCi8,1071
11
+ agentcassette-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 LaVon Rutledge
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.